From d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 Mon Sep 17 00:00:00 2001 From: Googler Date: Fri, 27 Apr 2018 10:37:02 -0700 Subject: Check in gVisor. PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463 --- runsc/BUILD | 17 + runsc/boot/BUILD | 88 +++ runsc/boot/capability.go | 120 ++++ runsc/boot/config.go | 162 ++++++ runsc/boot/controller.go | 128 +++++ runsc/boot/events.go | 81 +++ runsc/boot/fds.go | 61 +++ runsc/boot/filter/BUILD | 26 + runsc/boot/filter/config.go | 175 ++++++ runsc/boot/filter/extra_filters.go | 24 + runsc/boot/filter/extra_filters_msan.go | 30 + runsc/boot/filter/extra_filters_race.go | 33 ++ runsc/boot/filter/filter.go | 67 +++ runsc/boot/fs.go | 441 +++++++++++++++ runsc/boot/limits.go | 60 ++ runsc/boot/loader.go | 354 ++++++++++++ runsc/boot/loader_test.go | 238 ++++++++ runsc/boot/network.go | 213 ++++++++ runsc/boot/strace.go | 40 ++ runsc/cmd/BUILD | 58 ++ runsc/cmd/boot.go | 161 ++++++ runsc/cmd/cmd.go | 77 +++ runsc/cmd/create.go | 93 ++++ runsc/cmd/delete.go | 74 +++ runsc/cmd/events.go | 111 ++++ runsc/cmd/exec.go | 375 +++++++++++++ runsc/cmd/exec_test.go | 154 ++++++ runsc/cmd/gofer.go | 134 +++++ runsc/cmd/kill.go | 142 +++++ runsc/cmd/list.go | 117 ++++ runsc/cmd/path.go | 38 ++ runsc/cmd/ps.go | 86 +++ runsc/cmd/run.go | 82 +++ runsc/cmd/start.go | 64 +++ runsc/cmd/state.go | 73 +++ runsc/fsgofer/BUILD | 33 ++ runsc/fsgofer/fsgofer.go | 937 ++++++++++++++++++++++++++++++++ runsc/fsgofer/fsgofer_test.go | 576 ++++++++++++++++++++ runsc/fsgofer/fsgofer_unsafe.go | 58 ++ runsc/main.go | 199 +++++++ runsc/sandbox/BUILD | 53 ++ runsc/sandbox/console.go | 60 ++ runsc/sandbox/hook.go | 111 ++++ runsc/sandbox/namespace.go | 204 +++++++ runsc/sandbox/network.go | 348 ++++++++++++ runsc/sandbox/sandbox.go | 666 +++++++++++++++++++++++ runsc/sandbox/sandbox_test.go | 649 ++++++++++++++++++++++ runsc/sandbox/status.go | 56 ++ runsc/specutils/BUILD | 18 + runsc/specutils/specutils.go | 183 +++++++ 50 files changed, 8348 insertions(+) create mode 100644 runsc/BUILD create mode 100644 runsc/boot/BUILD create mode 100644 runsc/boot/capability.go create mode 100644 runsc/boot/config.go create mode 100644 runsc/boot/controller.go create mode 100644 runsc/boot/events.go create mode 100644 runsc/boot/fds.go create mode 100644 runsc/boot/filter/BUILD create mode 100644 runsc/boot/filter/config.go create mode 100644 runsc/boot/filter/extra_filters.go create mode 100644 runsc/boot/filter/extra_filters_msan.go create mode 100644 runsc/boot/filter/extra_filters_race.go create mode 100644 runsc/boot/filter/filter.go create mode 100644 runsc/boot/fs.go create mode 100644 runsc/boot/limits.go create mode 100644 runsc/boot/loader.go create mode 100644 runsc/boot/loader_test.go create mode 100644 runsc/boot/network.go create mode 100644 runsc/boot/strace.go create mode 100644 runsc/cmd/BUILD create mode 100644 runsc/cmd/boot.go create mode 100644 runsc/cmd/cmd.go create mode 100644 runsc/cmd/create.go create mode 100644 runsc/cmd/delete.go create mode 100644 runsc/cmd/events.go create mode 100644 runsc/cmd/exec.go create mode 100644 runsc/cmd/exec_test.go create mode 100644 runsc/cmd/gofer.go create mode 100644 runsc/cmd/kill.go create mode 100644 runsc/cmd/list.go create mode 100644 runsc/cmd/path.go create mode 100644 runsc/cmd/ps.go create mode 100644 runsc/cmd/run.go create mode 100644 runsc/cmd/start.go create mode 100644 runsc/cmd/state.go create mode 100644 runsc/fsgofer/BUILD create mode 100644 runsc/fsgofer/fsgofer.go create mode 100644 
runsc/fsgofer/fsgofer_test.go create mode 100644 runsc/fsgofer/fsgofer_unsafe.go create mode 100644 runsc/main.go create mode 100644 runsc/sandbox/BUILD create mode 100644 runsc/sandbox/console.go create mode 100644 runsc/sandbox/hook.go create mode 100644 runsc/sandbox/namespace.go create mode 100644 runsc/sandbox/network.go create mode 100644 runsc/sandbox/sandbox.go create mode 100644 runsc/sandbox/sandbox_test.go create mode 100644 runsc/sandbox/status.go create mode 100644 runsc/specutils/BUILD create mode 100644 runsc/specutils/specutils.go (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD new file mode 100644 index 000000000..3651c2d30 --- /dev/null +++ b/runsc/BUILD @@ -0,0 +1,17 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + +go_binary( + name = "runsc", + srcs = [ + "main.go", + ], + pure = "on", + deps = [ + "//pkg/log", + "//runsc/boot", + "//runsc/cmd", + "@com_github_google_subcommands//:go_default_library", + ], +) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD new file mode 100644 index 000000000..88736cfa4 --- /dev/null +++ b/runsc/boot/BUILD @@ -0,0 +1,88 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "boot", + srcs = [ + "capability.go", + "config.go", + "controller.go", + "events.go", + "fds.go", + "fs.go", + "limits.go", + "loader.go", + "network.go", + "strace.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/boot", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/control/server", + "//pkg/cpuid", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/control", + "//pkg/sentry/fs", + "//pkg/sentry/fs/dev", + "//pkg/sentry/fs/gofer", + "//pkg/sentry/fs/host", + "//pkg/sentry/fs/proc", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fs/sys", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/sighandling", + "//pkg/sentry/socket/epsocket", + "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/netlink", + "//pkg/sentry/socket/netlink/route", + "//pkg/sentry/socket/unix", + "//pkg/sentry/strace", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/time", + "//pkg/sentry/usage", + "//pkg/sentry/watchdog", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/link/fdbased", + "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/sniffer", + "//pkg/tcpip/network/arp", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "//pkg/urpc", + "//runsc/boot/filter", + "//runsc/specutils", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", + ], +) + +go_test( + name = "boot_test", + size = "small", + srcs = ["loader_test.go"], + embed = [":boot"], + deps = [ + "//pkg/control/server", + "//pkg/log", + "//pkg/sentry/context/contexttest", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/boot/capability.go b/runsc/boot/capability.go new file mode 100644 index 000000000..4c6a59245 --- /dev/null +++ b/runsc/boot/capability.go @@ -0,0 +1,120 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "os" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" +) + +// ApplyCaps applies the capabilities in the spec to the current thread. +// +// Note that it must be called with current thread locked. +func ApplyCaps(conf *Config, caps *specs.LinuxCapabilities) error { + setter, err := capability.NewPid2(os.Getpid()) + if err != nil { + return err + } + + bounding, err := capsFromNames(caps.Bounding) + if err != nil { + return err + } + effective, err := capsFromNames(caps.Effective) + if err != nil { + return err + } + permitted, err := capsFromNames(caps.Permitted) + if err != nil { + return err + } + inheritable, err := capsFromNames(caps.Inheritable) + if err != nil { + return err + } + ambient, err := capsFromNames(caps.Ambient) + if err != nil { + return err + } + + // Ptrace platform requires extra capabilities. + if conf.Platform == PlatformPtrace { + bounding = append(bounding, capability.CAP_SYS_PTRACE) + effective = append(effective, capability.CAP_SYS_PTRACE) + permitted = append(permitted, capability.CAP_SYS_PTRACE) + } + + setter.Set(capability.BOUNDS, bounding...) + setter.Set(capability.PERMITTED, permitted...) + setter.Set(capability.INHERITABLE, inheritable...) + setter.Set(capability.EFFECTIVE, effective...) + setter.Set(capability.AMBIENT, ambient...) 
+ return setter.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS) +} + +func capsFromNames(names []string) ([]capability.Cap, error) { + var caps []capability.Cap + for _, name := range names { + cap, ok := capFromName[name] + if !ok { + return nil, fmt.Errorf("invalid capability %q", name) + } + caps = append(caps, cap) + } + return caps, nil +} + +var capFromName = map[string]capability.Cap{ + "CAP_CHOWN": capability.CAP_CHOWN, + "CAP_DAC_OVERRIDE": capability.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": capability.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": capability.CAP_FOWNER, + "CAP_FSETID": capability.CAP_FSETID, + "CAP_KILL": capability.CAP_KILL, + "CAP_SETGID": capability.CAP_SETGID, + "CAP_SETUID": capability.CAP_SETUID, + "CAP_SETPCAP": capability.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE, + "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST, + "CAP_NET_ADMIN": capability.CAP_NET_ADMIN, + "CAP_NET_RAW": capability.CAP_NET_RAW, + "CAP_IPC_LOCK": capability.CAP_IPC_LOCK, + "CAP_IPC_OWNER": capability.CAP_IPC_OWNER, + "CAP_SYS_MODULE": capability.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": capability.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": capability.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": capability.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": capability.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": capability.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": capability.CAP_SYS_BOOT, + "CAP_SYS_NICE": capability.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": capability.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": capability.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": capability.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": capability.CAP_MKNOD, + "CAP_LEASE": capability.CAP_LEASE, + "CAP_AUDIT_WRITE": capability.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": capability.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": capability.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": capability.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": capability.CAP_MAC_ADMIN, + "CAP_SYSLOG": capability.CAP_SYSLOG, + "CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND, +} diff --git a/runsc/boot/config.go b/runsc/boot/config.go new file mode 100644 index 000000000..f3e33e89a --- /dev/null +++ b/runsc/boot/config.go @@ -0,0 +1,162 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import "fmt" + +// PlatformType tells which platform to use. +type PlatformType int + +const ( + // Ptrace runs the sandbox with the ptrace platform. + PlatformPtrace PlatformType = iota + + // KVM runs the sandbox with the KVM platform. + PlatformKVM +) + +// MakePlatformType converts type from string.
+func MakePlatformType(s string) (PlatformType, error) { + switch s { + case "ptrace": + return PlatformPtrace, nil + case "kvm": + return PlatformKVM, nil + default: + return 0, fmt.Errorf("invalid platform type %q", s) + } +} + +func (p PlatformType) String() string { + switch p { + case PlatformPtrace: + return "ptrace" + case PlatformKVM: + return "kvm" + default: + return fmt.Sprintf("unknown(%d)", p) + } +} + +// FileAccessType tells how the filesystem is accessed. +type FileAccessType int + +const ( + // FileAccessProxy sends IO requests to a Gofer process that validates the + // requests and forwards them to the host. + FileAccessProxy FileAccessType = iota + + // FileAccessDirect connects the sandbox directly to the host filesystem. + FileAccessDirect +) + +// MakeFileAccessType converts type from string. +func MakeFileAccessType(s string) (FileAccessType, error) { + switch s { + case "proxy": + return FileAccessProxy, nil + case "direct": + return FileAccessDirect, nil + default: + return 0, fmt.Errorf("invalid file access type %q", s) + } +} + +func (f FileAccessType) String() string { + switch f { + case FileAccessProxy: + return "proxy" + case FileAccessDirect: + return "direct" + default: + return fmt.Sprintf("unknown(%d)", f) + } +} + +// NetworkType tells which network stack to use. +type NetworkType int + +const ( + // NetworkSandbox uses internal network stack, isolated from the host. + NetworkSandbox NetworkType = iota + + // NetworkHost redirects network related syscalls to the host network. + NetworkHost + + // NetworkNone sets up just loopback using netstack. + NetworkNone +) + +// MakeNetworkType converts type from string. +func MakeNetworkType(s string) (NetworkType, error) { + switch s { + case "sandbox": + return NetworkSandbox, nil + case "host": + return NetworkHost, nil + case "none": + return NetworkNone, nil + default: + return 0, fmt.Errorf("invalid network type %q", s) + } +} + +func (n NetworkType) String() string { + switch n { + case NetworkSandbox: + return "sandbox" + case NetworkHost: + return "host" + case NetworkNone: + return "none" + default: + return fmt.Sprintf("unknown(%d)", n) + } +} + +// Config holds configuration that is not part of the runtime spec. +type Config struct { + // RootDir is the runtime root directory. + RootDir string + + // FileAccess indicates how the filesystem is accessed. + FileAccess FileAccessType + + // Overlay is whether to wrap the root filesystem in an overlay. + Overlay bool + + // Network indicates what type of network to use. + Network NetworkType + + // LogPackets indicates that all network packets should be logged. + LogPackets bool + + // Platform is the platform to run on. + Platform PlatformType + + // Strace indicates that strace should be enabled. + Strace bool + + // StraceSyscalls is the set of syscalls to trace. If StraceEnable is + // true and this list is empty, then all syscalls will be traced. + StraceSyscalls []string + + // StraceLogSize is the max size of data blobs to display. + StraceLogSize uint + + // DisableSeccomp indicates whether seccomp syscall filters should be + // disabled. Pardon the double negation, but default to enabled is important. + DisableSeccomp bool +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go new file mode 100644 index 000000000..4d4ef7256 --- /dev/null +++ b/runsc/boot/controller.go @@ -0,0 +1,128 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" +) + +const ( + // ApplicationStart is the URPC endpoint for starting a sandboxed app. + ApplicationStart = "application.Start" + + // ApplicationProcesses is the URPC endpoint for getting the list of + // processes running in a sandbox. + ApplicationProcesses = "application.Processes" + + // ApplicationExecute is the URPC endpoint for executing a command in a + // sandbox. + ApplicationExecute = "application.Execute" + + // ApplicationEvent is the URPC endpoint for getting stats about the + // container used by "runsc events". + ApplicationEvent = "application.Event" + + // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links + // and routes in a network stack. + NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" +) + +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-sandbox.%s", id) +} + +// controller holds the control server, and is used for communication into the +// sandbox. +type controller struct { + // srv is the control server. + srv *server.Server + + // app holds the application methods. + app *application +} + +// newController creates a new controller and starts it listening. +func newController(fd int, k *kernel.Kernel) (*controller, error) { + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + app := &application{ + startChan: make(chan struct{}), + startResultChan: make(chan error, 1), + k: k, + } + srv.Register(app) + + if eps, ok := k.NetworkStack().(*epsocket.Stack); ok { + net := &Network{ + Stack: eps.Stack, + } + srv.Register(net) + } + + if err := srv.StartServing(); err != nil { + return nil, err + } + + return &controller{ + srv: srv, + app: app, + }, nil } + +// application contains methods that control the sandboxed application. +type application struct { + // startChan is used to signal when the application process should be + // started. + startChan chan struct{} + + // startResultChan is used to signal when the application has started. Any + // errors encountered during startup will be sent to the channel. A nil value + // indicates success. + startResultChan chan error + + // k is the emulated Linux kernel on which the sandboxed + // application runs. + k *kernel.Kernel +} + +// Start will start the application process. +func (a *application) Start(_, _ *struct{}) error { + // Tell the application to start and wait for the result. + a.startChan <- struct{}{} + return <-a.startResultChan +} + +// Processes retrieves information about processes running in the sandbox.
+func (a *application) Processes(_, out *[]*control.Process) error { + return control.Processes(a.k, out) +} + +// Execute runs a command on a created or running sandbox. +func (a *application) Execute(e *control.ExecArgs, waitStatus *uint32) error { + proc := control.Proc{Kernel: a.k} + if err := proc.Exec(e, waitStatus); err != nil { + return fmt.Errorf("error executing: %+v: %v", e, err) + } + return nil +} diff --git a/runsc/boot/events.go b/runsc/boot/events.go new file mode 100644 index 000000000..ef6459b01 --- /dev/null +++ b/runsc/boot/events.go @@ -0,0 +1,81 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// Event struct for encoding the event data to JSON. Corresponds to runc's +// main.event struct. +type Event struct { + Type string `json:"type"` + ID string `json:"id"` + Data interface{} `json:"data,omitempty"` +} + +// Stats is the runc specific stats structure for stability when encoding and +// decoding stats. +// TODO: Many fields aren't obtainable due to a lack of cgroups. +type Stats struct { + Memory Memory `json:"memory"` + Pids Pids `json:"pids"` +} + +// Pids contains stats on processes. +type Pids struct { + Current uint64 `json:"current,omitempty"` + Limit uint64 `json:"limit,omitempty"` +} + +// MemoryEntry contains stats on a kind of memory. +type MemoryEntry struct { + Limit uint64 `json:"limit"` + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +// Memory contains stats on memory. +type Memory struct { + Cache uint64 `json:"cache,omitempty"` + Usage MemoryEntry `json:"usage,omitempty"` + Swap MemoryEntry `json:"swap,omitempty"` + Kernel MemoryEntry `json:"kernel,omitempty"` + KernelTCP MemoryEntry `json:"kernelTCP,omitempty"` + Raw map[string]uint64 `json:"raw,omitempty"` +} + +func (a *application) Event(_ *struct{}, out *Event) error { + stats := &Stats{} + stats.populateMemory(a.k) + stats.populatePIDs(a.k) + *out = Event{Type: "stats", Data: stats} + return nil +} + +func (s *Stats) populateMemory(k *kernel.Kernel) { + mem := k.Platform.Memory() + mem.UpdateUsage() + _, totalUsage := usage.MemoryAccounting.Copy() + s.Memory.Usage = MemoryEntry{ + Usage: totalUsage, + } +} + +func (s *Stats) populatePIDs(k *kernel.Kernel) { + s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups())) +} diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go new file mode 100644 index 000000000..0449e243d --- /dev/null +++ b/runsc/boot/fds.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// createFDMap creates an fd map that contains stdin, stdout, and stderr. If +// console is true, then ioctl calls will be passed through to the host fd. +// +// TODO: We currently aren't passing any FDs into the sandbox, so +// there's not much else for this function to do. It will get more complicated +// when gofers enter the picture. Also the LISTEN_FDS environment variable +// allows passing arbitrary FDs to the sandbox, which we do not yet support. +func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) { + fdm := k.NewFDMap() + defer fdm.DecRef() + + // Maps sandbox fd to host fd. + fdMap := map[int]int{ + 0: syscall.Stdin, + 1: syscall.Stdout, + 2: syscall.Stderr, + } + mounter := fs.FileOwnerFromContext(ctx) + + for sfd, hfd := range fdMap { + file, err := host.ImportFile(ctx, hfd, mounter, console /* allow ioctls */) + if err != nil { + return nil, fmt.Errorf("failed to import fd %d: %v", hfd, err) + } + defer file.DecRef() + if err := fdm.NewFDAt(kdefs.FD(sfd), file, kernel.FDFlags{}, l); err != nil { + return nil, fmt.Errorf("failed to add imported fd %d to FDMap: %v", hfd, err) + } + } + + fdm.IncRef() + return fdm, nil +} diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD new file mode 100644 index 000000000..fd1b18717 --- /dev/null +++ b/runsc/boot/filter/BUILD @@ -0,0 +1,26 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "filter", + srcs = [ + "config.go", + "extra_filters.go", + "extra_filters_msan.go", + "extra_filters_race.go", + "filter.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/boot/filter", + visibility = [ + "//runsc/boot:__subpackages__", + ], + deps = [ + "//pkg/log", + "//pkg/seccomp", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go new file mode 100644 index 000000000..130e987df --- /dev/null +++ b/runsc/boot/filter/config.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package filter + +import ( + "syscall" + + "golang.org/x/sys/unix" +) + +// allowedSyscalls is the set of syscalls issued by the Sentry +// to the host OS. +var allowedSyscalls = []uintptr{ + syscall.SYS_ACCEPT, + syscall.SYS_ARCH_PRCTL, + syscall.SYS_CLOCK_GETTIME, + syscall.SYS_CLONE, + syscall.SYS_CLOSE, + syscall.SYS_DUP, + syscall.SYS_DUP2, + syscall.SYS_EPOLL_CREATE1, + syscall.SYS_EPOLL_CTL, + syscall.SYS_EPOLL_PWAIT, + syscall.SYS_EPOLL_WAIT, + syscall.SYS_EVENTFD2, + syscall.SYS_EXIT, + syscall.SYS_EXIT_GROUP, + syscall.SYS_FALLOCATE, + syscall.SYS_FCHMOD, + syscall.SYS_FCNTL, + syscall.SYS_FSTAT, + syscall.SYS_FSYNC, + syscall.SYS_FTRUNCATE, + syscall.SYS_FUTEX, + syscall.SYS_GETDENTS64, + syscall.SYS_GETPID, + unix.SYS_GETRANDOM, + syscall.SYS_GETSOCKOPT, + syscall.SYS_GETTID, + syscall.SYS_GETTIMEOFDAY, + syscall.SYS_LISTEN, + syscall.SYS_LSEEK, + syscall.SYS_MADVISE, + syscall.SYS_MINCORE, + syscall.SYS_MMAP, + syscall.SYS_MPROTECT, + syscall.SYS_MUNMAP, + syscall.SYS_NEWFSTATAT, + syscall.SYS_POLL, + syscall.SYS_PREAD64, + syscall.SYS_PSELECT6, + syscall.SYS_PWRITE64, + syscall.SYS_READ, + syscall.SYS_READLINKAT, + syscall.SYS_READV, + syscall.SYS_RECVMSG, + syscall.SYS_RENAMEAT, + syscall.SYS_RESTART_SYSCALL, + syscall.SYS_RT_SIGACTION, + syscall.SYS_RT_SIGPROCMASK, + syscall.SYS_RT_SIGRETURN, + syscall.SYS_SCHED_YIELD, + syscall.SYS_SENDMSG, + syscall.SYS_SETITIMER, + syscall.SYS_SHUTDOWN, + syscall.SYS_SIGALTSTACK, + syscall.SYS_SYNC_FILE_RANGE, + syscall.SYS_TGKILL, + syscall.SYS_UTIMENSAT, + syscall.SYS_WRITE, + syscall.SYS_WRITEV, +} + +// TODO: Ioctl is needed in order to support tty consoles. +// Once filters support argument-checking, we should only allow ioctl +// with tty-related arguments. +func consoleFilters() []uintptr { + return []uintptr{ + syscall.SYS_IOCTL, + } +} + +// whitelistFSFilters returns syscalls made by whitelistFS. Using WhitelistFS +// is less secure because it runs inside the Sentry and must be able to perform +// file operations that would otherwise be disabled by seccomp when a Gofer is +// used. When whitelistFS is not used, opening new FDs in the Sentry is +// disallowed. +func whitelistFSFilters() []uintptr { + return []uintptr{ + syscall.SYS_ACCESS, + syscall.SYS_FCHMOD, + syscall.SYS_FSTAT, + syscall.SYS_FSYNC, + syscall.SYS_FTRUNCATE, + syscall.SYS_GETCWD, + syscall.SYS_GETDENTS, + syscall.SYS_GETDENTS64, + syscall.SYS_LSEEK, + syscall.SYS_LSTAT, + syscall.SYS_MKDIR, + syscall.SYS_MKDIRAT, + syscall.SYS_NEWFSTATAT, + syscall.SYS_OPEN, + syscall.SYS_OPENAT, + syscall.SYS_PREAD64, + syscall.SYS_PWRITE64, + syscall.SYS_READ, + syscall.SYS_READLINK, + syscall.SYS_READLINKAT, + syscall.SYS_RENAMEAT, + syscall.SYS_STAT, + syscall.SYS_SYMLINK, + syscall.SYS_SYMLINKAT, + syscall.SYS_SYNC_FILE_RANGE, + syscall.SYS_UNLINK, + syscall.SYS_UNLINKAT, + syscall.SYS_UTIMENSAT, + syscall.SYS_WRITE, + } +} + +// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. +func hostInetFilters() []uintptr { + return []uintptr{ + syscall.SYS_ACCEPT4, + syscall.SYS_BIND, + syscall.SYS_CONNECT, + syscall.SYS_GETPEERNAME, + syscall.SYS_GETSOCKNAME, + syscall.SYS_GETSOCKOPT, + syscall.SYS_IOCTL, + syscall.SYS_LISTEN, + syscall.SYS_READV, + syscall.SYS_RECVFROM, + syscall.SYS_RECVMSG, + syscall.SYS_SENDMSG, + syscall.SYS_SENDTO, + syscall.SYS_SETSOCKOPT, + syscall.SYS_SHUTDOWN, + syscall.SYS_SOCKET, + syscall.SYS_WRITEV, + } +} + +// ptraceFilters returns syscalls made exclusively by the ptrace platform.
+func ptraceFilters() []uintptr { + return []uintptr{ + syscall.SYS_PTRACE, + syscall.SYS_WAIT4, + unix.SYS_GETCPU, + unix.SYS_SCHED_SETAFFINITY, + } +} + +// kvmFilters returns syscalls made exclusively by the KVM platform. +func kvmFilters() []uintptr { + return []uintptr{ + syscall.SYS_IOCTL, + syscall.SYS_RT_SIGSUSPEND, + syscall.SYS_RT_SIGTIMEDWAIT, + 0xffffffffffffffff, // KVM uses syscall -1 to transition to host. + } +} diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go new file mode 100644 index 000000000..e10d9bf4c --- /dev/null +++ b/runsc/boot/filter/extra_filters.go @@ -0,0 +1,24 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !msan,!race + +package filter + +// instrumentationFilters returns additional filters for syscalls used by +// Go instrumentation tools, e.g. -race, -msan. +// Returns empty when disabled. +func instrumentationFilters() []uintptr { + return nil +} diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go new file mode 100644 index 000000000..a862340f6 --- /dev/null +++ b/runsc/boot/filter/extra_filters_msan.go @@ -0,0 +1,30 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build msan + +package filter + +import ( + "syscall" +) + +// instrumentationFilters returns additional filters for syscalls used by MSAN. +func instrumentationFilters() []uintptr { + Report("MSAN is enabled: syscall filters less restrictive!") + return []uintptr{ + syscall.SYS_SCHED_GETAFFINITY, + syscall.SYS_SET_ROBUST_LIST, + } +} diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go new file mode 100644 index 000000000..b0c74a58a --- /dev/null +++ b/runsc/boot/filter/extra_filters_race.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +// +build race + +package filter + +import ( + "syscall" +) + +// instrumentationFilters returns additional filters for syscalls used by TSAN. +func instrumentationFilters() []uintptr { + Report("TSAN is enabled: syscall filters less restrictive!") + return []uintptr{ + syscall.SYS_BRK, + syscall.SYS_MUNLOCK, + syscall.SYS_NANOSLEEP, + syscall.SYS_OPEN, + syscall.SYS_SET_ROBUST_LIST, + } +} diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go new file mode 100644 index 000000000..3ba56a318 --- /dev/null +++ b/runsc/boot/filter/filter.go @@ -0,0 +1,67 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filter defines all syscalls the sandbox is allowed to make +// to the host, and installs seccomp filters to prevent prohibited +// syscalls in case it's compromised. +package filter + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/seccomp" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" +) + +// Install installs seccomp filters based on the given platform. +func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error { + s := allowedSyscalls + + // Set of additional filters used by -race and -msan. Returns empty + // when not enabled. + s = append(s, instrumentationFilters()...) + + if whitelistFS { + Report("direct file access allows unrestricted file access!") + s = append(s, whitelistFSFilters()...) + } + if console { + Report("console is enabled: syscall filters less restrictive!") + s = append(s, consoleFilters()...) + } + if hostNetwork { + Report("host networking enabled: syscall filters less restrictive!") + s = append(s, hostInetFilters()...) + } + + switch p := p.(type) { + case *ptrace.PTrace: + s = append(s, ptraceFilters()...) + case *kvm.KVM: + s = append(s, kvmFilters()...) + default: + return fmt.Errorf("unknown platform type %T", p) + } + + // TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported. + return seccomp.Install(s, false) +} + +// Report writes a warning message to the log. +func Report(msg string) { + log.Warningf("*** SECCOMP WARNING: %s", msg) +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go new file mode 100644 index 000000000..2073bd0b1 --- /dev/null +++ b/runsc/boot/fs.go @@ -0,0 +1,441 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package boot + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type fdDispenser struct { + fds []int +} + +func (f *fdDispenser) remove() int { + rv := f.fds[0] + f.fds = f.fds[1:] + return rv +} + +func (f *fdDispenser) empty() bool { + return len(f.fds) == 0 +} + +// createMountNamespace creates a mount manager containing the root filesystem +// and all mounts. +func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + fds := &fdDispenser{fds: ioFDs} + + // Create the MountNamespace from the root. + rootInode, err := createRootMount(ctx, spec, conf, fds) + if err != nil { + return nil, fmt.Errorf("failed to create root overlay: %v", err) + } + mns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + return nil, fmt.Errorf("failed to construct MountNamespace: %v", err) + } + + // Keep track of whether proc, sys, and tmp were mounted. + var procMounted, sysMounted, tmpMounted bool + + // Mount all submounts from the spec. + for _, m := range spec.Mounts { + // OCI spec uses many different mounts for the things inside of '/dev'. We + // have a single mount at '/dev' that is always mounted, regardless of + // whether it was asked for, as the spec says we SHOULD. + if strings.HasPrefix(m.Destination, "/dev") { + log.Warningf("ignoring dev mount at %q", m.Destination) + continue + } + switch m.Destination { + case "/proc": + procMounted = true + case "/sys": + sysMounted = true + case "/tmp": + tmpMounted = true + } + + if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil { + return nil, err + } + } + + // Always mount /dev. + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "devtmpfs", + Destination: "/dev", + }); err != nil { + return nil, err + } + + // Mount proc and sys even if the user did not ask for it, as the spec + // says we SHOULD. + if !procMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "proc", + Destination: "/proc", + }); err != nil { + return nil, err + } + } + if !sysMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "sysfs", + Destination: "/sys", + }); err != nil { + return nil, err + } + } + + // Technically we don't have to mount tmpfs at /tmp, as we could just + // rely on the host /tmp, but this is a nice optimization, and fixes + // some apps that call mknod in /tmp. + if !tmpMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "tmpfs", + Destination: "/tmp", + }); err != nil { + return nil, err + } + } + + if !fds.empty() { + return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) + } + + return mns, nil +} + +// createRootMount creates the root filesystem. 
+func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { + // First construct the filesystem from the spec.Root. + mf := fs.MountSourceFlags{ + ReadOnly: spec.Root.Readonly, + NoAtime: true, + } + + var ( + rootInode *fs.Inode + err error + ) + switch conf.FileAccess { + case FileAccessProxy: + fd := fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + hostFS := mustFindFilesystem("9p") + rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) + if err != nil { + return nil, fmt.Errorf("failed to generate root mount point: %v", err) + } + + case FileAccessDirect: + hostFS := mustFindFilesystem("whitelistfs") + rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true") + if err != nil { + return nil, fmt.Errorf("failed to generate root mount point: %v", err) + } + + default: + return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess) + } + + // We need to overlay the root on top of a ramfs with stub directories + // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always + // mounted even if they are not in the spec. + submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + if err != nil { + return nil, fmt.Errorf("error adding submount overlay: %v", err) + } + + if conf.Overlay { + log.Debugf("Adding overlay on top of root mount") + // Overlay a tmpfs filesystem on top of the root. + rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + if err != nil { + return nil, err + } + } + + log.Infof("Mounted %q to \"/\" type root", spec.Root.Path) + return rootInode, nil +} + +func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { + // Upper layer uses the same flags as lower, but it must be read-write. + lowerFlags.ReadOnly = false + + tmpFS := mustFindFilesystem("tmpfs") + upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") + if err != nil { + return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err) + } + return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) +} + +func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + var data []string + var fsName string + var useOverlay bool + switch m.Type { + case "proc", "sysfs", "devtmpfs": + fsName = m.Type + case "none": + fsName = "sysfs" + case "tmpfs": + fsName = m.Type + + // tmpfs has some extra supported options that we must pass through. + var err error + data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + if err != nil { + return err + } + case "bind": + switch conf.FileAccess { + case FileAccessProxy: + fd := fds.remove() + fsName = "9p" + data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"} + case FileAccessDirect: + fsName = "whitelistfs" + data = []string{"root=" + m.Source, "dont_translate_ownership=true"} + default: + return fmt.Errorf("invalid file access type: %v", conf.FileAccess) + } + + fi, err := os.Stat(m.Source) + if err != nil { + return err + } + // Add overlay to all writable mounts, except when mapping an individual file. 
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly && fi.Mode().IsDir() + default: + // TODO: Support all the mount types and make this a + // fatal error. Most applications will "just work" without + // them, so this is a warning for now. + log.Warningf("ignoring unknown filesystem type %q", m.Type) + return nil + } + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(m.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + mf.NoAtime = true + + inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) + if err != nil { + return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) + } + + // If there are submounts, we need to overlay the mount on top of a + // ramfs with stub directories for submount paths. + // + // We do not do this for /dev, since there will usually be submounts in + // the spec, but our devfs implementation contains all the necessary + // directories and files (well, most of them anyways). + if m.Destination != "/dev" { + submounts := subtargets(m.Destination, spec.Mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("error adding submount overlay: %v", err) + } + } + } + + if useOverlay { + log.Debugf("Adding overlay on top of mount %q", m.Destination) + if inode, err = addOverlay(ctx, conf, inode, m.Type, mf); err != nil { + return err + } + } + + root := mns.Root() + defer root.DecRef() + dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + } + defer dirent.DecRef() + if err := mns.Mount(ctx, dirent, inode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + } + + log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + return nil +} + +func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + root := mns.Root() + defer root.DecRef() + + // Starting at the root, walk the path. + parent := root + ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) + for i := 0; i < len(ps); i++ { + if ps[i] == "" { + // This will be the case for the first and last element, if the path + // begins or ends with '/'. Note that we always treat the path as + // absolute, regardless of what the first character contains. + continue + } + d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + if err == syserror.ENOENT { + // If we encounter a path that does not exist, then + // create it. + if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + } + if d, err = parent.Walk(ctx, root, ps[i]); err != nil { + return fmt.Errorf("walk to %q failed: %v", ps[i], err) + } + } else if err != nil { + return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + } + parent = d + } + return nil +} + +// parseAndFilterOptions parses a MountOptions slice and filters by the allowed +// keys.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { + var out []string + for _, o := range opts { + kv := strings.Split(o, "=") + switch len(kv) { + case 1: + if contains(allowedKeys, o) { + out = append(out, o) + continue + } + log.Warningf("ignoring unsupported key %q", kv) + case 2: + if contains(allowedKeys, kv[0]) { + out = append(out, o) + continue + } + log.Warningf("ignoring unsupported key %q", kv[0]) + default: + return nil, fmt.Errorf("invalid option %q", o) + } + } + return out, nil +} + +func destinations(mounts []specs.Mount, extra ...string) []string { + var ds []string + for _, m := range mounts { + ds = append(ds, m.Destination) + } + return append(ds, extra...) +} + +func mountFlags(opts []string) fs.MountSourceFlags { + mf := fs.MountSourceFlags{} + for _, o := range opts { + switch o { + case "ro": + mf.ReadOnly = true + case "noatime": + mf.NoAtime = true + default: + log.Warningf("ignoring unknown mount option %q", o) + } + } + return mf +} + +func contains(strs []string, str string) bool { + for _, s := range strs { + if s == str { + return true + } + } + return false +} + +func mustFindFilesystem(name string) fs.Filesystem { + fs, ok := fs.FindFilesystem(name) + if !ok { + panic(fmt.Sprintf("could not find filesystem %q", name)) + } + return fs +} + +// addSubmountOverlay overlays the inode over a ramfs tree containing the given +// paths. +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { + // There is no real filesystem backing this ramfs tree, so we pass in + // "nil" here. + mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), submounts) + if err != nil { + return nil, fmt.Errorf("error creating mount tree: %v", err) + } + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + if err != nil { + return nil, fmt.Errorf("failed to make mount overlay: %v", err) + } + return overlayInode, err +} + +// subtargets takes a set of Mounts and returns only the targets that are +// children of the given root. The returned paths are relative to the root. +func subtargets(root string, mnts []specs.Mount) []string { + r := filepath.Clean(root) + var targets []string + for _, mnt := range mnts { + t := filepath.Clean(mnt.Destination) + if strings.HasPrefix(t, r) { + // Make the mnt path relative to the root path. If the + // result is empty, then mnt IS the root mount, not a + // submount. We don't want to include those. + if t := strings.TrimPrefix(t, r); t != "" { + targets = append(targets, t) + } + } + } + return targets +} diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go new file mode 100644 index 000000000..ea72de8e9 --- /dev/null +++ b/runsc/boot/limits.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package boot + +import ( + "fmt" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// Mapping from linux resource names to limits.LimitType. +var fromLinuxResource = map[string]limits.LimitType{ + "RLIMIT_CPU": limits.CPU, + "RLIMIT_FSIZE": limits.FileSize, + "RLIMIT_DATA": limits.Data, + "RLIMIT_STACK": limits.Stack, + "RLIMIT_CORE": limits.Core, + "RLIMIT_RSS": limits.Rss, + "RLIMIT_NPROC": limits.ProcessCount, + "RLIMIT_NOFILE": limits.NumberOfFiles, + "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, + "RLIMIT_AS": limits.AS, + "RLIMIT_LOCKS": limits.Locks, + "RLIMIT_SIGPENDING": limits.SignalsPending, + "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, + "RLIMIT_NICE": limits.Nice, + "RLIMIT_RTPRIO": limits.RealTimePriority, + "RLIMIT_RTTIME": limits.Rttime, +} + +func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { + ls, err := limits.NewLinuxDistroLimitSet() + if err != nil { + return nil, err + } + for _, rl := range spec.Process.Rlimits { + lt, ok := fromLinuxResource[rl.Type] + if !ok { + return nil, fmt.Errorf("unknown resource %q", rl.Type) + } + ls.SetUnchecked(lt, limits.Limit{ + Cur: rl.Soft, + Max: rl.Hard, + }) + } + return ls, nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go new file mode 100644 index 000000000..a470cb054 --- /dev/null +++ b/runsc/boot/loader.go @@ -0,0 +1,354 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package boot loads the kernel and runs the application. +package boot + +import ( + "fmt" + "math/rand" + "sync/atomic" + "syscall" + gtime "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" + "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" + slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" + "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" + "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp" + "gvisor.googlesource.com/gvisor/runsc/boot/filter" + "gvisor.googlesource.com/gvisor/runsc/specutils" + + // Include supported socket providers. 
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" +) + +// Loader keeps state needed to start the kernel and run the application. +type Loader struct { + // k is the kernel. + k *kernel.Kernel + + // ctrl is the control server. + ctrl *controller + + conf *Config + + // console is set to true if terminal is enabled. + console bool + + watchdog *watchdog.Watchdog + + // stopSignalForwarding disables forwarding of signals to the sandboxed + // app. It should be called when a sandbox is destroyed. + stopSignalForwarding func() + + // procArgs refers to the initial application task. + procArgs kernel.CreateProcessArgs +} + +func init() { + // Initialize the random number generator. + rand.Seed(gtime.Now().UnixNano()) + + // Register the global syscall table. + kernel.RegisterSyscallTable(slinux.AMD64) +} + +// New initializes a new kernel loader configured by spec. +func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) { + // Create kernel and platform. + p, err := createPlatform(conf) + if err != nil { + return nil, fmt.Errorf("error creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + + // Create VDSO. + vdso, err := loader.PrepareVDSO(p) + if err != nil { + return nil, fmt.Errorf("error creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("error creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + // Create initial limits. + ls, err := createLimitSet(spec) + if err != nil { + return nil, fmt.Errorf("error creating limits: %v", err) + } + + // Create capabilities. + caps, err := specutils.Capabilities(spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) + if err != nil { + return nil, fmt.Errorf("error creating credentials: %v", err) + } + + // Create user namespace. + // TODO: Not clear what domain name should be here. It is + // not configurable from runtime spec. + utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) + + ipcns := kernel.NewIPCNamespace() + + if err := enableStrace(conf); err != nil { + return nil, fmt.Errorf("failed to enable strace: %v", err) + } + + // Get the executable path, which is a bit tricky because we have to + // inspect the environment PATH which is relative to the root path. + exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env) + if err != nil { + return nil, fmt.Errorf("error getting executable path: %v", err) + } + + // Create the process arguments. 
+ procArgs := kernel.CreateProcessArgs{ + Filename: exec, + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: spec.Process.Cwd, + Credentials: creds, + // Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so + // it must wait until we have a Kernel. + Umask: uint(syscall.Umask(0)), + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: utsns, + IPCNamespace: ipcns, + } + + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). + networkStack := newEmptyNetworkStack(conf) + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + NetworkStack: networkStack, + ApplicationCores: 8, + Vdso: vdso, + RootUTSNamespace: utsns, + RootIPCNamespace: ipcns, + }); err != nil { + return nil, fmt.Errorf("error initializing kernel: %v", err) + } + + // Turn on packet logging if enabled. + if conf.LogPackets { + log.Infof("Packet logging enabled") + atomic.StoreUint32(&sniffer.LogPackets, 1) + } else { + log.Infof("Packet logging disabled") + atomic.StoreUint32(&sniffer.LogPackets, 0) + } + + // Create the control server using the provided FD. + // + // This must be done *after* we have initialized the kernel since the + // controller is used to configure the kernel's network stack. + // + // This should also be *before* we create the process, since a + // misconfigured process will cause an error, and we want the control + // server up before that so that we don't time out trying to connect to + // it. + ctrl, err := newController(controllerFD, k) + if err != nil { + return nil, fmt.Errorf("error creating control server: %v", err) + } + + ctx := procArgs.NewContext(k) + + // Create the virtual filesystem. + mm, err := createMountNamespace(ctx, spec, conf, ioFDs) + if err != nil { + return nil, fmt.Errorf("error creating mounts: %v", err) + } + k.SetRootMountNamespace(mm) + + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. + fdm, err := createFDMap(ctx, k, ls, console) + if err != nil { + return nil, fmt.Errorf("error importing fds: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. We + // won't need ours either way. + procArgs.FDMap = fdm + + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + } + // Ensure that most signals received in sentry context are forwarded to + // the emulated kernel. + stopSignalForwarding := sighandling.StartForwarding(k) + + watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning) + return &Loader{ + k: k, + ctrl: ctrl, + conf: conf, + console: console, + watchdog: watchdog, + stopSignalForwarding: stopSignalForwarding, + procArgs: procArgs, + }, nil +} + +// Destroy cleans up all resources used by the loader. +func (l *Loader) Destroy() { + if l.ctrl != nil { + // Shut down control server. 
+ l.ctrl.srv.Stop() + } + l.stopSignalForwarding() + l.watchdog.Stop() +} + +func createPlatform(conf *Config) (platform.Platform, error) { + switch conf.Platform { + case PlatformPtrace: + log.Infof("Platform: ptrace") + return ptrace.New() + case PlatformKVM: + log.Infof("Platform: kvm") + return kvm.New() + default: + return nil, fmt.Errorf("invalid platform %v", conf.Platform) + } +} + +// Run runs the application. +func (l *Loader) Run() error { + err := l.run() + l.ctrl.app.startResultChan <- err + return err +} + +func (l *Loader) run() error { + if l.conf.Network == NetworkHost { + // Delay host network configuration to this point because network namespace + // is configured after the loader is created and before Run() is called. + log.Debugf("Configuring host network") + stack := l.k.NetworkStack().(*hostinet.Stack) + if err := stack.Configure(); err != nil { + return err + } + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. Running in less secure mode.") + } else { + whitelistFS := l.conf.FileAccess == FileAccessDirect + hostNet := l.conf.Network == NetworkHost + if err := filter.Install(l.k.Platform, whitelistFS, l.console, hostNet); err != nil { + return fmt.Errorf("Failed to install seccomp filters: %v", err) + } + } + + // Create the initial application task. + if _, err := l.k.CreateProcess(l.procArgs); err != nil { + return fmt.Errorf("failed to create init process: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. + l.procArgs.FDMap.DecRef() + + l.watchdog.Start() + return l.k.Start() +} + +// WaitForStartSignal waits for a start signal from the control server. +func (l *Loader) WaitForStartSignal() { + <-l.ctrl.app.startChan +} + +// WaitExit waits for the application to exit, and returns the application's +// exit status. +func (l *Loader) WaitExit() kernel.ExitStatus { + // Wait for application. + l.k.WaitExited() + + return l.k.GlobalInit().ExitStatus() +} + +func newEmptyNetworkStack(conf *Config) inet.Stack { + switch conf.Network { + case NetworkHost: + return hostinet.NewStack() + + case NetworkNone, NetworkSandbox: + // NetworkNone sets up loopback using netstack. + netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} + protoNames := []string{tcp.ProtocolName, udp.ProtocolName} + return &epsocket.Stack{stack.New(netProtos, protoNames)} + + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } +} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go new file mode 100644 index 000000000..2fc16b241 --- /dev/null +++ b/runsc/boot/loader_test.go @@ -0,0 +1,238 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
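The Loader above is driven in a fixed sequence: New builds the platform, kernel, mounts, and control server; WaitForStartSignal blocks until the controller receives the Start message from the parent runsc process; Run creates the init process; and WaitExit/Destroy handle shutdown. The following is a condensed sketch of that sequence, modeled on the createLoader/TestRun helpers in the test file below and on runsc/cmd/boot.go later in this change; the spec contents, Config values, and control-socket address are placeholders, and the program only builds inside the gVisor tree.

    package main

    import (
        "log"

        specs "github.com/opencontainers/runtime-spec/specs-go"
        "gvisor.googlesource.com/gvisor/pkg/control/server"
        "gvisor.googlesource.com/gvisor/runsc/boot"
    )

    func main() {
        // Stream socket for the control server; the ID used for the address is arbitrary here.
        fd, err := server.CreateSocket(boot.ControlSocketAddr("example"))
        if err != nil {
            log.Fatalf("creating control socket: %v", err)
        }

        // Minimal spec, mirroring the one used by the loader tests.
        spec := &specs.Spec{
            Root:    &specs.Root{Path: "/", Readonly: true},
            Process: &specs.Process{Args: []string{"/bin/true"}},
        }
        conf := &boot.Config{
            RootDir:        "/tmp/runsc-example", // illustrative only
            Network:        boot.NetworkNone,
            FileAccess:     boot.FileAccessDirect,
            DisableSeccomp: true,
        }

        l, err := boot.New(spec, conf, fd, nil /* ioFDs */, false /* console */)
        if err != nil {
            log.Fatalf("creating loader: %v", err)
        }
        defer l.Destroy()

        // Blocks until the control server's Start method fires, which in the
        // real flow is triggered by the parent runsc process.
        l.WaitForStartSignal()

        if err := l.Run(); err != nil {
            log.Fatalf("running application: %v", err)
        }
        ws := l.WaitExit()
        log.Printf("application exited with %+v", ws)
    }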
+ +package boot + +import ( + "os" + "testing" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" +) + +func init() { + log.SetLevel(log.Debug) +} + +// testSpec returns a simple spec that can be used in tests. +func testSpec() *specs.Spec { + return &specs.Spec{ + // The host filesystem root is the sandbox root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + } +} + +func createLoader() (*Loader, error) { + fd, err := server.CreateSocket(ControlSocketAddr("123")) + if err != nil { + return nil, err + } + conf := &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessDirect, + DisableSeccomp: true, + } + return New(testSpec(), conf, fd, nil, false) +} + +// TestRun runs a simple application in a sandbox and checks that it succeeds. +func TestRun(t *testing.T) { + s, err := createLoader() + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + defer s.Destroy() + + // Run the application. + if err := s.Run(); err != nil { + t.Errorf("error running application: %v", err) + } + + // Wait for the application to exit. It should succeed. + if status := s.WaitExit(); status.Code != 0 || status.Signo != 0 { + t.Errorf("application exited with status %+v, want 0", status) + } +} + +// TestStartSignal tests that the controller Start message will cause +// WaitForStartSignal to return. +func TestStartSignal(t *testing.T) { + s, err := createLoader() + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + defer s.Destroy() + + // We aren't going to wait on this application, so the control server + // needs to be shut down manually. + defer s.ctrl.srv.Stop() + + // Start a goroutine that calls WaitForStartSignal and writes to a + // channel when it returns. + waitFinished := make(chan struct{}) + go func() { + s.WaitForStartSignal() + // Pretent that Run() executed and returned no error. + s.ctrl.app.startResultChan <- nil + waitFinished <- struct{}{} + }() + + // Nothing has been written to the channel, so waitFinished should not + // return. Give it a little bit of time to make sure the goroutine has + // started. + select { + case <-waitFinished: + t.Errorf("WaitForStartSignal completed but it should not have") + case <-time.After(50 * time.Millisecond): + // OK. + } + + // Trigger the control server Start method. + if err := s.ctrl.app.Start(nil, nil); err != nil { + t.Errorf("error calling Start: %v", err) + } + + // Now WaitForStartSignal should return (within a short amount of + // time). + select { + case <-waitFinished: + // OK. + case <-time.After(50 * time.Millisecond): + t.Errorf("WaitForStartSignal did not complete but it should have") + } + +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespace(t *testing.T) { + conf := &Config{ + RootDir: "unused_root_dir", + FileAccess: FileAccessDirect, + DisableSeccomp: true, + } + + testCases := []struct { + name string + // Spec that will be used to create the mount manager. Note + // that we can't mount procfs without a kernel, so each spec + // MUST contain something other than procfs mounted at /proc. + spec specs.Spec + // Paths that are expected to exist in the resulting fs. + expectedPaths []string + }{ + { + // Only proc. 
+ name: "only proc mount", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /proc, /dev, and /sys should always be mounted. + expectedPaths: []string{"/proc", "/dev", "/sys"}, + }, + { + // Mount at a deep path, with many components that do + // not exist in the root. + name: "deep mount path", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/some/very/very/deep/path", + Type: "tmpfs", + }, + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /some/deep/path should be mounted, along with /proc, + // /dev, and /sys. + expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + { + // Mounts are nested inside eachother. + name: "nested mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/foo", + Type: "tmpfs", + }, + { + Destination: "/foo/bar", + Type: "tmpfs", + }, + { + Destination: "/foo/bar/baz", + Type: "tmpfs", + }, + { + // A deep path that is in foo but not the other mounts. + Destination: "/foo/some/very/very/deep/path", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + } + + for _, tc := range testCases { + ctx := contexttest.Context(t) + mm, err := createMountNamespace(ctx, &tc.spec, conf, nil) + if err != nil { + t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) + } + defer mm.DecRef() + root := mm.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } + } + } +} diff --git a/runsc/boot/network.go b/runsc/boot/network.go new file mode 100644 index 000000000..d2b52c823 --- /dev/null +++ b/runsc/boot/network.go @@ -0,0 +1,213 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "math/rand" + "net" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/loopback" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" + "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" + "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// Network exposes methods that can be used to configure a network stack. +type Network struct { + Stack *stack.Stack +} + +// Route represents a route in the network stack. 
+type Route struct { + Destination net.IP + Mask net.IPMask + Gateway net.IP +} + +// DefaultRoute represents a catch all route to the default gateway. +type DefaultRoute struct { + Route Route + Name string +} + +// FDBasedLink configures an fd-based link. +type FDBasedLink struct { + Name string + MTU int + Addresses []net.IP + Routes []Route +} + +// LoopbackLink configures a loopback li nk. +type LoopbackLink struct { + Name string + Addresses []net.IP + Routes []Route +} + +// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. +type CreateLinksAndRoutesArgs struct { + // FilePayload contains the fds associated with the FDBasedLinks. The + // two slices must have the same length. + urpc.FilePayload + + LoopbackLinks []LoopbackLink + FDBasedLinks []FDBasedLink + + DefaultGateway DefaultRoute +} + +// Empty returns true if route hasn't been set. +func (r *Route) Empty() bool { + return r.Destination == nil && r.Mask == nil && r.Gateway == nil +} + +func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route { + return tcpip.Route{ + Destination: ipToAddress(r.Destination), + Gateway: ipToAddress(r.Gateway), + Mask: ipToAddress(net.IP(r.Mask)), + NIC: id, + } +} + +// CreateLinksAndRoutes creates links and routes in a network stack. It should +// only be called once. +func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { + if len(args.FilePayload.Files) != len(args.FDBasedLinks) { + return fmt.Errorf("FilePayload must be same length at FDBasedLinks") + } + + var nicID tcpip.NICID + nicids := make(map[string]tcpip.NICID) + + // Collect routes from all links. + var routes []tcpip.Route + + // Loopback normally appear before other interfaces. + for _, link := range args.LoopbackLinks { + nicID++ + nicids[link.Name] = nicID + + linkEP := loopback.New() + + log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + routes = append(routes, r.toTcpipRoute(nicID)) + } + } + + for i, link := range args.FDBasedLinks { + nicID++ + nicids[link.Name] = nicID + + // Copy the underlying FD. + oldFD := args.FilePayload.Files[i].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + + linkEP := fdbased.New(&fdbased.Options{ + FD: newFD, + MTU: uint32(link.MTU), + ChecksumOffload: false, + EthernetHeader: true, + Address: tcpip.LinkAddress(generateRndMac()), + }) + + log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + routes = append(routes, r.toTcpipRoute(nicID)) + } + } + + if !args.DefaultGateway.Route.Empty() { + nicID, ok := nicids[args.DefaultGateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name) + } + routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(nicID)) + } + + log.Infof("Setting routes %+v", routes) + n.Stack.SetRouteTable(routes) + return nil +} + +// createNICWithAddrs creates a NIC in the network stack and adds the given +// addresses. 
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP) error { + if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil { + return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + } + + // Always start with an arp address for the NIC. + if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + + for _, addr := range addrs { + proto, tcpipAddr := ipToAddressAndProto(addr) + if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil { + return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) + } + } + return nil +} + +// ipToAddressAndProto converts IP to tcpip.Address and a protocol number. +// +// Note: don't use 'len(ip)' to determine IP version because length is always 16. +func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) { + if i4 := ip.To4(); i4 != nil { + return ipv4.ProtocolNumber, tcpip.Address(i4) + } + return ipv6.ProtocolNumber, tcpip.Address(ip) +} + +// ipToAddress converts IP to tcpip.Address, ignoring the protocol. +func ipToAddress(ip net.IP) tcpip.Address { + _, addr := ipToAddressAndProto(ip) + return addr +} + +// generateRndMac returns a random local MAC address. +// Copied from eth_random_addr() (include/linux/etherdevice.h) +func generateRndMac() net.HardwareAddr { + mac := make(net.HardwareAddr, 6) + rand.Read(mac) + mac[0] &^= 0x1 // clear multicast bit + mac[0] |= 0x2 // set local assignment bit (IEEE802) + return mac +} diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go new file mode 100644 index 000000000..1e898672b --- /dev/null +++ b/runsc/boot/strace.go @@ -0,0 +1,40 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/strace" +) + +func enableStrace(conf *Config) error { + // We must initialize even if strace is not enabled. 
+ strace.Initialize() + + if !conf.Strace { + return nil + } + + max := conf.StraceLogSize + if max == 0 { + max = 1024 + } + strace.LogMaximumSize = max + + if len(conf.StraceSyscalls) == 0 { + strace.EnableAll(strace.SinkTypeLog) + return nil + } + return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog) +} diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD new file mode 100644 index 000000000..128c8f7e6 --- /dev/null +++ b/runsc/cmd/BUILD @@ -0,0 +1,58 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "cmd", + srcs = [ + "boot.go", + "cmd.go", + "create.go", + "delete.go", + "events.go", + "exec.go", + "gofer.go", + "kill.go", + "list.go", + "path.go", + "ps.go", + "run.go", + "start.go", + "state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/cmd", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/log", + "//pkg/p9", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/unet", + "//pkg/urpc", + "//runsc/boot", + "//runsc/fsgofer", + "//runsc/sandbox", + "//runsc/specutils", + "@com_github_google_subcommands//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "cmd_test", + size = "small", + srcs = ["exec_test.go"], + embed = [":cmd"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/urpc", + "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go new file mode 100644 index 000000000..0dad6da79 --- /dev/null +++ b/runsc/cmd/boot.go @@ -0,0 +1,161 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "os" + "runtime" + "runtime/debug" + "strings" + "syscall" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Boot implements subcommands.Command for the "boot" command which starts a +// new sandbox. It should not be called directly. +type Boot struct { + // bundleDir is the path to the bundle directory. + bundleDir string + + // controllerFD is the file descriptor of a stream socket for the + // control server that is donated to this process. + controllerFD int + + // ioFDs is the list of FDs used to connect to FS gofers. + ioFDs intFlags + + // console is set to true if the sandbox should allow terminal ioctl(2) + // syscalls. + console bool + + // applyCaps determines if capabilities defined in the spec should be applied + // to the process. + applyCaps bool +} + +// Name implements subcommands.Command.Name. 
+func (*Boot) Name() string { + return "boot" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Boot) Synopsis() string { + return "launch a sandbox process (internal use only)" +} + +// Usage implements subcommands.Command.Usage. +func (*Boot) Usage() string { + return `boot [flags]` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (b *Boot) SetFlags(f *flag.FlagSet) { + f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") + f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") + f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") + f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") + f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") +} + +// Execute implements subcommands.Command.Execute. It starts a sandbox in a +// waiting state. +func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if b.bundleDir == "" || b.controllerFD == -1 || f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + + // Ensure that if there is a panic, all goroutine stacks are printed. + debug.SetTraceback("all") + + // Get the spec from the bundleDir. + spec, err := specutils.ReadSpec(b.bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + specutils.LogSpec(spec) + + // Turn any relative paths in the spec to absolute by prepending the bundleDir. + spec.Root.Path = absPath(b.bundleDir, spec.Root.Path) + for _, m := range spec.Mounts { + if m.Source != "" { + m.Source = absPath(b.bundleDir, m.Source) + } + } + + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + if b.applyCaps { + setCapsAndCallSelf(conf, spec) + Fatalf("setCapsAndCallSelf must never return") + } + + // Create the loader. + s, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) + if err != nil { + Fatalf("error creating loader: %v", err) + } + defer s.Destroy() + + // Wait for the start signal from runsc. + s.WaitForStartSignal() + + // Run the application and wait for it to finish. + if err := s.Run(); err != nil { + Fatalf("error running sandbox: %v", err) + } + + ws := s.WaitExit() + log.Infof("application exiting with %+v", ws) + *waitStatus = syscall.WaitStatus(ws.Status()) + return subcommands.ExitSuccess +} + +// setCapsAndCallSelf sets capabilities to the current thread and then execve's +// itself again with the same arguments except '--apply-caps' to restart the +// whole process with the desired capabilities. +func setCapsAndCallSelf(conf *boot.Config, spec *specs.Spec) { + // Keep thread locked while capabilities are changed. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if err := boot.ApplyCaps(conf, spec.Process.Capabilities); err != nil { + Fatalf("ApplyCaps, err: %v", err) + } + binPath, err := specutils.BinPath() + if err != nil { + Fatalf("%v", err) + } + + // Remove --apply-caps arg to call myself. 
+ var args []string + for _, arg := range os.Args { + if !strings.Contains(arg, "apply-caps") { + args = append(args, arg) + } + } + + log.Infof("Execve 'boot' again, bye!") + log.Infof("%s %v", binPath, args) + syscall.Exec(binPath, args, []string{}) +} diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go new file mode 100644 index 000000000..d4b834213 --- /dev/null +++ b/runsc/cmd/cmd.go @@ -0,0 +1,77 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cmd holds implementations of the runsc commands. +package cmd + +import ( + "fmt" + "os" + "strconv" + + "flag" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// Fatalf logs to stderr and exits with a failure status code. +func Fatalf(s string, args ...interface{}) { + // If runsc is being invoked by docker or cri-o, then we might not have + // access to stderr, so we log a serious-looking warning in addition to + // writing to stderr. + log.Warningf("FATAL ERROR: "+s, args...) + fmt.Fprintf(os.Stderr, s+"\n", args...) + // Return an error that is unlikely to be used by the application. + os.Exit(128) +} + +// commandLineFlags returns a slice of all top-level command line flags that +// have been set. +func commandLineFlags() []string { + var args []string + flag.CommandLine.Visit(func(f *flag.Flag) { + args = append(args, fmt.Sprintf("--%s=%s", f.Name, f.Value.String())) + }) + return args +} + +// intFlags can be used with int flags that appear multiple times. +type intFlags []int + +// String implements flag.Value. +func (i *intFlags) String() string { + return fmt.Sprintf("%v", *i) +} + +// Get implements flag.Value. +func (i *intFlags) Get() interface{} { + return i +} + +// GetArray returns array of FDs. +func (i *intFlags) GetArray() []int { + return *i +} + +// Set implements flag.Value. +func (i *intFlags) Set(s string) error { + fd, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("invalid flag value: %v", err) + } + if fd < 0 { + return fmt.Errorf("flag value must be greater than 0: %d", fd) + } + *i = append(*i, fd) + return nil +} diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go new file mode 100644 index 000000000..83cb09eb0 --- /dev/null +++ b/runsc/cmd/create.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
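The intFlags type in cmd.go (and the similar stringSlice type later in exec.go) exists so that flags such as --io-fds and --env can be repeated, with flag.Var collecting every occurrence in order. The same pattern, reduced to a self-contained program using only the standard library; the flag name and output are illustrative.

    package main

    import (
        "flag"
        "fmt"
        "strconv"
    )

    // intFlags mirrors runsc's repeated-FD flag: each occurrence of the flag
    // appends one non-negative integer, in command-line order.
    type intFlags []int

    // String implements flag.Value.
    func (i *intFlags) String() string { return fmt.Sprintf("%v", *i) }

    // Set implements flag.Value and is called once per occurrence of the flag.
    func (i *intFlags) Set(s string) error {
        fd, err := strconv.Atoi(s)
        if err != nil {
            return fmt.Errorf("invalid flag value: %v", err)
        }
        if fd < 0 {
            return fmt.Errorf("flag value must be non-negative: %d", fd)
        }
        *i = append(*i, fd)
        return nil
    }

    func main() {
        var fds intFlags
        flag.Var(&fds, "io-fds", "may be repeated; collects FDs in order")
        flag.Parse()
        fmt.Println("collected FDs:", []int(fds))
    }

Running it as "prog -io-fds 3 -io-fds 4" prints "collected FDs: [3 4]", which is the behavior the gofer and boot commands rely on to receive their 9P FDs in spec order.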
+ +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Create implements subcommands.Command for the "create" command. +type Create struct { + // bundleDir is the path to the bundle directory (defaults to the + // current working directory). + bundleDir string + + // pidFile is the filename that the sandbox pid will be written to. + // This file should only be created once the sandbox process is ready + // to use (i.e. control server has started and is listening). + pidFile string + + // consoleSocket is the path to an AF_UNIX socket which will receive a + // file descriptor referencing the master end of the console's + // pseudoterminal. This is ignored unless spec.Process.Terminal is + // true. + consoleSocket string +} + +// Name implements subcommands.Command.Name. +func (*Create) Name() string { + return "create" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Create) Synopsis() string { + return "create a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Create) Usage() string { + return `create [flags] - create a secure container +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Create) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") + f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") + f.StringVar(&c.pidFile, "pid-file", "", "filename that the sandbox pid will be written to") +} + +// Execute implements subcommands.Command.Execute. +func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + bundleDir := c.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + specutils.LogSpec(spec) + + // Create the sandbox process, passing additional command line + // arguments to the sandbox process. + if _, err := sandbox.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, commandLineFlags()); err != nil { + Fatalf("error creating sandbox: %v", err) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go new file mode 100644 index 000000000..a497c034d --- /dev/null +++ b/runsc/cmd/delete.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
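Every file in runsc/cmd follows the same github.com/google/subcommands contract: a struct implementing Name, Synopsis, Usage, SetFlags, and Execute, where Execute receives whatever extra values main passes to subcommands.Execute; runsc uses that channel to hand each command the *boot.Config and a *syscall.WaitStatus (the args[0]/args[1] type assertions above and below). A minimal toy command showing that shape; it assumes the library's Register/Execute API and passes a plain string where runsc passes its config.

    package main

    import (
        "context"
        "flag"
        "fmt"
        "os"

        "github.com/google/subcommands"
    )

    // noop is a toy command illustrating the shape runsc's commands follow.
    type noop struct{ verbose bool }

    func (*noop) Name() string     { return "noop" }
    func (*noop) Synopsis() string { return "do nothing (example only)" }
    func (*noop) Usage() string    { return "noop [flags]\n" }

    func (n *noop) SetFlags(f *flag.FlagSet) {
        f.BoolVar(&n.verbose, "verbose", false, "say that nothing is happening")
    }

    // Execute receives the values passed to subcommands.Execute below; runsc
    // passes *boot.Config and *syscall.WaitStatus here instead of a string.
    func (n *noop) Execute(_ context.Context, _ *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
        cfg := args[0].(string)
        if n.verbose {
            fmt.Printf("doing nothing with config %q\n", cfg)
        }
        return subcommands.ExitSuccess
    }

    func main() {
        subcommands.Register(&noop{}, "")
        flag.Parse()
        os.Exit(int(subcommands.Execute(context.Background(), "example-config")))
    }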
+ +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// Delete implements subcommands.Command for the "delete" command. +type Delete struct { + // force indicates that the sandbox should be terminated if running. + force bool +} + +// Name implements subcommands.Command.Name. +func (*Delete) Name() string { + return "delete" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Delete) Synopsis() string { + return "delete resources held by a container" +} + +// Usage implements subcommands.Command.Usage. +func (*Delete) Usage() string { + return `delete [flags] ` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (d *Delete) SetFlags(f *flag.FlagSet) { + f.BoolVar(&d.force, "force", false, "terminate sandbox if running") +} + +// Execute implements subcommands.Command.Execute. +func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() == 0 { + f.Usage() + return subcommands.ExitUsageError + } + + conf := args[0].(*boot.Config) + + for i := 0; i < f.NArg(); i++ { + id := f.Arg(i) + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox %q: %v", id, err) + } + if !d.force && (s.Status == sandbox.Running) { + Fatalf("cannot stop running sandbox without --force flag") + } + if err := s.Destroy(); err != nil { + Fatalf("error destroying sandbox: %v", err) + } + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go new file mode 100644 index 000000000..afd42c2f2 --- /dev/null +++ b/runsc/cmd/events.go @@ -0,0 +1,111 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "os" + "time" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// Events implements subcommands.Command for the "events" command. +type Events struct { + // The interval between stats reporting. + intervalSec int + // If true, events will print a single group of stats and exit. + stats bool +} + +// Name implements subcommands.Command.Name. +func (*Events) Name() string { + return "events" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Events) Synopsis() string { + return "display container events such as OOM notifications, cpu, memory, and IO usage statistics" +} + +// Usage implements subcommands.Command.Usage. +func (*Events) Usage() string { + return ` + +Where "" is the name for the instance of the container. + +The events command displays information about the container. By default the +information is displayed once every 5 seconds. + +OPTIONS: +` +} + +// SetFlags implements subcommands.Command.SetFlags. 
+func (evs *Events) SetFlags(f *flag.FlagSet) { + f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds") + f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit") +} + +// Execute implements subcommands.Command.Execute. +func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandox: %v", err) + } + + // Repeatedly get stats from the container. + for { + // Get the event and print it as JSON. + ev, err := s.Event() + if err != nil { + log.Warningf("error getting events for sandbox: %v", err) + } + // err must be preserved because it is used below when breaking + // out of the loop. + b, err := json.Marshal(ev) + if err != nil { + log.Warningf("error while marshalling event %v: %v", ev, err) + } else { + os.Stdout.Write(b) + } + + // If we're only running once, break. If we're only running + // once and there was an error, the command failed. + if evs.stats { + if err != nil { + return subcommands.ExitFailure + } + break + } + + time.Sleep(time.Duration(evs.intervalSec) * time.Second) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go new file mode 100644 index 000000000..8379f552d --- /dev/null +++ b/runsc/cmd/exec.go @@ -0,0 +1,375 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "strconv" + "strings" + "syscall" + "time" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Exec implements subcommands.Command for the "exec" command. +type Exec struct { + cwd string + env stringSlice + // user contains the UID and GID with which to run the new process. + user user + extraKGIDs stringSlice + caps stringSlice + detach bool + processPath string + pidFile string +} + +// Name implements subcommands.Command.Name. +func (*Exec) Name() string { + return "exec" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Exec) Synopsis() string { + return "execute new process inside the container" +} + +// Usage implements subcommands.Command.Usage. +func (*Exec) Usage() string { + return `exec [command options] [command options] || --process process.json + + +Where "" is the name for the instance of the container and +"" is the command to be executed in the container. 
+"" can't be empty unless a "-process" flag provided. + +EXAMPLE: +If the container is configured to run /bin/ps the following will +output a list of processes running in the container: + + # runc exec ps + +OPTIONS: +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (ex *Exec) SetFlags(f *flag.FlagSet) { + f.StringVar(&ex.cwd, "cwd", "", "current working directory") + f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')") + f.Var(&ex.user, "user", "UID (format: [:])") + f.Var(&ex.extraKGIDs, "additional-gids", "additional gids") + f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process") + f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") + f.StringVar(&ex.processPath, "process", "", "path to the process.json") + f.StringVar(&ex.pidFile, "pid-file", "", "filename that the sandbox pid will be written to") +} + +// Execute implements subcommands.Command.Execute. It starts a process in an +// already created sandbox. +func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + e, id, err := ex.parseArgs(f) + if err != nil { + Fatalf("error parsing process spec: %v", err) + } + e.Detach = ex.detach + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandox: %v", err) + } + + if e.WorkingDirectory == "" { + e.WorkingDirectory = s.Spec.Process.Cwd + } + + if e.Envv == nil { + e.Envv, err = resolveEnvs(s.Spec.Process.Env, ex.env) + if err != nil { + Fatalf("error getting environment variables: %v", err) + } + } + + // containerd expects an actual process to represent the container being + // executed. If detach was specified, starts a child in non-detach mode, + // write the child's PID to the pid file. So when the container returns, the + // child process will also return and signal containerd. + if e.Detach { + binPath, err := specutils.BinPath() + if err != nil { + Fatalf("error getting bin path: %v", err) + } + var args []string + for _, a := range os.Args[1:] { + if !strings.Contains(a, "detach") { + args = append(args, a) + } + } + cmd := exec.Command(binPath, args...) + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + Fatalf("failure to start child exec process, err: %v", err) + } + + log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args) + + // Wait for PID file to ensure that child process has started. Otherwise, + // '--process' file is deleted as soon as this process returns and the child + // may fail to read it. 
+ sleepTime := 10 * time.Millisecond + for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { + _, err := os.Stat(ex.pidFile) + if err == nil { + break + } + if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT { + Fatalf("unexpected error waiting for PID file, err: %v", err) + } + + log.Infof("Waiting for PID file to be created...") + time.Sleep(sleepTime) + sleepTime *= sleepTime * 2 + if sleepTime > 1*time.Second { + sleepTime = 1 * time.Second + } + } + *waitStatus = 0 + return subcommands.ExitSuccess + } + + if ex.pidFile != "" { + if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { + Fatalf("error writing pid file: %v", err) + } + } + + // Get the executable path, which is a bit tricky because we have to + // inspect the environment PATH which is relative to the root path. + // If the user is overriding environment variables, PATH may have been + // overwritten. + rootPath := s.Spec.Root.Path + e.Filename, err = specutils.GetExecutablePath(e.Argv[0], rootPath, e.Envv) + if err != nil { + Fatalf("error getting executable path: %v", err) + } + + ws, err := s.Execute(e) + if err != nil { + Fatalf("error getting processes for sandbox: %v", err) + } + *waitStatus = ws + return subcommands.ExitSuccess +} + +// parseArgs parses exec information from the command line or a JSON file +// depending on whether the --process flag was used. Returns an ExecArgs and +// the ID of the sandbox to be used. +func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) { + if ex.processPath == "" { + // Requires at least a container ID and command. + if f.NArg() < 2 { + f.Usage() + return nil, "", fmt.Errorf("both a container-id and command are required") + } + e, err := ex.argsFromCLI(f.Args()[1:]) + return e, f.Arg(0), err + } + // Requires only the container ID. + if f.NArg() != 1 { + f.Usage() + return nil, "", fmt.Errorf("a container-id is required") + } + e, err := ex.argsFromProcessFile() + return e, f.Arg(0), err +} + +func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { + extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs)) + for _, s := range ex.extraKGIDs { + kgid, err := strconv.Atoi(s) + if err != nil { + Fatalf("error parsing GID: %s, %v", s, err) + } + extraKGIDs = append(extraKGIDs, auth.KGID(kgid)) + } + + caps, err := capabilities(ex.caps) + if err != nil { + return nil, fmt.Errorf("capabilities error: %v", err) + } + + return &control.ExecArgs{ + Argv: argv, + WorkingDirectory: ex.cwd, + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: ex.user.kuid, + KGID: ex.user.kgid, + ExtraKGIDs: extraKGIDs, + Capabilities: caps, + }, nil +} + +func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) { + f, err := os.Open(ex.processPath) + if err != nil { + return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err) + } + defer f.Close() + var p specs.Process + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err) + } + return argsFromProcess(&p) +} + +// argsFromProcess performs all the non-IO conversion from the Process struct +// to ExecArgs. +func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { + // Create capabilities. + caps, err := specutils.Capabilities(p.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. 
+ extraKGIDs := make([]auth.KGID, 0, len(p.User.AdditionalGids)) + for _, GID := range p.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + return &control.ExecArgs{ + Argv: p.Args, + Envv: p.Env, + WorkingDirectory: p.Cwd, + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: auth.KUID(p.User.UID), + KGID: auth.KGID(p.User.GID), + ExtraKGIDs: extraKGIDs, + Capabilities: caps, + }, nil +} + +// resolveEnvs transforms lists of environment variables into a single list of +// environment variables. If a variable is defined multiple times, the last +// value is used. +func resolveEnvs(envs ...[]string) ([]string, error) { + // First create a map of variable names to values. This removes any + // duplicates. + envMap := make(map[string]string) + for _, env := range envs { + for _, str := range env { + parts := strings.SplitN(str, "=", 2) + if len(parts) != 2 { + return nil, fmt.Errorf("invalid variable: %s", str) + } + envMap[parts[0]] = parts[1] + } + } + // Reassemble envMap into a list of environment variables of the form + // NAME=VALUE. + env := make([]string, 0, len(envMap)) + for k, v := range envMap { + env = append(env, fmt.Sprintf("%s=%s", k, v)) + } + return env, nil +} + +// capabilities takes a list of capabilities as strings and returns an +// auth.TaskCapabilities struct with those capabilities in every capability set. +// This mimics runc's behavior. +func capabilities(cs []string) (*auth.TaskCapabilities, error) { + var specCaps specs.LinuxCapabilities + for _, cap := range cs { + specCaps.Ambient = append(specCaps.Ambient, cap) + specCaps.Bounding = append(specCaps.Bounding, cap) + specCaps.Effective = append(specCaps.Effective, cap) + specCaps.Inheritable = append(specCaps.Inheritable, cap) + specCaps.Permitted = append(specCaps.Permitted, cap) + } + return specutils.Capabilities(&specCaps) +} + +// stringSlice allows a flag to be used multiple times, where each occurrence +// adds a value to the flag. For example, a flag called "x" could be invoked +// via "runsc exec -x foo -x bar", and the corresponding stringSlice would be +// {"x", "y"}. +type stringSlice []string + +// String implements flag.Value.String. +func (ss *stringSlice) String() string { + return fmt.Sprintf("%v", *ss) +} + +// Get implements flag.Value.Get. +func (ss *stringSlice) Get() interface{} { + return ss +} + +// Set implements flag.Value.Set. +func (ss *stringSlice) Set(s string) error { + *ss = append(*ss, s) + return nil +} + +// user allows -user to convey a UID and, optionally, a GID separated by a +// colon. +type user struct { + kuid auth.KUID + kgid auth.KGID +} + +func (u *user) String() string { + return fmt.Sprintf("%+v", *u) +} + +func (u *user) Get() interface{} { + return u +} + +func (u *user) Set(s string) error { + parts := strings.SplitN(s, ":", 2) + kuid, err := strconv.Atoi(parts[0]) + if err != nil { + return fmt.Errorf("couldn't parse UID: %s", parts[0]) + } + u.kuid = auth.KUID(kuid) + if len(parts) > 1 { + kgid, err := strconv.Atoi(parts[1]) + if err != nil { + return fmt.Errorf("couldn't parse GID: %s", parts[1]) + } + u.kgid = auth.KGID(kgid) + } + return nil +} diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go new file mode 100644 index 000000000..623461e78 --- /dev/null +++ b/runsc/cmd/exec_test.go @@ -0,0 +1,154 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
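resolveEnvs in exec.go above merges the spec's environment with any -env overrides by building a map keyed on variable name, so a later list silently wins over an earlier one and the order of the merged slice is unspecified. The same logic as a standalone program with made-up inputs:

    package main

    import (
        "fmt"
        "strings"
    )

    // mergeEnvs restates exec.go's resolveEnvs: later lists override earlier
    // ones, and every entry must have the form NAME=VALUE.
    func mergeEnvs(envs ...[]string) ([]string, error) {
        envMap := make(map[string]string)
        for _, env := range envs {
            for _, str := range env {
                parts := strings.SplitN(str, "=", 2)
                if len(parts) != 2 {
                    return nil, fmt.Errorf("invalid variable: %s", str)
                }
                envMap[parts[0]] = parts[1]
            }
        }
        env := make([]string, 0, len(envMap))
        for k, v := range envMap {
            env = append(env, fmt.Sprintf("%s=%s", k, v))
        }
        return env, nil
    }

    func main() {
        specEnv := []string{"PATH=/usr/bin:/bin", "TERM=xterm"}
        cliEnv := []string{"TERM=dumb", "DEBUG=1"}
        merged, err := mergeEnvs(specEnv, cliEnv)
        if err != nil {
            panic(err)
        }
        // TERM resolves to "dumb"; slice order comes from map iteration and
        // is therefore unspecified, just as in resolveEnvs.
        fmt.Println(merged)
    }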
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "os" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +func TestUser(t *testing.T) { + testCases := []struct { + input string + want user + wantErr bool + }{ + {input: "0", want: user{kuid: 0, kgid: 0}}, + {input: "7", want: user{kuid: 7, kgid: 0}}, + {input: "49:343", want: user{kuid: 49, kgid: 343}}, + {input: "0:2401", want: user{kuid: 0, kgid: 2401}}, + {input: "", wantErr: true}, + {input: "foo", wantErr: true}, + {input: ":123", wantErr: true}, + {input: "1:2:3", wantErr: true}, + } + + for _, tc := range testCases { + var u user + if err := u.Set(tc.input); err != nil && tc.wantErr { + // We got an error and wanted one. + continue + } else if err == nil && tc.wantErr { + t.Errorf("user.Set(%s): got no error, but wanted one", tc.input) + } else if err != nil && !tc.wantErr { + t.Errorf("user.Set(%s): got error %v, but wanted none", tc.input, err) + } else if u != tc.want { + t.Errorf("user.Set(%s): got %+v, but wanted %+v", tc.input, u, tc.want) + } + } +} + +func TestCLIArgs(t *testing.T) { + testCases := []struct { + ex Exec + argv []string + expected control.ExecArgs + }{ + { + ex: Exec{ + cwd: "/foo/bar", + user: user{kuid: 0, kgid: 0}, + extraKGIDs: []string{"1", "2", "3"}, + caps: []string{"CAP_DAC_OVERRIDE"}, + processPath: "", + }, + argv: []string{"ls", "/"}, + expected: control.ExecArgs{ + Argv: []string{"ls", "/"}, + WorkingDirectory: "/foo/bar", + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: 0, + KGID: 0, + ExtraKGIDs: []auth.KGID{1, 2, 3}, + Capabilities: &auth.TaskCapabilities{ + BoundingCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + PermittedCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + }, + }, + }, + } + + for _, tc := range testCases { + e, err := tc.ex.argsFromCLI(tc.argv) + if err != nil { + t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err) + } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { + t.Errorf("argsFromCLI(%+v): got %+v, but expected %+v", tc.ex, *e, tc.expected) + } + } +} + +func TestJSONArgs(t *testing.T) { + testCases := []struct { + // ex is provided to make sure it is overridden by p. 
+ ex Exec + p specs.Process + expected control.ExecArgs + }{ + { + ex: Exec{ + cwd: "/baz/quux", + user: user{kuid: 1, kgid: 1}, + extraKGIDs: []string{"4", "5", "6"}, + caps: []string{"CAP_SETGID"}, + processPath: "/bin/foo", + }, + p: specs.Process{ + User: specs.User{UID: 0, GID: 0, AdditionalGids: []uint32{1, 2, 3}}, + Args: []string{"ls", "/"}, + Cwd: "/foo/bar", + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{"CAP_DAC_OVERRIDE"}, + Effective: []string{"CAP_DAC_OVERRIDE"}, + Inheritable: []string{"CAP_DAC_OVERRIDE"}, + Permitted: []string{"CAP_DAC_OVERRIDE"}, + }, + }, + expected: control.ExecArgs{ + Argv: []string{"ls", "/"}, + WorkingDirectory: "/foo/bar", + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: 0, + KGID: 0, + ExtraKGIDs: []auth.KGID{1, 2, 3}, + Capabilities: &auth.TaskCapabilities{ + BoundingCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + PermittedCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + }, + }, + }, + } + + for _, tc := range testCases { + e, err := argsFromProcess(&tc.p) + if err != nil { + t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err) + } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { + t.Errorf("argsFromProcess(%+v): got %+v, but expected %+v", tc.p, *e, tc.expected) + } + } +} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go new file mode 100644 index 000000000..844e16dbf --- /dev/null +++ b/runsc/cmd/gofer.go @@ -0,0 +1,134 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "sync" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/fsgofer" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Gofer implements subcommands.Command for the "gofer" command, which starts a +// filesystem gofer. This command should not be called directly. +type Gofer struct { + bundleDir string + ioFDs intFlags +} + +// Name implements subcommands.Command. +func (*Gofer) Name() string { + return "gofer" +} + +// Synopsis implements subcommands.Command. +func (*Gofer) Synopsis() string { + return "launch a gofer process that server files over 9P protocol (internal use only)" +} + +// Usage implements subcommands.Command. +func (*Gofer) Usage() string { + return `gofer [flags]` +} + +// SetFlags implements subcommands.Command. +func (g *Gofer) SetFlags(f *flag.FlagSet) { + f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") + f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") +} + +// Execute implements subcommands.Command. 
+func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if g.bundleDir == "" || len(g.ioFDs) < 1 { + f.Usage() + return subcommands.ExitUsageError + } + + spec, err := specutils.ReadSpec(g.bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + specutils.LogSpec(spec) + + // Start with root mount, then add any other addition mount as needed. + ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) + p := absPath(g.bundleDir, spec.Root.Path) + ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + ROMount: spec.Root.Readonly, + // Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when + // each file is opened as writable. Thus, we open files lazily to avoid copy-up. + LazyOpenForWrite: true, + })) + log.Infof("Serving %q mapped to %q on FD %d", "/", p, g.ioFDs[0]) + + mountIdx := 1 // first one is the root + for _, m := range spec.Mounts { + if specutils.Is9PMount(m) { + p = absPath(g.bundleDir, m.Source) + ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + ROMount: isReadonlyMount(m.Options), + LazyOpenForWrite: false, + })) + + if mountIdx >= len(g.ioFDs) { + Fatalf("No FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) + } + log.Infof("Serving %q mapped to %q on FD %d", m.Destination, p, g.ioFDs[mountIdx]) + mountIdx++ + } + } + if mountIdx != len(g.ioFDs) { + Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) + } + + runServers(ats, g.ioFDs) + return subcommands.ExitSuccess +} + +func runServers(ats []p9.Attacher, ioFDs []int) { + // Run the loops and wait for all to exit. + var wg sync.WaitGroup + for i, ioFD := range ioFDs { + wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + Fatalf("err creating server on FD %d: %v", ioFD, err) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err) + } + wg.Done() + }(ioFD, ats[i]) + } + wg.Wait() + log.Infof("All 9P servers exited.") +} + +func isReadonlyMount(opts []string) bool { + for _, o := range opts { + if o == "ro" { + return true + } + } + return false +} diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go new file mode 100644 index 000000000..f89e0077e --- /dev/null +++ b/runsc/cmd/kill.go @@ -0,0 +1,142 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "fmt" + "strconv" + "strings" + "syscall" + + "context" + "flag" + "github.com/google/subcommands" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// Kill implements subcommands.Command for the "kill" command. +type Kill struct{} + +// Name implements subcommands.Command.Name. +func (*Kill) Name() string { + return "kill" +} + +// Synopsis implements subcommands.Command.Synopsis. 
+func (*Kill) Synopsis() string {
+ return "sends a signal to the sandbox"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Kill) Usage() string {
+ return `kill <container id> [signal]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*Kill) SetFlags(f *flag.FlagSet) {
+ // TODO: Implement this flag. It is defined here just to
+ // prevent runsc from crashing if it is passed.
+ var all bool
+ f.BoolVar(&all, "all", false, "send the specified signal to all processes inside the container")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ if f.NArg() == 0 || f.NArg() > 2 {
+ f.Usage()
+ return subcommands.ExitUsageError
+ }
+
+ id := f.Arg(0)
+ conf := args[0].(*boot.Config)
+
+ s, err := sandbox.Load(conf.RootDir, id)
+ if err != nil {
+ Fatalf("error loading sandbox: %v", err)
+ }
+
+ // The OCI command-line spec says that the signal should be specified
+ // via a flag, but runc (and things that call runc) pass it as an
+ // argument.
+ signal := f.Arg(1)
+ if signal == "" {
+ signal = "TERM"
+ }
+
+ sig, err := parseSignal(signal)
+ if err != nil {
+ Fatalf("%v", err)
+ }
+ if err := s.Signal(sig); err != nil {
+ Fatalf("%v", err)
+ }
+ return subcommands.ExitSuccess
+}
+
+func parseSignal(s string) (syscall.Signal, error) {
+ n, err := strconv.Atoi(s)
+ if err == nil {
+ sig := syscall.Signal(n)
+ for _, msig := range signalMap {
+ if sig == msig {
+ return sig, nil
+ }
+ }
+ return -1, fmt.Errorf("unknown signal %q", s)
+ }
+ if sig, ok := signalMap[strings.TrimPrefix(strings.ToUpper(s), "SIG")]; ok {
+ return sig, nil
+ }
+ return -1, fmt.Errorf("unknown signal %q", s)
+}
+
+var signalMap = map[string]syscall.Signal{
+ "ABRT": unix.SIGABRT,
+ "ALRM": unix.SIGALRM,
+ "BUS": unix.SIGBUS,
+ "CHLD": unix.SIGCHLD,
+ "CLD": unix.SIGCLD,
+ "CONT": unix.SIGCONT,
+ "FPE": unix.SIGFPE,
+ "HUP": unix.SIGHUP,
+ "ILL": unix.SIGILL,
+ "INT": unix.SIGINT,
+ "IO": unix.SIGIO,
+ "IOT": unix.SIGIOT,
+ "KILL": unix.SIGKILL,
+ "PIPE": unix.SIGPIPE,
+ "POLL": unix.SIGPOLL,
+ "PROF": unix.SIGPROF,
+ "PWR": unix.SIGPWR,
+ "QUIT": unix.SIGQUIT,
+ "SEGV": unix.SIGSEGV,
+ "STKFLT": unix.SIGSTKFLT,
+ "STOP": unix.SIGSTOP,
+ "SYS": unix.SIGSYS,
+ "TERM": unix.SIGTERM,
+ "TRAP": unix.SIGTRAP,
+ "TSTP": unix.SIGTSTP,
+ "TTIN": unix.SIGTTIN,
+ "TTOU": unix.SIGTTOU,
+ "URG": unix.SIGURG,
+ "USR1": unix.SIGUSR1,
+ "USR2": unix.SIGUSR2,
+ "VTALRM": unix.SIGVTALRM,
+ "WINCH": unix.SIGWINCH,
+ "XCPU": unix.SIGXCPU,
+ "XFSZ": unix.SIGXFSZ,
+}
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
new file mode 100644
index 000000000..bf7cb41bb
--- /dev/null
+++ b/runsc/cmd/list.go
@@ -0,0 +1,117 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
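parseSignal accepts both symbolic names (with or without the SIG prefix, case-insensitive) and numeric values that map to a known signal. A test-style sketch of that behavior; this is illustrative only and not one of the tests in the change:

package cmd

import (
    "syscall"
    "testing"
)

func TestParseSignalSketch(t *testing.T) {
    for _, tc := range []struct {
        in   string
        want syscall.Signal
    }{
        {"TERM", syscall.SIGTERM},
        {"SIGTERM", syscall.SIGTERM},
        {"sigkill", syscall.SIGKILL},
        {"9", syscall.SIGKILL},
    } {
        got, err := parseSignal(tc.in)
        if err != nil {
            t.Errorf("parseSignal(%q) returned error: %v", tc.in, err)
            continue
        }
        if got != tc.want {
            t.Errorf("parseSignal(%q) = %v, want %v", tc.in, got, tc.want)
        }
    }
    // Unknown names and numbers outside signalMap are rejected.
    if _, err := parseSignal("NOTASIGNAL"); err == nil {
        t.Error("parseSignal(NOTASIGNAL) should have failed")
    }
}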
+
+package cmd
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "text/tabwriter"
+ "time"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// List implements subcommands.Command for the "list" command.
+type List struct {
+ quiet bool
+ format string
+}
+
+// Name implements subcommands.Command.Name.
+func (*List) Name() string {
+ return "list"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*List) Synopsis() string {
+ return "list containers started by runsc with the given root"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*List) Usage() string {
+ return `list [flags]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (l *List) SetFlags(f *flag.FlagSet) {
+ f.BoolVar(&l.quiet, "quiet", false, "only list container ids")
+ f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ if f.NArg() != 0 {
+ f.Usage()
+ return subcommands.ExitUsageError
+ }
+
+ conf := args[0].(*boot.Config)
+ ids, err := sandbox.List(conf.RootDir)
+ if err != nil {
+ Fatalf("%v", err)
+ }
+
+ if l.quiet {
+ for _, id := range ids {
+ fmt.Println(id)
+ }
+ return subcommands.ExitSuccess
+ }
+
+ // Collect the sandboxes.
+ var sandboxes []*sandbox.Sandbox
+ for _, id := range ids {
+ s, err := sandbox.Load(conf.RootDir, id)
+ if err != nil {
+ Fatalf("error loading sandbox %q: %v", id, err)
+ }
+ sandboxes = append(sandboxes, s)
+ }
+
+ switch l.format {
+ case "text":
+ // Print a nice table.
+ w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
+ fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
+ for _, s := range sandboxes {
+ fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n",
+ s.ID,
+ s.Pid,
+ s.Status,
+ s.BundleDir,
+ s.CreatedAt.Format(time.RFC3339Nano),
+ s.Owner)
+ }
+ w.Flush()
+ case "json":
+ // Print just the states.
+ var states []specs.State
+ for _, s := range sandboxes {
+ states = append(states, s.State())
+ }
+ if err := json.NewEncoder(os.Stdout).Encode(states); err != nil {
+ Fatalf("error marshaling sandbox state: %v", err)
+ }
+ default:
+ Fatalf("unknown list format %q", l.format)
+ }
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go
new file mode 100644
index 000000000..4bb1dbb4f
--- /dev/null
+++ b/runsc/cmd/path.go
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "os"
+ "path/filepath"
+)
+
+// absPath turns the given path into an absolute path (if it is not already
+// absolute) by prepending the base path.
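With --format=json, list writes a JSON array of OCI specs.State objects, which makes the output easy to consume from other tooling. A hypothetical consumer might look like the sketch below; the command path and root directory are assumptions:

package main

import (
    "encoding/json"
    "fmt"
    "os/exec"

    specs "github.com/opencontainers/runtime-spec/specs-go"
)

// listContainers shells out to "runsc list --format=json" and decodes the
// resulting array of OCI container states.
func listContainers(rootDir string) ([]specs.State, error) {
    out, err := exec.Command("runsc", "--root", rootDir, "list", "--format=json").Output()
    if err != nil {
        return nil, err
    }
    var states []specs.State
    if err := json.Unmarshal(out, &states); err != nil {
        return nil, err
    }
    return states, nil
}

func main() {
    states, err := listContainers("/var/run/runsc") // example root directory
    if err != nil {
        panic(err)
    }
    for _, s := range states {
        fmt.Printf("%s\t%s\t%d\n", s.ID, s.Status, s.Pid)
    }
}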
+func absPath(base, rel string) string {
+ if filepath.IsAbs(rel) {
+ return rel
+ }
+ return filepath.Join(base, rel)
+}
+
+// getwdOrDie returns the current working directory and dies if it cannot.
+func getwdOrDie() string {
+ wd, err := os.Getwd()
+ if err != nil {
+ Fatalf("error getting current working directory: %v", err)
+ }
+ return wd
+}
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
new file mode 100644
index 000000000..a667ec04c
--- /dev/null
+++ b/runsc/cmd/ps.go
@@ -0,0 +1,86 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// PS implements subcommands.Command for the "ps" command.
+type PS struct {
+ format string
+}
+
+// Name implements subcommands.Command.Name.
+func (*PS) Name() string {
+ return "ps"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*PS) Synopsis() string {
+ return "ps displays the processes running inside a container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*PS) Usage() string {
+ return "<container id> [ps options]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (ps *PS) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ if f.NArg() != 1 {
+ f.Usage()
+ return subcommands.ExitUsageError
+ }
+
+ id := f.Arg(0)
+ conf := args[0].(*boot.Config)
+
+ s, err := sandbox.Load(conf.RootDir, id)
+ if err != nil {
+ Fatalf("error loading sandbox: %v", err)
+ }
+ pList, err := s.Processes()
+ if err != nil {
+ Fatalf("error getting processes for sandbox: %v", err)
+ }
+
+ switch ps.format {
+ case "table":
+ fmt.Println(control.ProcessListToTable(pList))
+ case "json":
+ o, err := control.PrintPIDsJSON(pList)
+ if err != nil {
+ Fatalf("error generating JSON: %v", err)
+ }
+ fmt.Println(o)
+ default:
+ Fatalf("Unsupported format: %s", ps.format)
+ }
+
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
new file mode 100644
index 000000000..a61a6c73e
--- /dev/null
+++ b/runsc/cmd/run.go
@@ -0,0 +1,82 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "syscall" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Run implements subcommands.Command for the "run" command. +type Run struct { + // Run flags are a super-set of those for Create. + Create +} + +// Name implements subcommands.Command.Name. +func (*Run) Name() string { + return "run" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Run) Synopsis() string { + return "create and run a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Run) Usage() string { + return `run [flags] - create and run a secure container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Run) SetFlags(f *flag.FlagSet) { + r.Create.SetFlags(f) +} + +// Execute implements subcommands.Command.Execute. +func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + bundleDir := r.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + + ws, err := sandbox.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, commandLineFlags()) + if err != nil { + Fatalf("error running sandbox: %v", err) + } + + *waitStatus = ws + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go new file mode 100644 index 000000000..a8e132497 --- /dev/null +++ b/runsc/cmd/start.go @@ -0,0 +1,64 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// Start implements subcommands.Command for the "start" command. +type Start struct{} + +// Name implements subcommands.Command.Name. +func (*Start) Name() string { + return "start" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Start) Synopsis() string { + return "start a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Start) Usage() string { + return `start - start a secure container.` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*Start) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. 
+func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox: %v", err) + } + if err := s.Start(conf); err != nil { + Fatalf("error starting sandbox: %v", err) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go new file mode 100644 index 000000000..0b47f290a --- /dev/null +++ b/runsc/cmd/state.go @@ -0,0 +1,73 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "os" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// State implements subcommands.Command for the "state" command. +type State struct{} + +// Name implements subcommands.Command.Name. +func (*State) Name() string { + return "state" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*State) Synopsis() string { + return "get the state of a sandbox" +} + +// Usage implements subcommands.Command.Usage. +func (*State) Usage() string { + return `state [flags] - get the state of a sandbox` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*State) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox: %v", err) + } + log.Debugf("Returning state %+v", s) + + // Write json-encoded state directly to stdout. 
+ b, err := json.MarshalIndent(s.State(), "", " ") + if err != nil { + Fatalf("error marshaling sandbox state: %v", err) + } + os.Stdout.Write(b) + return subcommands.ExitSuccess +} diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD new file mode 100644 index 000000000..24e172f48 --- /dev/null +++ b/runsc/fsgofer/BUILD @@ -0,0 +1,33 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "fsgofer", + srcs = [ + "fsgofer.go", + "fsgofer_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/fsgofer", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/fd", + "//pkg/log", + "//pkg/p9", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "fsgofer_test", + size = "small", + srcs = ["fsgofer_test.go"], + embed = [":fsgofer"], + deps = [ + "//pkg/log", + "//pkg/p9", + ], +) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go new file mode 100644 index 000000000..5ddc75a9d --- /dev/null +++ b/runsc/fsgofer/fsgofer.go @@ -0,0 +1,937 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsgofer implements p9.File giving access to local files using +// a simple mapping from a path prefix that is added to the path requested +// by the sandbox. Ex: +// +// prefix: "/docker/imgs/alpine" +// app path: /bin/ls => /docker/imgs/alpine/bin/ls +package fsgofer + +import ( + "fmt" + "io" + "math" + "os" + "path" + "path/filepath" + "strings" + "sync" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" +) + +const ( + // invalidMode is set to a value that doesn't match any other valid + // modes to ensure an unopened/closed file fails all mode checks. + invalidMode = p9.OpenFlags(math.MaxUint32) + + openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC +) + +type fileType int + +const ( + regular fileType = iota + directory + symlink +) + +// String implements fmt.Stringer. +func (f fileType) String() string { + switch f { + case regular: + return "regular" + case directory: + return "directory" + case symlink: + return "symlink" + } + return "unknown" +} + +// Config sets configuration options for each attach point. +type Config struct { + // ROMount is set to true if this is a readonly mount. + ROMount bool + + // LazyOpenForWrite makes the underlying file to be opened in RDONLY + // mode initially and be reopened in case write access is desired. + // This is done to workaround the behavior in 'overlay2' that + // copies the entire file up eagerly when it's opened in write mode + // even if the file is never actually written to. + LazyOpenForWrite bool +} + +type attachPoint struct { + prefix string + conf Config +} + +// NewAttachPoint creates a new attacher that gives local file +// access to all files under 'prefix'. 
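The package comment above describes a pure path-prefix mapping. A minimal sketch of creating and attaching such an attach point follows; the prefix is the example path from the package comment, and error handling is trimmed:

package main

import (
    "fmt"

    "gvisor.googlesource.com/gvisor/runsc/fsgofer"
)

func main() {
    // Everything the sandbox asks for is resolved under this prefix, so an
    // app path of /bin/ls maps to /docker/imgs/alpine/bin/ls on the host.
    a := fsgofer.NewAttachPoint("/docker/imgs/alpine", fsgofer.Config{
        ROMount:          true,
        LazyOpenForWrite: false,
    })

    root, err := a.Attach("/")
    if err != nil {
        fmt.Println("attach failed:", err)
        return
    }
    defer root.Close()
    fmt.Println("attached 9P root for /docker/imgs/alpine")
}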
+func NewAttachPoint(prefix string, c Config) p9.Attacher {
+ return &attachPoint{prefix: prefix, conf: c}
+}
+
+// Attach implements p9.Attacher.
+func (a *attachPoint) Attach(appPath string) (p9.File, error) {
+ if !path.IsAbs(appPath) {
+ return nil, fmt.Errorf("invalid path %q", appPath)
+ }
+
+ root := filepath.Join(a.prefix, appPath)
+ f, err := os.OpenFile(root, openFlags|syscall.O_RDONLY, 0)
+ if err != nil {
+ return nil, fmt.Errorf("unable to open file %q, err: %v", root, err)
+ }
+ stat, err := stat(int(f.Fd()))
+ if err != nil {
+ return nil, fmt.Errorf("failed to stat file %q, err: %v", root, err)
+ }
+ return newLocalFile(a.conf, f, root, stat)
+}
+
+func makeQID(stat syscall.Stat_t) p9.QID {
+ return p9.QID{
+ Type: p9.FileMode(stat.Mode).QIDType(),
+ Path: stat.Ino,
+ }
+}
+
+func isNameValid(name string) bool {
+ if name == "" || name == "." || name == ".." {
+ log.Warningf("Invalid name: %s", name)
+ return false
+ }
+ if strings.IndexByte(name, '/') >= 0 {
+ log.Warningf("Invalid name: %s", name)
+ return false
+ }
+ return true
+}
+
+// localFile implements p9.File wrapping a local file. The underlying file
+// is opened during Walk() and stored in 'controlFile' to be used with other
+// operations. The mode in which the file is opened varies depending on the
+// configuration (see below). 'controlFile' is dup'ed when Walk(nil) is called
+// to clone the file.
+//
+// 'openedFile' is assigned when Open() is called. If the requested open mode
+// is a subset of controlFile's mode, it's possible to use the same file. If the
+// mode is not a subset, then another file is opened. Consequently, 'openedFile'
+// could have a mode wider than requested and must be verified before read/write
+// operations. Before the file is opened and after it's closed, 'mode' is set to
+// an invalid value to prevent an unopened file from being used.
+//
+// localFile has 2 modes of operation based on the configuration:
+//
+// ** conf.LazyOpenForWrite == false **
+// This is the preferred mode. 'controlFile' is opened in RW mode in Walk()
+// and used across all functions. The file is never reopened as the mode will
+// always be a super set of the requested open mode. This reduces the number of
+// syscalls required per operation and makes it resilient to renames anywhere
+// in the path to the file.
+//
+// ** conf.LazyOpenForWrite == true **
+// This mode is used for better performance with the 'overlay2' storage driver.
+// overlay2 eagerly copies the entire file up when it's opened in write mode,
+// which makes the mode above perform badly when several files are opened only
+// for reading (especially during startup). In this mode, 'controlFile' is opened
+// as readonly (or O_PATH for symlinks). Reopening the file is required if write
+// mode is requested in Open().
+type localFile struct {
+ p9.DefaultWalkGetAttr
+
+ // mu protects 'hostPath' when file is renamed.
+ mu sync.Mutex
+
+ // TODO: hostPath is not safe to use as the path needs to be walked
+ // every time (and can change underneath us). Remove all usages.
+ hostPath string
+
+ // controlFile is opened when localFile is created and it's never nil.
+ controlFile *os.File
+
+ // openedFile is nil until localFile is opened. It may point to controlFile
+ // or be a new file struct. See struct comment for more details.
+ openedFile *os.File
+
+ // mode is the mode in which the file was opened. Set to invalidMode
+ // if localFile isn't opened.
+ mode p9.OpenFlags
+
+ ft fileType
+
+ conf Config
+
+ // readDirMu protects against concurrent Readdir calls.
+ readDirMu sync.Mutex +} + +func openAnyFile(parent *localFile, name string) (*os.File, string, error) { + // Attempt to open file in the following mode in order: + // 1. RDWR: for files with rw mounts and LazyOpenForWrite disabled + // 2. RDONLY: for directories, ro mounts or LazyOpenForWrite enabled + // 3. PATH: for symlinks + modes := []int{syscall.O_RDWR, syscall.O_RDONLY, unix.O_PATH} + symlinkIdx := len(modes) - 1 + + startIdx := 0 + if parent.conf.ROMount || parent.conf.LazyOpenForWrite { + // Skip attempt to open in RDWR based on configuration. + startIdx = 1 + } + + var err error + var fd int + for i := startIdx; i < len(modes); i++ { + fd, err = syscall.Openat(parent.controlFD(), name, openFlags|modes[i], 0) + if err == nil { + // openat succeeded, we're done. + break + } + switch e := extractErrno(err); e { + case syscall.ENOENT: + // File doesn't exist, no point in retrying. + return nil, "", e + case syscall.ELOOP: + if i < symlinkIdx { + // File was opened with O_NOFOLLOW, so this error can only happen when + // trying ot open a symlink. Jump straight to flags compatible with symlink. + i = symlinkIdx - 1 + } + } + // openat failed. Try again with next mode, preserving 'err' in + // case this was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|modes[i], parent.controlFile.Name(), name, err) + } + if err != nil { + // All attempts to open file have failed, return the last error. + log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.controlFile.Name(), name, err) + return nil, "", extractErrno(err) + } + + parent.mu.Lock() + defer parent.mu.Unlock() + newPath := path.Join(parent.hostPath, name) + + return os.NewFile(uintptr(fd), newPath), newPath, nil +} + +func newLocalFile(conf Config, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) { + var ft fileType + switch stat.Mode & syscall.S_IFMT { + case syscall.S_IFREG: + ft = regular + case syscall.S_IFDIR: + ft = directory + case syscall.S_IFLNK: + ft = symlink + default: + return nil, syscall.EINVAL + } + return &localFile{ + hostPath: path, + controlFile: file, + conf: conf, + mode: invalidMode, + ft: ft, + }, nil +} + +// newFDMaybe creates a fd.FD from a file, dup'ing the FD and setting it as +// non-blocking. If anything fails, returns nil. It's better to have a file +// without host FD, than to fail the operation. +func newFDMaybe(file *os.File) *fd.FD { + fd, err := fd.NewFromFile(file) + if err != nil { + return nil + } + + // fd is blocking; non-blocking is required. + if err := syscall.SetNonblock(fd.FD(), true); err != nil { + fd.Close() + return nil + } + return fd +} + +func stat(fd int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Fstat(fd, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func fchown(fd int, uid p9.UID, gid p9.GID) error { + return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) +} + +func (l *localFile) controlFD() int { + return int(l.controlFile.Fd()) +} + +func (l *localFile) openedFD() int { + if l.openedFile == nil { + panic(fmt.Sprintf("trying to use an unopened file: %q", l.controlFile.Name())) + } + return int(l.openedFile.Fd()) +} + +// Open implements p9.File. 
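The struct comment above implies a strict lifecycle: Walk yields a file backed only by controlFile, Open sets openedFile and mode, and read/write calls fail with EBADF until then. A sketch of that sequence through the p9.File interface, assuming a host path such as /etc/hostname exists:

package main

import (
    "fmt"

    "gvisor.googlesource.com/gvisor/pkg/p9"
    "gvisor.googlesource.com/gvisor/runsc/fsgofer"
)

func readEtcHostname() ([]byte, error) {
    a := fsgofer.NewAttachPoint("/", fsgofer.Config{ROMount: true})
    root, err := a.Attach("/")
    if err != nil {
        return nil, err
    }
    defer root.Close()

    // Walk creates a new, unopened file; ReadAt would fail with EBADF here.
    _, f, err := root.Walk([]string{"etc", "hostname"})
    if err != nil {
        return nil, err
    }
    defer f.Close()

    // Open sets the mode; only then are ReadAt/WriteAt allowed.
    if _, _, _, err := f.Open(p9.ReadOnly); err != nil {
        return nil, err
    }
    buf := make([]byte, 256)
    n, err := f.ReadAt(buf, 0)
    if err != nil {
        return nil, err
    }
    return buf[:n], nil
}

func main() {
    b, err := readEtcHostname()
    if err != nil {
        fmt.Println("error:", err)
        return
    }
    fmt.Printf("%s", b)
}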
+func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + if l.openedFile != nil { + panic(fmt.Sprintf("attempting to open already opened file: %q", l.controlFile.Name())) + } + + // Check if control file can be used or if a new open must be created. + var newFile *os.File + if mode == p9.ReadOnly || !l.conf.LazyOpenForWrite { + log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name()) + newFile = l.controlFile + } else { + // Ideally reopen would call name_to_handle_at (with empty name) and open_by_handle_at + // to reopen the file without using 'hostPath'. However, name_to_handle_at and + // open_by_handle_at aren't supported by overlay2. + log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) + var err error + + l.mu.Lock() + newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) + if err != nil { + l.mu.Unlock() + return nil, p9.QID{}, 0, extractErrno(err) + } + l.mu.Unlock() + } + + stat, err := stat(int(newFile.Fd())) + if err != nil { + newFile.Close() + return nil, p9.QID{}, 0, extractErrno(err) + } + + var fd *fd.FD + if stat.Mode&syscall.S_IFMT == syscall.S_IFREG { + // Donate FD for regular files only. + fd = newFDMaybe(newFile) + } + + // Set fields on success + l.openedFile = newFile + l.mode = mode + return fd, makeQID(stat), 0, nil +} + +// Create implements p9.File. +func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { + if l.conf.ROMount { + return nil, nil, p9.QID{}, 0, syscall.EBADF + } + if !isNameValid(name) { + return nil, nil, p9.QID{}, 0, syscall.EINVAL + } + + // Use a single file for both 'controlFile' and 'openedFile'. Mode must include read for control + // and whichever else was requested by caller. Note that resulting file might have a wider mode + // than needed for each particular case. + flags := openFlags | syscall.O_CREAT | syscall.O_EXCL + if mode == p9.WriteOnly { + flags |= syscall.O_RDWR + } else { + flags |= mode.OSFlags() + } + + fd, err := syscall.Openat(l.controlFD(), name, flags, uint32(perm.Permissions())) + if err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + if err := fchown(fd, uid, gid); err != nil { + syscall.Close(fd) + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + stat, err := stat(fd) + if err != nil { + syscall.Close(fd) + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + cPath := path.Join(l.hostPath, name) + f := os.NewFile(uintptr(fd), cPath) + c := &localFile{ + hostPath: cPath, + controlFile: f, + openedFile: f, + mode: mode, + conf: l.conf, + } + return newFDMaybe(c.openedFile), c, makeQID(stat), 0, nil +} + +// Mkdir implements p9.File. +func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + if l.conf.ROMount { + return p9.QID{}, syscall.EBADF + } + + if !isNameValid(name) { + return p9.QID{}, syscall.EINVAL + } + + if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { + return p9.QID{}, extractErrno(err) + } + + // Open directory to change ownership and stat it. 
+ flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags + fd, err := syscall.Openat(l.controlFD(), name, flags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer syscall.Close(fd) + + if err := fchown(fd, uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := stat(fd) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return makeQID(stat), nil +} + +// Walk implements p9.File. +func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { + // Duplicate current file if 'names' is empty. + if len(names) == 0 { + newFd, err := syscall.Dup(l.controlFD()) + if err != nil { + return nil, nil, extractErrno(err) + } + stat, err := stat(newFd) + if err != nil { + syscall.Close(newFd) + return nil, nil, extractErrno(err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + c := &localFile{ + hostPath: l.hostPath, + controlFile: os.NewFile(uintptr(newFd), l.hostPath), + mode: invalidMode, + conf: l.conf, + } + return []p9.QID{makeQID(stat)}, c, nil + } + + var qids []p9.QID + last := l + for _, name := range names { + if !isNameValid(name) { + return nil, nil, syscall.EINVAL + } + + f, path, err := openAnyFile(last, name) + if err != nil { + return nil, nil, extractErrno(err) + } + stat, err := stat(int(f.Fd())) + if err != nil { + return nil, nil, extractErrno(err) + } + c, err := newLocalFile(last.conf, f, path, stat) + if err != nil { + return nil, nil, extractErrno(err) + } + + qids = append(qids, makeQID(stat)) + last = c + } + return qids, last, nil +} + +// StatFS implements p9.File. +func (l *localFile) StatFS() (p9.FSStat, error) { + var s syscall.Statfs_t + if err := syscall.Fstatfs(l.controlFD(), &s); err != nil { + return p9.FSStat{}, extractErrno(err) + } + + // Populate with what's available. + return p9.FSStat{ + Type: uint32(s.Type), + BlockSize: uint32(s.Bsize), + Blocks: s.Blocks, + BlocksFree: s.Bfree, + BlocksAvailable: s.Bavail, + Files: s.Files, + FilesFree: s.Ffree, + NameLength: uint32(s.Namelen), + }, nil +} + +// FSync implements p9.File. +func (l *localFile) FSync() error { + if l.openedFile == nil { + return syscall.EBADF + } + if err := l.openedFile.Sync(); err != nil { + return extractErrno(err) + } + return nil +} + +// GetAttr implements p9.File. +func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + stat, err := stat(l.controlFD()) + if err != nil { + return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) + } + + attr := p9.Attr{ + Mode: p9.FileMode(stat.Mode), + UID: p9.UID(stat.Uid), + GID: p9.GID(stat.Gid), + NLink: stat.Nlink, + RDev: stat.Rdev, + Size: uint64(stat.Size), + BlockSize: uint64(stat.Blksize), + Blocks: uint64(stat.Blocks), + ATimeSeconds: uint64(stat.Atim.Sec), + ATimeNanoSeconds: uint64(stat.Atim.Nsec), + MTimeSeconds: uint64(stat.Mtim.Sec), + MTimeNanoSeconds: uint64(stat.Mtim.Nsec), + CTimeSeconds: uint64(stat.Ctim.Sec), + CTimeNanoSeconds: uint64(stat.Ctim.Nsec), + } + valid := p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + NLink: true, + RDev: true, + Size: true, + Blocks: true, + ATime: true, + MTime: true, + CTime: true, + } + + return makeQID(stat), valid, attr, nil +} + +// SetAttr implements p9.File. Due to mismatch in file API, options +// cannot be changed atomicaly and user may see partial changes when +// an error happens. 
+func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { + if l.conf.ROMount { + return syscall.EBADF + } + + allowed := p9.SetAttrMask{ + Permissions: true, + UID: true, + GID: true, + Size: true, + ATime: true, + MTime: true, + ATimeNotSystemTime: true, + MTimeNotSystemTime: true, + } + + if valid.Empty() { + // Nothing to do. + return nil + } + + // Handle all the sanity checks up front so that the client gets a + // consistent result that is not attribute dependent. + if !valid.IsSubsetOf(allowed) { + log.Warningf("SetAttr() failed for %q, mask: %v", l.controlFile.Name(), valid) + return syscall.EPERM + } + + fd := l.controlFD() + if l.conf.LazyOpenForWrite && l.ft == regular { + // Regular files are opened in RO mode when lazy open is set. + // Thus it needs to be reopened here for write. + f, err := os.OpenFile(l.hostPath, openFlags|os.O_WRONLY, 0) + if err != nil { + return extractErrno(err) + } + defer f.Close() + fd = int(f.Fd()) + } + + // The semantics are to either return an error if no changes were made, + // or no error if *all* changes were made. Well, this can be impossible + // if the filesystem rejects at least one of the changes, especially + // since some operations are not easy to undo atomically. + // + // This could be made better if SetAttr actually returned the changes + // it did make, so the client can at least know what has changed. So + // we at least attempt to make all of the changes and return a generic + // error if any of them fails, which at least doesn't bias any change + // over another. + var err error + if valid.Permissions { + if cerr := syscall.Fchmod(fd, uint32(attr.Permissions)); cerr != nil { + log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr) + err = extractErrno(cerr) + } + } + + if valid.Size { + if terr := syscall.Ftruncate(fd, int64(attr.Size)); terr != nil { + log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + + if valid.ATime || valid.MTime { + utimes := [2]syscall.Timespec{ + syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, + syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, + } + if valid.ATime { + if valid.ATimeNotSystemTime { + utimes[0].Sec = int64(attr.ATimeSeconds) + utimes[0].Nsec = int64(attr.ATimeNanoSeconds) + } else { + utimes[0].Nsec = linux.UTIME_NOW + } + } + if valid.MTime { + if valid.MTimeNotSystemTime { + utimes[1].Sec = int64(attr.MTimeSeconds) + utimes[1].Nsec = int64(attr.MTimeNanoSeconds) + } else { + utimes[1].Nsec = linux.UTIME_NOW + } + } + + if l.ft == symlink { + // utimensat operates different that other syscalls. To operate on a + // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty + // name. + f, err := os.OpenFile(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) + if err != nil { + return extractErrno(err) + } + defer f.Close() + + if terr := utimensat(int(f.Fd()), path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } else { + // Directories and regular files can operate directly on the fd + // using empty name. 
+ if terr := utimensat(fd, "", utimes, 0); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + } + + if valid.UID || valid.GID { + uid := -1 + if valid.UID { + uid = int(attr.UID) + } + gid := -1 + if valid.GID { + gid = int(attr.GID) + } + if oerr := syscall.Fchownat(fd, "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { + log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr) + err = extractErrno(oerr) + } + } + + return err +} + +// Remove implements p9.File. +// +// This is deprecated in favor of UnlinkAt. +func (*localFile) Remove() error { + return syscall.ENOSYS +} + +// Rename implements p9.File. +func (l *localFile) Rename(directory p9.File, name string) error { + if l.conf.ROMount { + return syscall.EBADF + } + if !isNameValid(name) { + return syscall.EINVAL + } + + l.mu.Lock() + defer l.mu.Unlock() + + // TODO: change to renameat(2) + parent := directory.(*localFile) + newPath := path.Join(parent.hostPath, name) + if err := os.Rename(l.hostPath, newPath); err != nil { + return extractErrno(err) + } + + // Update path on success. + // TODO: this doesn't cover cases where any of the + // parents have been renamed. + l.hostPath = newPath + return nil +} + +// RenameAt implements p9.File.RenameAt. +// +// Code still uses [deprecated] Rename(). +func (*localFile) RenameAt(_ string, _ p9.File, _ string) error { + return syscall.ENOSYS +} + +// ReadAt implements p9.File. +func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if l.openedFile == nil { + return 0, syscall.EBADF + } + + r, err := l.openedFile.ReadAt(p, int64(offset)) + switch err { + case nil, io.EOF: + return r, nil + default: + return r, extractErrno(err) + } +} + +// WriteAt implements p9.File. +func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if l.openedFile == nil { + return 0, syscall.EBADF + } + + w, err := l.openedFile.WriteAt(p, int64(offset)) + if err != nil { + return w, extractErrno(err) + } + return w, nil +} + +// Symlink implements p9.File. +func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + if l.conf.ROMount { + return p9.QID{}, syscall.EBADF + } + if !isNameValid(newName) { + return p9.QID{}, syscall.EINVAL + } + + if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { + return p9.QID{}, extractErrno(err) + } + + // Open symlink to change ownership and stat it. + fd, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer syscall.Close(fd) + + if err := fchown(fd, uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := stat(fd) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return makeQID(stat), nil +} + +// Link implements p9.File. +func (l *localFile) Link(target p9.File, newName string) error { + if l.conf.ROMount { + return syscall.EBADF + } + if !isNameValid(newName) { + return syscall.EINVAL + } + + targetFile := target.(*localFile) + if err := unix.Linkat(targetFile.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH); err != nil { + return extractErrno(err) + } + return nil +} + +// Mknod implements p9.File. +// +// Not implemented. 
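Link above relies on linkat(2) with an empty old path and AT_EMPTY_PATH, which creates a hard link to the file behind an already-open FD; the kernel only allows that form with CAP_DAC_READ_SEARCH (the tests later in this change skip the Link case when not run as root for the same reason). A standalone sketch of that pattern, with made-up paths:

package main

import (
    "fmt"
    "os"

    "golang.org/x/sys/unix"
)

// linkByFD creates newPath as a hard link to the already-open file f,
// without naming the original path again. Requires CAP_DAC_READ_SEARCH.
func linkByFD(f *os.File, newPath string) error {
    return unix.Linkat(int(f.Fd()), "", unix.AT_FDCWD, newPath, unix.AT_EMPTY_PATH)
}

func main() {
    f, err := os.Open("/tmp/source-file") // illustrative path
    if err != nil {
        fmt.Println(err)
        return
    }
    defer f.Close()
    if err := linkByFD(f, "/tmp/linked-file"); err != nil {
        fmt.Println("linkat failed (needs CAP_DAC_READ_SEARCH):", err)
    }
}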
+func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { + return p9.QID{}, syscall.ENOSYS +} + +// UnlinkAt implements p9.File. +func (l *localFile) UnlinkAt(name string, flags uint32) error { + if l.conf.ROMount { + return syscall.EBADF + } + if !isNameValid(name) { + return syscall.EINVAL + } + if err := unix.Unlinkat(l.controlFD(), name, int(flags)); err != nil { + return extractErrno(err) + } + return nil +} + +// Readdir implements p9.File. +func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return nil, syscall.EBADF + } + if l.openedFile == nil { + return nil, syscall.EBADF + } + + // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's + // reading all directory contents. Take a lock because this operation is stateful. + l.readDirMu.Lock() + if _, err := l.openedFile.Seek(0, 0); err != nil { + l.readDirMu.Unlock() + return nil, extractErrno(err) + } + names, err := l.openedFile.Readdirnames(-1) + if err != nil { + l.readDirMu.Unlock() + return nil, extractErrno(err) + } + l.readDirMu.Unlock() + + var dirents []p9.Dirent + for i := int(offset); i >= 0 && i < len(names); i++ { + stat, err := statAt(l.openedFD(), names[i]) + if err != nil { + continue + } + qid := makeQID(stat) + dirents = append(dirents, p9.Dirent{ + QID: qid, + Type: qid.Type, + Name: names[i], + Offset: uint64(i + 1), + }) + } + return dirents, nil +} + +// Readlink implements p9.File. +func (l *localFile) Readlink() (string, error) { + // Shamelessly stolen from os.Readlink (added upper bound limit to buffer). + for len := 128; len < 1024*1024; len *= 2 { + b := make([]byte, len) + n, err := unix.Readlinkat(l.controlFD(), "", b) + if err != nil { + return "", extractErrno(err) + } + if n < len { + return string(b[:n]), nil + } + } + return "", syscall.ENOMEM +} + +// Flush implements p9.File. +func (l *localFile) Flush() error { + return nil +} + +// Connect implements p9.File. +func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { + return nil, syscall.ECONNREFUSED +} + +// Close implements p9.File. +func (l *localFile) Close() error { + err := l.controlFile.Close() + + // Close only once in case opened and control files point to + // the same os.File struct. + if l.openedFile != nil && l.openedFile != l.controlFile { + err = l.openedFile.Close() + } + + l.openedFile = nil + l.controlFile = nil + l.mode = invalidMode + return err +} + +// extractErrno tries to determine the errno. +func extractErrno(err error) syscall.Errno { + if err == nil { + // This should never happen. The likely result will be that + // some user gets the frustration "error: SUCCESS" message. + log.Warningf("extractErrno called with nil error!") + return 0 + } + + switch err { + case os.ErrNotExist: + return syscall.ENOENT + case os.ErrExist: + return syscall.EEXIST + case os.ErrPermission: + return syscall.EACCES + case os.ErrInvalid: + return syscall.EINVAL + } + + // See if it's an errno or a common wrapped error. + switch e := err.(type) { + case syscall.Errno: + return e + case *os.PathError: + return extractErrno(e.Err) + case *os.LinkError: + return extractErrno(e.Err) + case *os.SyscallError: + return extractErrno(e.Err) + } + + // Fall back to EIO. 
+ log.Debugf("Unknown error: %v, defaulting to EIO", err) + return syscall.EIO +} diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go new file mode 100644 index 000000000..7d834d596 --- /dev/null +++ b/runsc/fsgofer/fsgofer_test.go @@ -0,0 +1,576 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "fmt" + "io/ioutil" + "os" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" +) + +func init() { + log.SetLevel(log.Debug) + + allConfs = append(allConfs, rwConfs...) + allConfs = append(allConfs, roConfs...) +} + +var ( + allTypes = []fileType{regular, directory, symlink} + + // allConfs is set in init() above. + allConfs []Config + + rwConfs = []Config{ + Config{ROMount: false, LazyOpenForWrite: false}, + Config{ROMount: false, LazyOpenForWrite: true}, + } + roConfs = []Config{ + Config{ROMount: true, LazyOpenForWrite: false}, + Config{ROMount: true, LazyOpenForWrite: true}, + } +) + +type state struct { + root *localFile + file *localFile + conf Config + ft fileType +} + +func (s state) String() string { + return fmt.Sprintf("lazyopen(%v)-%v", s.conf.LazyOpenForWrite, s.ft) +} + +func runAll(t *testing.T, test func(*testing.T, state)) { + runCustom(t, allTypes, allConfs, test) +} + +func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) { + for _, c := range confs { + t.Logf("Config: %+v", c) + + for _, ft := range types { + t.Logf("File type: %v", ft) + + path, name, err := setup(ft) + if err != nil { + t.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a := NewAttachPoint(path, c) + root, err := a.Attach("/") + if err != nil { + t.Fatalf("Attach(%q) failed, err: %v", "/", err) + } + + _, file, err := root.Walk([]string{name}) + if err != nil { + root.Close() + t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) + } + + st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft} + test(t, st) + file.Close() + root.Close() + } + } +} + +func setup(ft fileType) (string, string, error) { + path, err := ioutil.TempDir("", "root-") + if err != nil { + return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) + } + + // First attach with writable configuiration to setup tree. 
+ a := NewAttachPoint(path, Config{}) + root, err := a.Attach("/") + if err != nil { + return "", "", fmt.Errorf("Attach(%q) failed, err: %v", "/", err) + } + defer root.Close() + + var name string + switch ft { + case regular: + name = "file" + _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) + } + defer f.Close() + case directory: + name = "dir" + if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err) + } + case symlink: + name = "symlink" + if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err) + } + default: + panic(fmt.Sprintf("unknown file type %v", ft)) + } + return path, name, nil +} + +func createFile(dir *localFile, name string) (*localFile, error) { + _, f, _, _, err := dir.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + return nil, err + } + return f.(*localFile), nil +} + +func TestReadWrite(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + child, err := createFile(s.file, "test") + if err != nil { + t.Fatalf("%v: createFile() failed, err: %v", s, err) + } + defer child.Close() + b := []byte("foobar") + w, err := child.WriteAt(b, 0) + if err != nil { + t.Fatalf("%v: Write() failed, err: %v", s, err) + } + if w != len(b) { + t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(b)) + } + for _, test := range []struct { + flags p9.OpenFlags + read bool + write bool + }{ + {flags: p9.ReadOnly, read: true, write: false}, + {flags: p9.WriteOnly, read: false, write: true}, + {flags: p9.ReadWrite, read: true, write: true}, + } { + _, l, err := s.file.Walk([]string{"test"}) + if err != nil { + t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err) + } + if _, _, _, err := l.Open(test.flags); err != nil { + t.Fatalf("%v: Open(%v) failed, err: %v", s, test.flags, err) + } + + w, err = l.WriteAt(b, 0) + if test.write { + if err != nil { + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) + } + if w != len(b) { + t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b)) + } + } else { + if err == nil { + t.Fatalf("%v, %v: WriteAt() should have failed", s, test.flags) + } + } + + rBuf := make([]byte, len(b)) + r, err := l.ReadAt(rBuf, 0) + if test.read { + if err != nil { + t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err) + } + if r != len(rBuf) { + t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf)) + } + if string(rBuf) != "foobar" { + t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar") + } + } else { + if err == nil { + t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags) + } + } + } + }) +} + +func TestCreate(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + for i, test := range []struct { + flags p9.OpenFlags + read bool + }{ + {flags: p9.WriteOnly, read: false}, + {flags: p9.ReadWrite, read: true}, + } { + _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), test.flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, 
test.flags, err) + } + + b := []byte("foobar") + w, err := l.WriteAt(b, 0) + if err != nil { + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) + } + if w != len(b) { + t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b)) + } + + rBuf := make([]byte, len(b)) + r, err := l.ReadAt(rBuf, 0) + if test.read { + if err != nil { + t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err) + } + if r != len(rBuf) { + t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf)) + } + if string(rBuf) != "foobar" { + t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar") + } + } else { + if err == nil { + t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags) + } + } + } + }) +} + +func TestUnopened(t *testing.T) { + runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) { + b := []byte("foobar") + if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF { + t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF { + t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Readdir(0, 100); err != syscall.EBADF { + t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.FSync(); err != syscall.EBADF { + t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + }) +} + +func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) { + if err := l.SetAttr(valid, attr); err != nil { + return p9.Attr{}, err + } + _, _, a, err := l.GetAttr(p9.AttrMask{}) + if err != nil { + return p9.Attr{}, err + } + return a, nil +} + +func TestSetAttrPerm(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + valid := p9.SetAttrMask{Permissions: true} + attr := p9.SetAttr{Permissions: 0777} + got, err := SetGetAttr(s.file, valid, attr) + if s.ft == symlink { + if err == nil { + t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) + } + } else { + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Permissions, err) + } + if got.Mode.Permissions() != attr.Permissions { + t.Errorf("%v: wrong permission, got: %v, expected: %v", s, got.Mode.Permissions(), attr.Permissions) + } + } + }) +} + +func TestSetAttrSize(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + for _, size := range []uint64{1024, 0, 1024 * 1024} { + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: size} + got, err := SetGetAttr(s.file, valid, attr) + if s.ft == symlink || s.ft == directory { + if err == nil { + t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) + } + // Run for one size only, they will all fail the same way. 
+ return + } + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Size, err) + } + if got.Size != size { + t.Errorf("%v: wrong size, got: %v, expected: %v", s, got.Size, size) + } + } + }) +} + +func TestSetAttrTime(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + valid := p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true} + attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456} + got, err := SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.ATimeSeconds, attr.ATimeNanoSeconds, err) + } + if got.ATimeSeconds != 123 { + t.Errorf("%v: wrong ATimeSeconds, got: %v, expected: %v", s, got.ATimeSeconds, 123) + } + if got.ATimeNanoSeconds != 456 { + t.Errorf("%v: wrong ATimeNanoSeconds, got: %v, expected: %v", s, got.ATimeNanoSeconds, 456) + } + + valid = p9.SetAttrMask{MTime: true, MTimeNotSystemTime: true} + attr = p9.SetAttr{MTimeSeconds: 789, MTimeNanoSeconds: 012} + got, err = SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.MTimeSeconds, attr.MTimeNanoSeconds, err) + } + if got.MTimeSeconds != 789 { + t.Errorf("%v: wrong MTimeSeconds, got: %v, expected: %v", s, got.MTimeSeconds, 789) + } + if got.MTimeNanoSeconds != 012 { + t.Errorf("%v: wrong MTimeNanoSeconds, got: %v, expected: %v", s, got.MTimeNanoSeconds, 012) + } + }) +} + +func TestSetAttrOwner(t *testing.T) { + if os.Getuid() != 0 { + t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", os.Getuid()) + } + + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + newUID := os.Getuid() + 1 + valid := p9.SetAttrMask{UID: true} + attr := p9.SetAttr{UID: p9.UID(newUID)} + got, err := SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.UID, err) + } + if got.UID != p9.UID(newUID) { + t.Errorf("%v: wrong uid, got: %v, expected: %v", s, got.UID, newUID) + } + }) +} + +func TestLink(t *testing.T) { + if os.Getuid() != 0 { + t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid()) + } + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + const dirName = "linkdir" + const linkFile = "link" + if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: MkDir(%s) failed, err: %v", s, dirName, err) + } + _, dir, err := s.root.Walk([]string{dirName}) + if err != nil { + t.Fatalf("%v: Walk({%s}) failed, err: %v", s, dirName, err) + } + + err = dir.Link(s.file, linkFile) + if s.ft == directory { + if err != syscall.EPERM { + t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err) + } + return + } + if err != nil { + t.Errorf("%v: Link(target, %s) failed, err: %v", s, linkFile, err) + } + }) +} + +func TestROMountChecks(t *testing.T) { + runCustom(t, allTypes, roConfs, func(t *testing.T, s state) { + if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.Rename(s.file, ".."); err != syscall.EBADF { + t.Errorf("%v: Rename() should have failed, got: %v, expected: 
syscall.EBADF", s, err) + } + if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.UnlinkAt("..", 0); err != syscall.EBADF { + t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.Link(s.file, ".."); err != syscall.EBADF { + t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: 0} + if err := s.file.SetAttr(valid, attr); err != syscall.EBADF { + t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + }) +} + +func TestInvalidName(t *testing.T) { + runCustom(t, []fileType{regular}, rwConfs, func(t *testing.T, s state) { + if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { + t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if _, _, err := s.file.Walk([]string{".."}); err != syscall.EINVAL { + t.Errorf("%v: Walk() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { + t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if err := s.file.Rename(s.file, ".."); err != syscall.EINVAL { + t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { + t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if err := s.file.UnlinkAt("..", 0); err != syscall.EINVAL { + t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if err := s.file.Link(s.file, ".."); err != syscall.EINVAL { + t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + }) +} + +func TestIsNameValid(t *testing.T) { + valid := []string{ + "name", + "123", + "!@#$%^&*()", + ".name", + "..name", + "...", + } + for _, s := range valid { + if got := isNameValid(s); !got { + t.Errorf("isNameValid(%s) failed, got: %v, expected: true", s, got) + } + } + invalid := []string{ + ".", + "..", + "name/name", + "/name", + "name/", + } + for _, s := range invalid { + if got := isNameValid(s); got { + t.Errorf("isNameValid(%s) failed, got: %v, expected: false", s, got) + } + } +} + +func TestWalkNotFound(t *testing.T) { + runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { + if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { + t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err) + } + }) +} + +func TestWalkDup(t *testing.T) { + runAll(t, func(t *testing.T, s state) { + _, dup, err := s.file.Walk([]string{}) + if err != nil { + t.Fatalf("%v: Walk(nil) failed, err: %v", s, err) + } + // Check that 'dup' is usable. 
+ if _, _, _, err := dup.GetAttr(p9.AttrMask{}); err != nil { + t.Errorf("%v: GetAttr() failed, err: %v", s, err) + } + }) +} + +func TestReaddir(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + name := "dir" + if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err) + } + name = "symlink" + if _, err := s.file.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: Symlink(%q) failed, err: %v", s, name, err) + } + name = "file" + _, f, _, _, err := s.file.Create(name, p9.ReadWrite, 0555, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("%v: createFile(root, %q) failed, err: %v", s, name, err) + } + f.Close() + + if _, _, _, err := s.file.Open(p9.ReadOnly); err != nil { + t.Fatalf("%v: Open(ReadOnly) failed, err: %v", s, err) + } + + dirents, err := s.file.Readdir(0, 10) + if err != nil { + t.Fatalf("%v: Readdir(0, 10) failed, err: %v", s, err) + } + if len(dirents) != 3 { + t.Fatalf("%v: Readdir(0, 10) wrong number of items, got: %v, expected: 3", s, len(dirents)) + } + var dir, symlink, file bool + for _, d := range dirents { + switch d.Name { + case "dir": + if d.Type != p9.TypeDir { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeDir) + } + dir = true + case "symlink": + if d.Type != p9.TypeSymlink { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeSymlink) + } + symlink = true + case "file": + if d.Type != p9.TypeRegular { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeRegular) + } + file = true + default: + t.Errorf("%v: dirent.Name got: %v", s, d.Name) + } + + _, f, err := s.file.Walk([]string{d.Name}) + if err != nil { + t.Fatalf("%v: Walk({%s}) failed, err: %v", s, d.Name, err) + } + _, _, a, err := f.GetAttr(p9.AttrMask{}) + if err != nil { + t.Fatalf("%v: GetAttr() failed, err: %v", s, err) + } + if d.Type != a.Mode.QIDType() { + t.Errorf("%v: dirent.Type different than GetAttr().Mode.QIDType(), got: %v, expected: %v", s, d.Type, a.Mode.QIDType()) + } + } + if !dir || !symlink || !file { + t.Errorf("%v: Readdir(0, 10) wrong files returned, dir: %v, symlink: %v, file: %v", s, dir, symlink, file) + } + }) +} diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go new file mode 100644 index 000000000..e676809ac --- /dev/null +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -0,0 +1,58 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fsgofer + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +func statAt(dirFd int, name string) (syscall.Stat_t, error) { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return syscall.Stat_t{}, extractErrno(err) + } + namePtr := uintptr(unsafe.Pointer(nameBytes)) + + var stat syscall.Stat_t + statPtr := uintptr(unsafe.Pointer(&stat)) + + if _, _, err := syscall.Syscall6(syscall.SYS_NEWFSTATAT, uintptr(dirFd), namePtr, statPtr, linux.AT_SYMLINK_NOFOLLOW, 0, 0); err != 0 { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error { + // utimensat(2) doesn't accept an empty name; instead, name must be nil to make it + // operate directly on 'dirFd', unlike other *at syscalls. + var namePtr uintptr + if name != "" { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return extractErrno(err) + } + namePtr = uintptr(unsafe.Pointer(nameBytes)) + } + + timesPtr := uintptr(unsafe.Pointer(&times[0])) + + if _, _, err := syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(dirFd), namePtr, timesPtr, uintptr(flags), 0, 0); err != 0 { + return err + } + return nil +} diff --git a/runsc/main.go b/runsc/main.go new file mode 100644 index 000000000..cf4b99d3f --- /dev/null +++ b/runsc/main.go @@ -0,0 +1,199 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary runsc is an implementation of the Open Container Initiative Runtime +// that runs applications inside a sandbox. +package main + +import ( + "fmt" + "io" + "os" + "path/filepath" + "strings" + "syscall" + "time" + + "context" + "flag" + + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cmd" +) + +var ( + // Although these flags are not part of the OCI spec, they are used by + // Docker, and thus should not be changed. + rootDir = flag.String("root", "", "root directory for storage of container state") + logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout") + logFormat = flag.String("log-format", "text", "log format: text (default) or json") + debug = flag.Bool("debug", false, "enable debug logging") + + // These flags are unique to runsc, and are used to configure parts of the + // system that are not covered by the runtime spec. + + // Debugging flags. + debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command") + logPackets = flag.Bool("log-packets", false, "enable network packet logging") + + // Debugging flags: strace related + strace = flag.Bool("strace", false, "enable strace") + straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. 
If --strace is true and this list is empty, then all syscalls will be traced.") + straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") + + // Flags that control sandbox runtime behavior. + platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. Using a proxy is more secure because it disallows the sandbox from opening files directly in the host.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") +) + +func main() { + // Help and flags commands are generated automatically. + subcommands.Register(subcommands.HelpCommand(), "") + subcommands.Register(subcommands.FlagsCommand(), "") + + // Register user-facing runsc commands. + subcommands.Register(new(cmd.Create), "") + subcommands.Register(new(cmd.Delete), "") + subcommands.Register(new(cmd.Events), "") + subcommands.Register(new(cmd.Exec), "") + subcommands.Register(new(cmd.Gofer), "") + subcommands.Register(new(cmd.Kill), "") + subcommands.Register(new(cmd.List), "") + subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Run), "") + subcommands.Register(new(cmd.Start), "") + subcommands.Register(new(cmd.State), "") + + // Register internal commands with the internal group name. This causes + // them to be sorted below the user-facing commands with empty group. + // The string below will be printed above the commands. + const internalGroup = "internal use only" + subcommands.Register(new(cmd.Boot), internalGroup) + subcommands.Register(new(cmd.Gofer), internalGroup) + + // All subcommands must be registered before flag parsing. + flag.Parse() + + platformType, err := boot.MakePlatformType(*platform) + if err != nil { + cmd.Fatalf("%v", err) + } + + fsAccess, err := boot.MakeFileAccessType(*fileAccess) + if err != nil { + cmd.Fatalf("%v", err) + } + + netType, err := boot.MakeNetworkType(*network) + if err != nil { + cmd.Fatalf("%v", err) + } + + // Create a new Config from the flags. + conf := &boot.Config{ + RootDir: *rootDir, + FileAccess: fsAccess, + Overlay: *overlay, + Network: netType, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + } + if len(*straceSyscalls) != 0 { + conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") + } + + // Set up logging. + if *debug { + log.SetLevel(log.Debug) + } + + var logFile io.Writer = os.Stderr + if *logFilename != "" { + f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", *logFilename, err) + } + logFile = f + } + + var e log.Emitter + switch *logFormat { + case "text": + e = log.GoogleEmitter{&log.Writer{Next: logFile}} + case "json": + e = log.JSONEmitter{log.Writer{Next: logFile}} + default: + cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat) + } + + if *debugLogDir != "" { + if err := os.MkdirAll(*debugLogDir, 0775); err != nil { + cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err) + } + + // Format: <debug-log-dir>/runsc.log.<timestamp>.<command> 
+ scmd := flag.CommandLine.Arg(0) + filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), scmd) + path := filepath.Join(*debugLogDir, filename) + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", filename, err) + } + e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} + } + + log.SetTarget(e) + + log.Infof("***************************") + log.Infof("Args: %s", os.Args) + log.Infof("PID: %d", os.Getpid()) + log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) + log.Infof("Configuration:") + log.Infof("\t\tRootDir: %s", conf.RootDir) + log.Infof("\t\tPlatform: %v", conf.Platform) + log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay) + log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets) + log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls) + log.Infof("***************************") + + // Call the subcommand and pass in the configuration. + var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode == subcommands.ExitSuccess { + log.Infof("Exiting with status: %v", ws) + if ws.Signaled() { + // No good way to return it, emulate what the shell does. Maybe raise + // signall to self? + os.Exit(128 + int(ws.Signal())) + } + os.Exit(ws.ExitStatus()) + } + // Return an error that is unlikely to be used by the application. + log.Warningf("Failure to execute command, err: %v", subcmdCode) + os.Exit(128) +} + +func init() { + // Set default root dir to something (hopefully) user-writeable. + *rootDir = "/var/run/runsc" + if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { + *rootDir = filepath.Join(runtimeDir, "runsc") + } +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD new file mode 100644 index 000000000..bdd95903e --- /dev/null +++ b/runsc/sandbox/BUILD @@ -0,0 +1,53 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "sandbox", + srcs = [ + "console.go", + "hook.go", + "namespace.go", + "network.go", + "sandbox.go", + "status.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/sandbox", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/control/client", + "//pkg/control/server", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/urpc", + "//runsc/boot", + "//runsc/specutils", + "@com_github_kr_pty//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_vishvananda_netlink//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "sandbox_test", + size = "small", + srcs = ["sandbox_test.go"], + pure = "on", + rundir = ".", + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/unet", + "//runsc/boot", + "//runsc/cmd", + "//runsc/sandbox", + "@com_github_google_subcommands//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/sandbox/console.go b/runsc/sandbox/console.go new file mode 100644 index 000000000..3f133e12a --- /dev/null +++ b/runsc/sandbox/console.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "net" + "os" + + "github.com/kr/pty" + "golang.org/x/sys/unix" +) + +// setupConsole creates pty master/slave pair, sends the master FD over the +// given socket, and returns the slave. +func setupConsole(socketPath string) (*os.File, error) { + // Create a new pty master and slave. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, fmt.Errorf("error opening pty: %v", err) + } + defer ptyMaster.Close() + + // Get a connection to the socket path. + conn, err := net.Dial("unix", socketPath) + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) + } + uc, ok := conn.(*net.UnixConn) + if !ok { + ptySlave.Close() + return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) + } + socket, err := uc.File() + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) + } + + // Send the master FD over the connection. + msg := unix.UnixRights(int(ptyMaster.Fd())) + if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) + } + return ptySlave, nil +} diff --git a/runsc/sandbox/hook.go b/runsc/sandbox/hook.go new file mode 100644 index 000000000..40b064cdc --- /dev/null +++ b/runsc/sandbox/hook.go @@ -0,0 +1,111 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "path/filepath" + "strings" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// This file implements hooks as defined in OCI spec: +// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22 +// +// "hooks":{ +// "prestart":[{ +// "path":"/usr/bin/dockerd", +// "args":[ +// "libnetwork-setkey", "arg2", +// ] +// }] +// }, + +// executeHooksBestEffort executes hooks and logs warning in case they fail. +// Runs all hooks, always. +func executeHooksBestEffort(hooks []specs.Hook, s specs.State) { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + log.Warningf("Failure to execute hook %+v, err: %v", h, err) + } + } +} + +// executeHooks executes hooks until the first one fails or they all execute. 
+func executeHooks(hooks []specs.Hook, s specs.State) error { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + return err + } + } + return nil +} + +func executeHook(h specs.Hook, s specs.State) error { + log.Debugf("Executing hook %+v, state: %+v", h, s) + + if strings.TrimSpace(h.Path) == "" { + return fmt.Errorf("empty path for hook") + } + if !filepath.IsAbs(h.Path) { + return fmt.Errorf("path for hook is not absolute: %q", h.Path) + } + + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + + c := make(chan error, 1) + go func() { + c <- cmd.Wait() + }() + + var timer <-chan time.Time + if h.Timeout != nil { + timer = time.After(time.Duration(*h.Timeout) * time.Second) + } + select { + case err := <-c: + if err != nil { + return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) + } + case <-timer: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) + } + + log.Debugf("Execute hook %q success!", h.Path) + return nil +} diff --git a/runsc/sandbox/namespace.go b/runsc/sandbox/namespace.go new file mode 100644 index 000000000..1d3bcfbb5 --- /dev/null +++ b/runsc/sandbox/namespace.go @@ -0,0 +1,204 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// nsCloneFlag returns the clone flag that can be used to set a namespace of +// the given type. +func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { + switch nst { + case specs.IPCNamespace: + return syscall.CLONE_NEWIPC + case specs.MountNamespace: + return syscall.CLONE_NEWNS + case specs.NetworkNamespace: + return syscall.CLONE_NEWNET + case specs.PIDNamespace: + return syscall.CLONE_NEWPID + case specs.UTSNamespace: + return syscall.CLONE_NEWUTS + case specs.UserNamespace: + return syscall.CLONE_NEWUSER + case specs.CgroupNamespace: + panic("cgroup namespace has no associated clone flag") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// nsPath returns the path of the namespace for the current process and the +// given namespace. 
+func nsPath(nst specs.LinuxNamespaceType) string { + base := "/proc/self/ns" + switch nst { + case specs.CgroupNamespace: + return filepath.Join(base, "cgroup") + case specs.IPCNamespace: + return filepath.Join(base, "ipc") + case specs.MountNamespace: + return filepath.Join(base, "mnt") + case specs.NetworkNamespace: + return filepath.Join(base, "net") + case specs.PIDNamespace: + return filepath.Join(base, "pid") + case specs.UserNamespace: + return filepath.Join(base, "user") + case specs.UTSNamespace: + return filepath.Join(base, "uts") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// getNS returns true and the namespace with the given type from the slice of +// namespaces in the spec. It returns false if the slice does not contain a +// namespace with the type. +func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { + if s.Linux == nil { + return specs.LinuxNamespace{}, false + } + for _, ns := range s.Linux.Namespaces { + if ns.Type == nst { + return ns, true + } + } + return specs.LinuxNamespace{}, false +} + +// filterNS returns a slice of namespaces from the spec with types that match +// those in the `filter` slice. +func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { + if s.Linux == nil { + return nil + } + var out []specs.LinuxNamespace + for _, nst := range filter { + if ns, ok := getNS(nst, s); ok { + out = append(out, ns) + } + } + return out +} + +// setNS sets the namespace of the given type. It must be called with +// OSThreadLocked. +func setNS(fd, nsType uintptr) error { + if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { + return err + } + return nil +} + +// applyNS applies the namespace on the current thread and returns a function +// that will restore the namespace to the original value. +// +// Preconditions: Must be called with os thread locked. +func applyNS(ns specs.LinuxNamespace) (func(), error) { + log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) + newNS, err := os.Open(ns.Path) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) + } + defer newNS.Close() + + // Store current netns to restore back after child is started. + curPath := nsPath(ns.Type) + oldNS, err := os.Open(curPath) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", curPath, err) + } + + // Set netns to the one requested and setup function to restore it back. + flag := nsCloneFlag(ns.Type) + if err := setNS(newNS.Fd(), flag); err != nil { + oldNS.Close() + return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) + } + return func() { + log.Infof("restoring namespace %v", ns.Type) + defer oldNS.Close() + if err := setNS(oldNS.Fd(), flag); err != nil { + panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) + } + }, nil +} + +// startInNS joins or creates the given namespaces and calls cmd.Start before +// restoring the namespaces to the original values. +func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { + // We are about to setup namespaces, which requires the os thread being + // locked so that Go doesn't change the thread out from under us. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + + for _, ns := range nss { + if ns.Path == "" { + // No path. Just set a flag to create a new namespace. 
+ cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) + continue + } + // Join the given namespace, and restore the current namespace + // before exiting. + restoreNS, err := applyNS(ns) + if err != nil { + return err + } + defer restoreNS() + } + + return cmd.Start() +} + +// setUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. +func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { + if s.Linux == nil { + return + } + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + for _, idMap := range s.Linux.UIDMappings { + log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } + for _, idMap := range s.Linux.GIDMappings { + log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } +} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go new file mode 100644 index 000000000..1b6a1d9a6 --- /dev/null +++ b/runsc/sandbox/network.go @@ -0,0 +1,348 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "net" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/vishvananda/netlink" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" +) + +// setupNetwork configures the network stack to mimic the local network +// configuration. Docker uses network namespaces with vnets to configure the +// network for the container. The untrusted app expects to see the same network +// inside the sandbox. Routing and port mapping is handled directly by docker +// with most of network information not even available to the runtime. +// +// Netstack inside the sandbox speaks directly to the device using a raw socket. +// All IP addresses assigned to the NIC, are removed and passed on to netstack's +// device. +// +// If 'conf.Network' is NoNetwork, skips local configuration and creates a +// loopback interface only. +// +// Run the following container to test it: +// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 +func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { + log.Infof("Setting up network") + + // HACK! + // + // When kubernetes starts a pod, it first creates a sandbox with an + // application that just pauses forever. 
Later, when a container is + // added to the pod, kubernetes will create another sandbox with a + // config that corresponds to the containerized application, and add it + // to the same namespaces as the pause sandbox. + // + // Running a second sandbox currently breaks because the two sandboxes + // have the same network namespace and configuration, and try to create + // a tap device on the same host device which fails. + // + // Runsc will eventually need to detect that this container is meant to + // be run in the same sandbox as the pausing application, and somehow + // make that happen. + // + // For now the following HACK disables networking for the "pause" + // sandbox, allowing the second sandbox to start up successfully. + // + // Cri-o helpfully adds the "ContainerType" annotation that we can use + // to detect whether we are a pod or container. Cri-containerd will + // support this eventually, but does not currently + // (https://github.com/kubernetes-incubator/cri-containerd/issues/512). + // + // Thus, to support cri-containerd, we check if the exec args is + // "/pause", which is pretty gross. + // + // TODO: Remove this once multiple containers per sandbox + // is properly supported. + if spec.Annotations["io.kubernetes.cri-o.ContainerType"] == "sandbox" || spec.Process.Args[0] == "/pause" { + log.Warningf("HACK: Disabling network") + conf.Network = boot.NetworkNone + } + + switch conf.Network { + case boot.NetworkNone: + log.Infof("Network is disabled, create loopback interface only") + if err := createDefaultLoopbackInterface(conn); err != nil { + return fmt.Errorf("error creating default loopback interface: %v", err) + } + case boot.NetworkSandbox: + // Build the path to the net namespace of the sandbox process. + // This is what we will copy. + nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") + if err := createInterfacesAndRoutesFromNS(conn, nsPath); err != nil { + return fmt.Errorf("error creating interfaces from net namespace %q: %v", nsPath, err) + } + case boot.NetworkHost: + // Nothing to do here. + default: + return fmt.Errorf("Invalid network type: %d", conf.Network) + } + return nil +} + +func createDefaultLoopbackInterface(conn *urpc.Client) error { + link := boot.LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []boot.Route{ + { + Destination: net.IP("\x7f\x00\x00\x00"), + Mask: net.IPMask("\xff\x00\x00\x00"), + }, + { + Destination: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", 16)), + }, + }, + } + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ + LoopbackLinks: []boot.LoopbackLink{link}, + }, nil); err != nil { + return fmt.Errorf("error creating loopback link and routes: %v", err) + } + return nil +} + +func joinNetNS(nsPath string) (func(), error) { + runtime.LockOSThread() + restoreNS, err := applyNS(specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: nsPath, + }) + if err != nil { + runtime.UnlockOSThread() + return nil, fmt.Errorf("error joining net namespace %q: %v", nsPath, err) + } + return func() { + restoreNS() + runtime.UnlockOSThread() + }, nil +} + +// isRootNS determines whether we are running in the root net namespace. +// +// TODO: Find a better way to detect root network. 
+func isRootNS(ifaces []net.Interface) bool { + for _, iface := range ifaces { + if iface.Name == "docker0" { + return true + } + } + return false + +} + +// createInterfacesAndRoutesFromNS scrapes the interface and routes from the +// net namespace with the given path, creates them in the sandbox, and removes +// them from the host. +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { + // Join the network namespace that we will be copying. + restore, err := joinNetNS(nsPath) + if err != nil { + return err + } + defer restore() + + // Get all interfaces in the namespace. + ifaces, err := net.Interfaces() + if err != nil { + return fmt.Errorf("error querying interfaces: %v", err) + } + + if isRootNS(ifaces) { + return fmt.Errorf("cannot run with network enabled in root network namespace") + } + + // Collect addresses and routes from the interfaces. + var args boot.CreateLinksAndRoutesArgs + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + log.Infof("Skipping down interface: %+v", iface) + continue + } + + ifaddrs, err := iface.Addrs() + if err != nil { + return fmt.Errorf("error fetching interface addresses for %q: %v", iface.Name, err) + } + + // We build our own loopback devices. + if iface.Flags&net.FlagLoopback != 0 { + links, err := loopbackLinks(iface, ifaddrs) + if err != nil { + return fmt.Errorf("error getting loopback routes and links for iface %q: %v", iface.Name, err) + } + args.LoopbackLinks = append(args.LoopbackLinks, links...) + continue + } + + // Get the link for the interface. + ifaceLink, err := netlink.LinkByName(iface.Name) + if err != nil { + return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err) + } + + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: ifaceLink.Attrs().Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + // Scrape the routes before removing the address, since that + // will remove the routes as well. + routes, def, err := routesForIface(iface) + if err != nil { + return fmt.Errorf("error getting routes for interface %q: %v", iface.Name, err) + } + if def != nil { + if !args.DefaultGateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway) + } + args.DefaultGateway.Route = *def + args.DefaultGateway.Name = iface.Name + } + + link := boot.FDBasedLink{ + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + } + + // Collect the addresses for the interface, enable forwarding, + // and remove them from the host. + for _, ifaddr := range ifaddrs { + ipNet, ok := ifaddr.(*net.IPNet) + if !ok { + return fmt.Errorf("address is not IPNet: %T %+v", ifaddr, ifaddr) + } + link.Addresses = append(link.Addresses, ipNet.IP) + + // Steal IP address from NIC. 
+ if err := removeAddress(ifaceLink, ipNet.String()); err != nil { + return fmt.Errorf("error removing address %v from device %q: %v", ipNet, iface.Name, err) + } + } + + args.FilePayload.Files = append(args.FilePayload.Files, deviceFile) + args.FDBasedLinks = append(args.FDBasedLinks, link) + } + + log.Debugf("Setting up network, config: %+v", args) + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { + return fmt.Errorf("error creating links and routes: %v", err) + } + return nil +} + +// loopbackLinks collects the links for a loopback interface. +func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) { + var links []boot.LoopbackLink + for _, addr := range addrs { + ipNet, ok := addr.(*net.IPNet) + if !ok { + return nil, fmt.Errorf("address is not IPNet: %T %+v", addr, addr) + } + links = append(links, boot.LoopbackLink{ + Name: iface.Name, + Addresses: []net.IP{ipNet.IP}, + Routes: []boot.Route{{ + Destination: ipNet.IP.Mask(ipNet.Mask), + Mask: ipNet.Mask, + }}, + }) + } + return links, nil +} + +// routesForIface iterates over all routes for the given interface and converts +// them to boot.Routes. +func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { + link, err := netlink.LinkByIndex(iface.Index) + if err != nil { + return nil, nil, err + } + rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) + if err != nil { + return nil, nil, fmt.Errorf("error getting routes from %q: %v", iface.Name, err) + } + + var def *boot.Route + var routes []boot.Route + for _, r := range rs { + // Is it a default route? + if r.Dst == nil { + if r.Gw == nil { + return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) + } + if def != nil { + return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r) + } + emptyAddr := net.IPv6zero + if r.Gw.To4() != nil { + emptyAddr = net.IPv4zero + } + // Create a catch all route to the gateway. + def = &boot.Route{ + Destination: emptyAddr, + Mask: net.IPMask(emptyAddr), + Gateway: r.Gw, + } + continue + } + routes = append(routes, boot.Route{ + Destination: r.Dst.IP.Mask(r.Dst.Mask), + Mask: r.Dst.Mask, + }) + } + return routes, def, nil +} + +// removeAddress removes an IP address from a network device. It's equivalent to: +// ip addr del <ipAndMask> dev <device> +func removeAddress(source netlink.Link, ipAndMask string) error { + addr, err := netlink.ParseAddr(ipAndMask) + if err != nil { + return err + } + return netlink.AddrDel(source, addr) +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go new file mode 100644 index 000000000..b2fa1d58e --- /dev/null +++ b/runsc/sandbox/sandbox.go @@ -0,0 +1,666 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sandbox creates and manipulates sandboxes. 
+package sandbox + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "syscall" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/control/client" + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// metadataFilename is the name of the metadata file relative to sandboxRoot +// that holds sandbox metadata. +const metadataFilename = "meta.json" + +// See libcontainer/factory_linux.go +var idRegex = regexp.MustCompile(`^[\w+-\.]+$`) + +// validateID validates the sandbox id. +func validateID(id string) error { + if !idRegex.MatchString(id) { + return fmt.Errorf("invalid sandbox id: %v", id) + } + return nil +} + +// Sandbox wraps a child sandbox process, and is responsible for saving and +// loading sandbox metadata to disk. +// +// Within a root directory, we maintain subdirectories for each sandbox named +// with the sandbox id. The sandbox metadata is stored as json within the +// sandbox directory in a file named "meta.json". This metadata format is +// defined by us, and is not part of the OCI spec. +// +// Sandboxes must write this metadata file after any change to their internal +// state. The entire sandbox directory is deleted when the sandbox is +// destroyed. +// +// TODO: Protect against concurrent changes to the sandbox metadata +// file. +type Sandbox struct { + // ID is the sandbox ID. + ID string `json:"id"` + + // Spec is the OCI runtime spec that configures this sandbox. + Spec *specs.Spec `json:"spec"` + + // BundleDir is the directory containing the sandbox bundle. + BundleDir string `json:"bundleDir"` + + // SandboxRoot is the directory containing the sandbox metadata file. + SandboxRoot string `json:"sandboxRoot"` + + // CreatedAt is the time the sandbox was created. + CreatedAt time.Time `json:"createdAt"` + + // Owner is the sandbox owner. + Owner string `json:"owner"` + + // ConsoleSocket is the path to a unix domain socket that will receive + // the console FD. It is only used during create, so we don't need to + // store it in the metadata. + ConsoleSocket string `json:"-"` + + // Pid is the pid of the running sandbox. Only valid if Status is + // Created or Running. + Pid int `json:"pid"` + + // GoferPid is the pid of the gofer running alongside the sandbox. May be 0 + // if the gofer has been killed or it's not being used. + GoferPid int `json:"goferPid"` + + // Status is the current sandbox Status. + Status Status `json:"status"` +} + +// Create creates the sandbox subprocess and writes the metadata file. Args +// are additional arguments that will be passed to the sandbox process. 
+func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (*Sandbox, error) { + log.Debugf("Create sandbox %q in root dir: %s", id, conf.RootDir) + if err := validateID(id); err != nil { + return nil, err + } + + sandboxRoot := filepath.Join(conf.RootDir, id) + if exists(sandboxRoot) { + return nil, fmt.Errorf("sandbox with id %q already exists: %q ", id, sandboxRoot) + } + + s := &Sandbox{ + ID: id, + Spec: spec, + ConsoleSocket: consoleSocket, + BundleDir: bundleDir, + SandboxRoot: sandboxRoot, + Status: Creating, + Owner: os.Getenv("USER"), + } + + // Create sandbox process. If anything errors between now and the end of this + // function, we MUST clean up all sandbox resources. + if err := s.createProcesses(conf, args); err != nil { + s.Destroy() + return nil, err + } + + // Wait for the control server to come up (or timeout). The sandbox is + // not "created" until that happens. + if err := s.waitForCreated(10 * time.Second); err != nil { + s.Destroy() + return nil, err + } + + s.Status = Created + s.CreatedAt = time.Now() + + // Save the metadata file. + if err := s.save(); err != nil { + s.Destroy() + return nil, err + } + + // Write the pid file. Containerd considers the create complete after + // this file is created, so it must be the last thing we do. + if pidFile != "" { + if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(s.Pid)), 0644); err != nil { + s.Destroy() + return nil, fmt.Errorf("error writing pid file: %v", err) + } + } + + return s, nil +} + +// Run is a helper that calls Create + Start + Wait. +func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (syscall.WaitStatus, error) { + s, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, args) + if err != nil { + return 0, fmt.Errorf("error creating sandbox: %v", err) + } + if err := s.Start(conf); err != nil { + return 0, fmt.Errorf("error starting sandbox: %v", err) + } + return s.Wait() +} + +// Load loads a sandbox with the given id from a metadata file. +func Load(rootDir, id string) (*Sandbox, error) { + log.Debugf("Load sandbox %q %q", rootDir, id) + if err := validateID(id); err != nil { + return nil, err + } + sandboxRoot := filepath.Join(rootDir, id) + if !exists(sandboxRoot) { + return nil, fmt.Errorf("sandbox with id %q does not exist", id) + } + metaFile := filepath.Join(sandboxRoot, metadataFilename) + if !exists(metaFile) { + return nil, fmt.Errorf("sandbox with id %q does not have metadata file %q", id, metaFile) + } + metaBytes, err := ioutil.ReadFile(metaFile) + if err != nil { + return nil, fmt.Errorf("error reading sandbox metadata file %q: %v", metaFile, err) + } + var s Sandbox + if err := json.Unmarshal(metaBytes, &s); err != nil { + return nil, fmt.Errorf("error unmarshaling sandbox metadata from %q: %v", metaFile, err) + } + + // If the status is "Running" or "Created", check that the process + // still exists, and set it to Stopped if it does not. + // + // This is inherently racy. + if s.Status == Running || s.Status == Created { + // Send signal 0 to check if process exists. + if err := s.Signal(0); err != nil { + // Process no longer exists. + s.Status = Stopped + s.Pid = 0 + } + } + + return &s, nil +} + +// List returns all sandbox ids in the given root directory. 
+func List(rootDir string) ([]string, error) { + log.Debugf("List sandboxes %q", rootDir) + fs, err := ioutil.ReadDir(rootDir) + if err != nil { + return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err) + } + var out []string + for _, f := range fs { + out = append(out, f.Name()) + } + return out, nil +} + +// State returns the metadata of the sandbox. +func (s *Sandbox) State() specs.State { + return specs.State{ + Version: specs.Version, + ID: s.ID, + Status: s.Status.String(), + Pid: s.Pid, + Bundle: s.BundleDir, + } +} + +// Start starts running the containerized process inside the sandbox. +func (s *Sandbox) Start(conf *boot.Config) error { + log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid) + if s.Status != Created { + return fmt.Errorf("cannot start container in state %s", s.Status) + } + + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container". + if s.Spec.Hooks != nil { + if err := executeHooks(s.Spec.Hooks.Prestart, s.State()); err != nil { + s.Destroy() + return err + } + } + + c, err := s.connect() + if err != nil { + s.Destroy() + return err + } + defer c.Close() + + // Configure the network. + if err := setupNetwork(c, s.Pid, s.Spec, conf); err != nil { + s.Destroy() + return fmt.Errorf("error setting up network: %v", err) + } + + // Send a message to the sandbox control server to start the + // application. + if err := c.Call(boot.ApplicationStart, nil, nil); err != nil { + s.Destroy() + return fmt.Errorf("error starting sandbox: %v", err) + } + + // "If any poststart hook fails, the runtime MUST log a warning, but + // the remaining hooks and lifecycle continue as if the hook had + // succeeded". + if s.Spec.Hooks != nil { + executeHooksBestEffort(s.Spec.Hooks.Poststart, s.State()) + } + + s.Status = Running + return s.save() +} + +// Processes retrieves the list of processes and associated metadata inside a +// sandbox. +func (s *Sandbox) Processes() ([]*control.Process, error) { + if s.Status != Running { + return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", s.ID, s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var pl []*control.Process + if err := c.Call(boot.ApplicationProcesses, nil, &pl); err != nil { + return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) + } + return pl, nil +} + +// Execute runs the specified command in the sandbox. +func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { + log.Debugf("Execute in sandbox %q, pid: %d, args: %+v", s.ID, s.Pid, e) + if s.Status != Created && s.Status != Running { + return 0, fmt.Errorf("cannot exec in container in state %s", s.Status) + } + + log.Debugf("Connecting to sandbox...") + c, err := s.connect() + if err != nil { + return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + defer c.Close() + + // Send a message to the sandbox control server to start the application. + var waitStatus uint32 + if err := c.Call(boot.ApplicationExecute, e, &waitStatus); err != nil { + return 0, fmt.Errorf("error executing in sandbox: %v", err) + } + + return syscall.WaitStatus(waitStatus), nil +} + +// Event retrieves stats about the sandbox such as memory and CPU utilization. 
+func (s *Sandbox) Event() (*boot.Event, error) { + if s.Status != Running && s.Status != Created { + return nil, fmt.Errorf("cannot get events for container in state: %s", s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var e boot.Event + if err := c.Call(boot.ApplicationEvent, nil, &e); err != nil { + return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err) + } + e.ID = s.ID + return &e, nil +} + +func (s *Sandbox) connect() (*urpc.Client, error) { + log.Debugf("Connecting to sandbox...") + c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) + if err != nil { + return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + return c, nil +} + +func (s *Sandbox) createProcesses(conf *boot.Config, args []string) error { + binPath, err := specutils.BinPath() + if err != nil { + return err + } + + ioFiles, err := s.createGoferProcess(conf, binPath, args) + if err != nil { + return err + } + return s.createSandboxProcess(conf, binPath, args, ioFiles) +} + +func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonArgs []string) ([]*os.File, error) { + if conf.FileAccess != boot.FileAccessProxy { + // Don't start a gofer. The sandbox will access host FS directly. + return nil, nil + } + + var args []string + args = append(args, commonArgs...) + args = append(args, "gofer", "--bundle", s.BundleDir) + + // Start with root mount and then add any other additional mount. + mountCount := 1 + for _, m := range s.Spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + + sandEnds := make([]*os.File, 0, mountCount) + goferEnds := make([]*os.File, 0, mountCount) + for i := 0; i < mountCount; i++ { + // Create socket that connects the sandbox and gofer. + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + return nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + } + + cmd := exec.Command(binPath, args...) + cmd.ExtraFiles = goferEnds + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + setUIDGIDMappings(cmd, s.Spec) + nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, s.Spec) + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := startInNS(cmd, nss); err != nil { + return nil, err + } + s.GoferPid = cmd.Process.Pid + log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + return sandEnds, nil +} + +// createSandboxProcess starts the sandbox as a subprocess by running the "boot" +// command, passing in the bundle dir. +func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, commonArgs []string, ioFiles []*os.File) error { + // nextFD is used to get unused FDs that we can pass to the sandbox. It + // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. + nextFD := 3 + + // Create control server socket here and donate FD to child process because + // it may be in a different network namespace and won't be reachable from + // outside. 
+ fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + if err != nil { + return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + } + + consoleEnabled := s.ConsoleSocket != "" + + cmd := exec.Command(binPath, commonArgs...) + cmd.SysProcAttr = &syscall.SysProcAttr{} + cmd.Args = append(cmd.Args, + "boot", + "--bundle", s.BundleDir, + "--controller-fd="+strconv.Itoa(nextFD), + fmt.Sprintf("--console=%t", consoleEnabled)) + nextFD++ + + controllerFile := os.NewFile(uintptr(fd), "control_server_socket") + defer controllerFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + + // If there is a gofer, sends all socket ends to the sandbox. + for _, f := range ioFiles { + defer f.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the tty on the sandox process. + if consoleEnabled { + // setupConsole will send the master on the socket, and return + // the slave. + tty, err := setupConsole(s.ConsoleSocket) + if err != nil { + return fmt.Errorf("error setting up control socket %q: %v", s.ConsoleSocket, err) + } + defer tty.Close() + + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + cmd.SysProcAttr.Setctty = true + cmd.SysProcAttr.Ctty = int(tty.Fd()) + } else { + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + + // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT + // when re-parented. + cmd.SysProcAttr.Setsid = true + + // nss is the set of namespaces to join or create before starting the sandbox + // process. IPC and UTS namespaces from the host are not used as they + // are virtualized inside the sandbox. Be paranoid and run inside an empty + // namespace for these. + log.Infof("Sandbox will be started in empty IPC and UTS namespaces") + nss := []specs.LinuxNamespace{ + specs.LinuxNamespace{Type: specs.IPCNamespace}, + specs.LinuxNamespace{Type: specs.UTSNamespace}, + } + + if conf.Platform == boot.PlatformPtrace { + // TODO: Also set an empty PID namespace so that we limit + // access to other host processes. + log.Infof("Sandbox will be started in the current PID namespace") + } else { + log.Infof("Sandbox will be started in empty PID namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) + } + + if conf.FileAccess == boot.FileAccessProxy { + log.Infof("Sandbox will be started in empty mount namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) + } else { + log.Infof("Sandbox will be started in the current mount namespace") + } + + // Joins the network namespace if network is enabled. the sandbox talks + // directly to the host network, which may have been configured in the + // namespace. + if ns, ok := getNS(specs.NetworkNamespace, s.Spec); ok && conf.Network != boot.NetworkNone { + log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) + nss = append(nss, ns) + } else { + log.Infof("Sandbox will be started in empty network namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) + } + + // User namespace depends on the following options: + // - Host network/filesystem: requires to run inside the user namespace + // specified in the spec or the current namespace if none is configured. + // - Gofer: when using a Gofer, the sandbox process can run isolated in an + // empty namespace. 
+ if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { + if userns, ok := getNS(specs.UserNamespace, s.Spec); ok { + log.Infof("Sandbox will be started in container's user namespace: %+v", userns) + nss = append(nss, userns) + setUIDGIDMappings(cmd, s.Spec) + } else { + // TODO: Retrict capabilities since it's using current user + // namespace, i.e. root. + log.Infof("Sandbox will be started in the current user namespace") + } + // When running in the caller's defined user namespace, apply the same + // capabilities to the sandbox process to ensure it abides to the same + // rules. + cmd.Args = append(cmd.Args, "--apply-caps=true") + + } else { + log.Infof("Sandbox will be started in empty user namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + } + + log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) + if err := startInNS(cmd, nss); err != nil { + return err + } + s.Pid = cmd.Process.Pid + log.Infof("Sandbox started, pid: %d", s.Pid) + return nil +} + +// waitForCreated waits for the sandbox subprocess control server to be +// running, at which point the sandbox is in Created state. +func (s *Sandbox) waitForCreated(timeout time.Duration) error { + log.Debugf("Waiting for sandbox %q creation", s.ID) + tchan := time.After(timeout) + for { + select { + case <-tchan: + return fmt.Errorf("timed out waiting for sandbox control server") + default: + if c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)); err == nil { + // It's alive! + c.Close() + return nil + } + } + } +} + +// Wait waits for the containerized process to exit, and returns its WaitStatus. +func (s *Sandbox) Wait() (syscall.WaitStatus, error) { + log.Debugf("Wait on sandbox %q with pid %d", s.ID, s.Pid) + p, err := os.FindProcess(s.Pid) + if err != nil { + // "On Unix systems, FindProcess always succeeds and returns a + // Process for the given pid." + panic(err) + } + ps, err := p.Wait() + if err != nil { + return 0, err + } + return ps.Sys().(syscall.WaitStatus), nil +} + +// Destroy frees all resources associated with the sandbox. +func (s *Sandbox) Destroy() error { + log.Debugf("Destroy sandbox %q", s.ID) + if s.Pid != 0 { + // TODO: Too harsh? + log.Debugf("Killing sandbox %q", s.ID) + sendSignal(s.Pid, unix.SIGKILL) + s.Pid = 0 + } + if s.GoferPid != 0 { + log.Debugf("Killing gofer for sandbox %q", s.ID) + sendSignal(s.GoferPid, unix.SIGKILL) + s.GoferPid = 0 + } + if err := os.RemoveAll(s.SandboxRoot); err != nil { + log.Warningf("Failed to delete sandbox root directory %q, err: %v", s.SandboxRoot, err) + } + + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded". + if s.Spec.Hooks != nil && (s.Status == Created || s.Status == Running) { + executeHooksBestEffort(s.Spec.Hooks.Poststop, s.State()) + } + + s.Status = Stopped + return nil +} + +// Signal sends the signal to the sandbox. +func (s *Sandbox) Signal(sig syscall.Signal) error { + log.Debugf("Signal sandbox %q", s.ID) + if s.Status == Stopped { + log.Warningf("sandbox %q not running, not sending signal %v to pid %d", s.ID, sig, s.Pid) + return nil + } + return sendSignal(s.Pid, sig) +} + +func sendSignal(pid int, sig syscall.Signal) error { + if err := syscall.Kill(pid, sig); err != nil { + return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) + } + return nil +} + +// save saves the sandbox metadata to a file. 
+func (s *Sandbox) save() error { + log.Debugf("Save sandbox %q", s.ID) + if err := os.MkdirAll(s.SandboxRoot, 0711); err != nil { + return fmt.Errorf("error creating sandbox root directory %q: %v", s.SandboxRoot, err) + } + meta, err := json.Marshal(s) + if err != nil { + return fmt.Errorf("error marshaling sandbox metadata: %v", err) + } + metaFile := filepath.Join(s.SandboxRoot, metadataFilename) + if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { + return fmt.Errorf("error writing sandbox metadata: %v", err) + } + return nil +} + +// exists returns true if the given file exists. +func exists(f string) bool { + if _, err := os.Stat(f); err == nil { + return true + } else if !os.IsNotExist(err) { + log.Warningf("error checking for file %q: %v", f, err) + } + return false +} diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go new file mode 100644 index 000000000..6c71cac30 --- /dev/null +++ b/runsc/sandbox/sandbox_test.go @@ -0,0 +1,649 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox_test + +import ( + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/signal" + "path/filepath" + "reflect" + "strings" + "syscall" + "testing" + "time" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cmd" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +func init() { + log.SetLevel(log.Debug) +} + +// writeSpec writes the spec to disk in the given directory. +func writeSpec(dir string, spec *specs.Spec) error { + b, err := json.Marshal(spec) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) +} + +// newSpecWithArgs creates a simple spec with the given args suitable for use +// in tests. +func newSpecWithArgs(args ...string) *specs.Spec { + spec := &specs.Spec{ + // The host filesystem root is the sandbox root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: args, + Env: []string{ + "PATH=" + os.Getenv("PATH"), + }, + }, + } + return spec +} + +// shutdownSignal will be sent to the sandbox in order to shut down cleanly. +const shutdownSignal = syscall.SIGUSR2 + +// setupSandbox creates a bundle and root dir for the sandbox, generates a test +// config, and writes the spec to config.json in the bundle dir. 
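save above is the write half of a JSON round-trip; the Load path defined elsewhere in this package reads the same file back. A hedged sketch of what that read side looks like; the meta.json file name and the sandboxMeta struct are placeholders, since the real metadataFilename constant and Sandbox type are not shown in this hunk:

package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"path/filepath"
)

// sandboxMeta stands in for the real Sandbox metadata structure.
type sandboxMeta struct {
	ID  string `json:"id"`
	Pid int    `json:"pid"`
}

// loadMetadata reads and unmarshals the metadata written by save.
func loadMetadata(sandboxRoot string) (*sandboxMeta, error) {
	metaFile := filepath.Join(sandboxRoot, "meta.json")
	b, err := ioutil.ReadFile(metaFile)
	if err != nil {
		return nil, fmt.Errorf("error reading sandbox metadata %q: %v", metaFile, err)
	}
	var m sandboxMeta
	if err := json.Unmarshal(b, &m); err != nil {
		return nil, fmt.Errorf("error unmarshaling sandbox metadata: %v", err)
	}
	return &m, nil
}

func main() {
	m, err := loadMetadata("/var/run/example-sandbox")
	fmt.Println(m, err)
}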
+func setupSandbox(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) {
+	rootDir, err = ioutil.TempDir("", "sandboxes")
+	if err != nil {
+		return "", "", nil, fmt.Errorf("error creating root dir: %v", err)
+	}
+
+	bundleDir, err = ioutil.TempDir("", "bundle")
+	if err != nil {
+		return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err)
+	}
+
+	if err = writeSpec(bundleDir, spec); err != nil {
+		return "", "", nil, fmt.Errorf("error writing spec: %v", err)
+	}
+
+	conf = &boot.Config{
+		RootDir: rootDir,
+		Network: boot.NetworkNone,
+	}
+
+	return rootDir, bundleDir, conf, nil
+}
+
+// uniqueSandboxID generates a unique sandbox id for each test.
+//
+// The sandbox id is used to create an abstract unix domain socket, which must
+// be unique. While the sandbox forbids creating two sandboxes with the same
+// name, sometimes between test runs the socket does not get cleaned up quickly
+// enough, causing sandbox creation to fail.
+func uniqueSandboxID() string {
+	return fmt.Sprintf("test-sandbox-%d", time.Now().UnixNano())
+}
+
+// waitForProcessList waits for the given process list to show up in the sandbox.
+func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error {
+	var got []*control.Process
+	for start := time.Now(); time.Now().Sub(start) < 10*time.Second; {
+		var err error
+		got, err = s.Processes()
+		if err != nil {
+			return fmt.Errorf("error getting process data from sandbox: %v", err)
+		}
+		if procListsEqual(got, expected) {
+			return nil
+		}
+		// Process might not have started, try again...
+		time.Sleep(10 * time.Millisecond)
+	}
+	return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected))
+}
+
+// TestLifecycle tests the basic Create/Start/Signal/Destroy sandbox lifecycle.
+// It verifies after each step that the sandbox can be loaded from disk, and
+// has the correct status.
+func TestLifecycle(t *testing.T) {
+	// The sandbox will just sleep for a long time. We will kill it before
+	// it finishes sleeping.
+	spec := newSpecWithArgs("sleep", "100")
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+	}
+	// Create the sandbox.
+	id := uniqueSandboxID()
+	if _, err := sandbox.Create(id, spec, conf, bundleDir, "", "", nil); err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	// Load the sandbox from disk and check the status.
+	s, err := sandbox.Load(rootDir, id)
+	if err != nil {
+		t.Fatalf("error loading sandbox: %v", err)
+	}
+	if got, want := s.Status, sandbox.Created; got != want {
+		t.Errorf("sandbox status got %v, want %v", got, want)
+	}
+
+	// List should return the sandbox id.
+	ids, err := sandbox.List(rootDir)
+	if err != nil {
+		t.Fatalf("error listing sandboxes: %v", err)
+	}
+	if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) {
+		t.Errorf("sandbox list got %v, want %v", got, want)
+	}
+
+	// Start the sandbox.
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+	// Load the sandbox from disk and check the status.
+ s, err = sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Running; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } + + // Send the sandbox a signal, which we catch and use to cleanly + // shutdown. + if err := s.Signal(shutdownSignal); err != nil { + t.Fatalf("error sending signal %v to sandbox: %v", shutdownSignal, err) + } + // Wait for it to die. + if _, err := s.Wait(); err != nil { + t.Fatalf("error waiting on sandbox: %v", err) + } + // Load the sandbox from disk and check the status. + s, err = sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Stopped; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // Destroy the sandbox. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying sandbox: %v", err) + } + + // List should not return the sandbox id. + ids, err = sandbox.List(rootDir) + if err != nil { + t.Fatalf("error listing sandboxes: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected sandbox list to be empty, but got %v", ids) + } + + // Loading the sandbox by id should fail. + if _, err = sandbox.Load(rootDir, id); err == nil { + t.Errorf("expected loading destroyed sandbox to fail, but it did not") + } +} + +// Test the we can execute the application with different path formats. +func TestExePath(t *testing.T) { + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + } { + spec := newSpecWithArgs(test.path) + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("exec: %s, error setting up sandbox: %v", test.path, err) + } + + ws, err := sandbox.Run(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) + + os.RemoveAll(rootDir) + os.RemoveAll(bundleDir) + + if test.success { + if err != nil { + t.Errorf("exec: %s, error running sandbox: %v", test.path, err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: %s, got: no error, want: error", test.path) + } + } + } +} + +// Test the we can retrieve the application exit status from the sandbox. +func TestAppExitStatus(t *testing.T) { + // First sandbox will succeed. + succSpec := newSpecWithArgs("true") + + rootDir, bundleDir, conf, err := setupSandbox(succSpec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + ws, err := sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir, "", "", nil) + if err != nil { + t.Fatalf("error running sandbox: %v", err) + } + if ws.ExitStatus() != 0 { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0) + } + + // Second sandbox exits with non-zero status. 
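TestAppExitStatus (continued below) depends on syscall.WaitStatus to recover the application's exit code, the same value Sandbox.Wait returns earlier in this patch. A standalone sketch of that extraction using plain os/exec, outside the sandbox machinery:

package main

import (
	"fmt"
	"os/exec"
	"syscall"
)

func main() {
	// Run a command that exits with status 3.
	err := exec.Command("/bin/sh", "-c", "exit 3").Run()
	if err == nil {
		fmt.Println("exit status 0")
		return
	}
	// On Unix, the wrapped process state exposes a syscall.WaitStatus.
	if ee, ok := err.(*exec.ExitError); ok {
		ws := ee.Sys().(syscall.WaitStatus)
		fmt.Println("exit status:", ws.ExitStatus()) // prints 3
	} else {
		fmt.Println("run failed:", err)
	}
}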
+	wantStatus := 123
+	errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus))
+
+	rootDir2, bundleDir2, conf, err := setupSandbox(errSpec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir2)
+	defer os.RemoveAll(bundleDir2)
+
+	ws, err = sandbox.Run(uniqueSandboxID(), errSpec, conf, bundleDir2, "", "", nil)
+	if err != nil {
+		t.Fatalf("error running sandbox: %v", err)
+	}
+	if ws.ExitStatus() != wantStatus {
+		t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus)
+	}
+}
+
+// TestExec verifies that a sandbox can exec a new program.
+func TestExec(t *testing.T) {
+	const uid = 343
+	spec := newSpecWithArgs("sleep", "100")
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create and start the sandbox.
+	s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	defer s.Destroy()
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+		{
+			UID:  uid,
+			PID:  2,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+	}
+
+	// Verify that "sleep 100" is running.
+	if err := waitForProcessList(s, expectedPL[:1]); err != nil {
+		t.Error(err)
+	}
+
+	execArgs := control.ExecArgs{
+		Filename:         "/bin/sleep",
+		Argv:             []string{"sleep", "5"},
+		Envv:             []string{"PATH=" + os.Getenv("PATH")},
+		WorkingDirectory: "/",
+		KUID:             uid,
+		Detach:           false,
+	}
+
+	// Verify that "sleep 100" and "sleep 5" are running after exec.
+	// First, start running exec (which blocks).
+	status := make(chan error, 1)
+	go func() {
+		exitStatus, err := s.Execute(&execArgs)
+		if err != nil {
+			status <- err
+		} else if exitStatus != 0 {
+			status <- fmt.Errorf("failed with exit status: %v", exitStatus)
+		} else {
+			status <- nil
+		}
+	}()
+
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Fatal(err)
+	}
+
+	// Ensure that exec finished without error.
+	select {
+	case <-time.After(10 * time.Second):
+		t.Fatalf("sandbox timed out waiting for exec to finish.")
+	case st := <-status:
+		if st != nil {
+			t.Errorf("sandbox failed to exec %v: %v", execArgs, st)
+		}
+	}
+}
+
+// TestCapabilities verifies that:
+// - Running exec as non-root UID and GID will result in an error (because the
+//   executable file can't be read).
+// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips
+//   this check.
+func TestCapabilities(t *testing.T) {
+	const uid = 343
+	const gid = 2401
+	spec := newSpecWithArgs("sleep", "100")
+
+	// We generate files in the host temporary directory.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: os.TempDir(),
+		Source:      os.TempDir(),
+		Type:        "bind",
+	})
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create and start the sandbox.
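TestExec above runs the blocking Execute call in a goroutine, reports the outcome over a buffered channel, and bounds the wait with time.After. A standalone sketch of that shape with a stand-in for the blocking call (all names here are illustrative):

package main

import (
	"fmt"
	"time"
)

// blockingCall stands in for Sandbox.Execute: it blocks, then reports an
// exit code and an error.
func blockingCall() (int, error) {
	time.Sleep(500 * time.Millisecond)
	return 0, nil
}

func main() {
	status := make(chan error, 1) // buffered so the goroutine never leaks
	go func() {
		code, err := blockingCall()
		switch {
		case err != nil:
			status <- err
		case code != 0:
			status <- fmt.Errorf("failed with exit status %d", code)
		default:
			status <- nil
		}
	}()

	select {
	case <-time.After(10 * time.Second):
		fmt.Println("timed out waiting for the call to finish")
	case st := <-status:
		fmt.Println("result:", st)
	}
}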
+ s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) + if err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting sandbox: %v", err) + } + + // expectedPL lists the expected process state of the sandbox. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + }, + } + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Fatalf("Failed to wait for sleep to start, err: %v", err) + } + + // Create an executable that can't be run with the specified UID:GID. + // This shouldn't be callable within the sandbox until we add the + // CAP_DAC_OVERRIDE capability to skip the access check. + exePath := filepath.Join(rootDir, "exe") + if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { + t.Fatalf("couldn't create executable: %v", err) + } + defer os.Remove(exePath) + + // Need to traverse the intermediate directory. + os.Chmod(rootDir, 0755) + + execArgs := control.ExecArgs{ + Filename: exePath, + Argv: []string{exePath}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + KGID: gid, + Capabilities: &auth.TaskCapabilities{}, + Detach: true, + } + + // "exe" should fail because we don't have the necessary permissions. + if _, err := s.Execute(&execArgs); err == nil { + t.Fatalf("sandbox executed without error, but an error was expected") + } + + // Now we run with the capability enabled and should succeed. + execArgs.Capabilities = &auth.TaskCapabilities{ + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + } + // First, start running exec. + if _, err := s.Execute(&execArgs); err != nil { + t.Fatalf("sandbox failed to exec %v: %v", execArgs, err) + } + + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } +} + +// Test that an tty FD is sent over the console socket if one is provided. +func TestConsoleSocket(t *testing.T) { + spec := newSpecWithArgs("true") + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create a named socket and start listening. We use a relative path + // to avoid overflowing the unix path length limit (108 chars). + socketPath := filepath.Join(bundleDir, "socket") + cwd, err := os.Getwd() + if err != nil { + t.Fatalf("error getting cwd: %v", err) + } + socketRelPath, err := filepath.Rel(cwd, socketPath) + if err != nil { + t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + } + if len(socketRelPath) > len(socketPath) { + socketRelPath = socketPath + } + srv, err := unet.BindAndListen(socketRelPath, false) + if err != nil { + t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) + } + defer os.Remove(socketPath) + + // Create the sandbox and pass the socket name. + id := uniqueSandboxID() + s, err := sandbox.Create(id, spec, conf, bundleDir, socketRelPath, "", nil) + if err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + + // Open the othe end of the socket. + sock, err := srv.Accept() + if err != nil { + t.Fatalf("error accepting socket connection: %v", err) + } + + // Allow 3 fds to be received. We only expect 1. 
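The console test continues below by pulling the tty FD out of the socket with the unet reader. For reference, a standalone sketch of the same SCM_RIGHTS receive written against the standard library and golang.org/x/sys/unix instead of unet; the socket path is illustrative, not from the patch:

package main

import (
	"fmt"
	"net"

	"golang.org/x/sys/unix"
)

// recvFD receives a single file descriptor over a connected unix socket.
func recvFD(conn *net.UnixConn) (int, error) {
	buf := make([]byte, 1)
	oob := make([]byte, unix.CmsgSpace(4)) // room for one 32-bit FD
	_, oobn, _, _, err := conn.ReadMsgUnix(buf, oob)
	if err != nil {
		return -1, err
	}
	msgs, err := unix.ParseSocketControlMessage(oob[:oobn])
	if err != nil || len(msgs) != 1 {
		return -1, fmt.Errorf("expected 1 control message, got %d (err: %v)", len(msgs), err)
	}
	fds, err := unix.ParseUnixRights(&msgs[0])
	if err != nil || len(fds) != 1 {
		return -1, fmt.Errorf("expected 1 fd, got %d (err: %v)", len(fds), err)
	}
	return fds[0], nil
}

func main() {
	l, err := net.ListenUnix("unix", &net.UnixAddr{Name: "/tmp/console.sock", Net: "unix"})
	if err != nil {
		panic(err)
	}
	defer l.Close()
	conn, err := l.AcceptUnix()
	if err != nil {
		panic(err)
	}
	defer conn.Close()
	fd, err := recvFD(conn)
	fmt.Println("received fd:", fd, "err:", err)
}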
+ r := sock.Reader(true /* blocking */) + r.EnableFDs(1) + + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + t.Fatalf("error reading from socket connection: %v", err) + } + + // We should have gotten a control message. + fds, err := r.ExtractFDs() + if err != nil { + t.Fatalf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + t.Fatalf("got %d fds from socket, wanted 1", len(fds)) + } + + // Verify that the fd is a terminal. + if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) + } + + // Shut it down. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying sandbox: %v", err) + } + + // Close socket. + if err := srv.Close(); err != nil { + t.Fatalf("error destroying sandbox: %v", err) + } +} + +// procListsEqual is used to check whether 2 Process lists are equal for all +// implemented fields. +func procListsEqual(got, want []*control.Process) bool { + if len(got) != len(want) { + return false + } + for i := range got { + pd1 := got[i] + pd2 := want[i] + // Zero out unimplemented and timing dependant fields. + pd1.Time, pd2.Time = "", "" + pd1.STime, pd2.STime = "", "" + pd1.C, pd2.C = 0, 0 + if *pd1 != *pd2 { + return false + } + } + return true +} + +func procListToString(pl []*control.Process) string { + strs := make([]string, 0, len(pl)) + for _, p := range pl { + strs = append(strs, fmt.Sprintf("%+v", p)) + } + return fmt.Sprintf("[%s]", strings.Join(strs, ",")) +} + +// TestMain acts like runsc if it is called with the "boot" argument, otherwise +// it just runs the tests. This is required because creating a sandbox will +// call "/proc/self/exe boot". Normally /proc/self/exe is the runsc binary, +// but for tests we have to fake it. +func TestMain(m *testing.M) { + // exit writes coverage data before exiting. + exit := func(status int) { + os.Exit(status) + } + + if !flag.Parsed() { + flag.Parse() + } + + // If we are passed one of the commands then run it. + subcommands.Register(new(cmd.Boot), "boot") + subcommands.Register(new(cmd.Gofer), "gofer") + switch flag.Arg(0) { + case "boot", "gofer": + // Run the command in a goroutine so we can block the main + // thread waiting for shutdownSignal. + go func() { + conf := &boot.Config{ + RootDir: "unused-root-dir", + Network: boot.NetworkNone, + } + var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode != subcommands.ExitSuccess { + panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode)) + } + // Sandbox exited normally. Shut down this process. + os.Exit(ws.ExitStatus()) + }() + + // Shutdown cleanly when the shutdownSignal is received. This + // allows us to write coverage data before exiting. + sigc := make(chan os.Signal, 1) + signal.Notify(sigc, shutdownSignal) + <-sigc + exit(0) + default: + // Otherwise run the tests. + exit(m.Run()) + } +} diff --git a/runsc/sandbox/status.go b/runsc/sandbox/status.go new file mode 100644 index 000000000..6fc936aba --- /dev/null +++ b/runsc/sandbox/status.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +// Status enumerates sandbox statuses. The statuses and their semantics are +// part of the runtime CLI spec. +// +// TODO: Get precise about the transitions between statuses. +type Status int + +const ( + // Creating indicates "the container is being created". + Creating Status = iota + + // Created indicates "the runtime has finished the create operation and + // the container process has neither exited nor executed the + // user-specified program". + Created + + // Running indicates "the container process has executed the + // user-specified program but has not exited". + Running + + // Stopped indicates "the container process has exited". + Stopped +) + +// String converts a Status to a string. These strings are part of the runtime +// CLI spec and should not be changed. +func (s Status) String() string { + switch s { + case Creating: + return "creating" + case Created: + return "created" + case Running: + return "running" + case Stopped: + return "stopped" + default: + return "unknown" + } + +} diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD new file mode 100644 index 000000000..ae89260d2 --- /dev/null +++ b/runsc/specutils/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "specutils", + srcs = ["specutils.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/kernel/auth", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go new file mode 100644 index 000000000..bed0f75eb --- /dev/null +++ b/runsc/specutils/specutils.go @@ -0,0 +1,183 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package specutils contains utility functions for working with OCI runtime +// specs. +package specutils + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// LogSpec logs the spec in a human-friendly way. 
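Status.String above maps statuses to the strings required by the runtime CLI spec; when state is read back from disk it can be useful to map the other way. A hedged sketch of such a helper (parseStatus is hypothetical and not part of this patch; it would live in the same package as Status):

package sandbox

import "fmt"

// parseStatus is a hypothetical inverse of Status.String.
func parseStatus(s string) (Status, error) {
	switch s {
	case "creating":
		return Creating, nil
	case "created":
		return Created, nil
	case "running":
		return Running, nil
	case "stopped":
		return Stopped, nil
	default:
		return 0, fmt.Errorf("invalid status %q", s)
	}
}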
+func LogSpec(spec *specs.Spec) { + log.Debugf("Spec: %+v", spec) + log.Debugf("Spec.Hooks: %+v", spec.Hooks) + log.Debugf("Spec.Linux: %+v", spec.Linux) + log.Debugf("Spec.Process: %+v", spec.Process) + log.Debugf("Spec.Root: %+v", spec.Root) +} + +// ReadSpec reads an OCI runtime spec from the given bundle directory. +// +// TODO: This should validate the spec. +func ReadSpec(bundleDir string) (*specs.Spec, error) { + // The spec file must be in "config.json" inside the bundle directory. + specFile := filepath.Join(bundleDir, "config.json") + specBytes, err := ioutil.ReadFile(specFile) + if err != nil { + return nil, fmt.Errorf("error reading spec from file %q: %v", specFile, err) + } + var spec specs.Spec + if err := json.Unmarshal(specBytes, &spec); err != nil { + return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes)) + } + return &spec, nil +} + +// GetExecutablePath returns the absolute path to the executable, relative to +// the root. It searches the environment PATH for the first file that exists +// with the given name. +func GetExecutablePath(exec, root string, env []string) (string, error) { + exec = filepath.Clean(exec) + + // Don't search PATH if exec is a path to a file (absolute or relative). + if strings.IndexByte(exec, '/') >= 0 { + return exec, nil + } + + // Get the PATH from the environment. + const prefix = "PATH=" + var path []string + for _, e := range env { + if strings.HasPrefix(e, prefix) { + path = strings.Split(strings.TrimPrefix(e, prefix), ":") + break + } + } + + // Search the PATH for a file whose name matches the one we are looking + // for. + for _, p := range path { + abs := filepath.Join(root, p, exec) + if _, err := os.Stat(abs); err == nil { + // We found it! Return the path relative to the root. + return filepath.Join("/", p, exec), nil + } + } + + // Could not find a suitable path, just return the original string. + log.Warningf("could not find executable %s in path %s", exec, path) + return exec, nil +} + +// Capabilities takes in spec and returns a TaskCapabilities corresponding to +// the spec. +func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { + var caps auth.TaskCapabilities + if specCaps != nil { + var err error + if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding); err != nil { + return nil, err + } + if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective); err != nil { + return nil, err + } + if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable); err != nil { + return nil, err + } + if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted); err != nil { + return nil, err + } + // TODO: Support ambient capabilities. 
+ } + return &caps, nil +} + +var capFromName = map[string]linux.Capability{ + "CAP_CHOWN": linux.CAP_CHOWN, + "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": linux.CAP_FOWNER, + "CAP_FSETID": linux.CAP_FSETID, + "CAP_KILL": linux.CAP_KILL, + "CAP_SETGID": linux.CAP_SETGID, + "CAP_SETUID": linux.CAP_SETUID, + "CAP_SETPCAP": linux.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, + "CAP_NET_BROAD_CAST": linux.CAP_NET_BROAD_CAST, + "CAP_NET_ADMIN": linux.CAP_NET_ADMIN, + "CAP_NET_RAW": linux.CAP_NET_RAW, + "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, + "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, + "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, + "CAP_SYS_NICE": linux.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": linux.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": linux.CAP_MKNOD, + "CAP_LEASE": linux.CAP_LEASE, + "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": linux.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, + "CAP_SYSLOG": linux.CAP_SYSLOG, + "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, +} + +func capsFromNames(names []string) (auth.CapabilitySet, error) { + var caps []linux.Capability + for _, n := range names { + c, ok := capFromName[n] + if !ok { + return 0, fmt.Errorf("unknown capability %q", n) + } + caps = append(caps, c) + } + return auth.CapabilitySetOfMany(caps), nil +} + +// Is9PMount returns true if the given mount can be mounted as an external gofer. +func Is9PMount(m specs.Mount) bool { + return m.Type == "bind" && m.Source != "" && !strings.HasPrefix(m.Destination, "/dev") +} + +// BinPath returns the real path to self, resolving symbolink links. This is done +// to make the process name appears as 'runsc', instead of 'exe'. +func BinPath() (string, error) { + binPath, err := filepath.EvalSymlinks("/proc/self/exe") + if err != nil { + return "", fmt.Errorf(`error resolving "/proc/self/exe" symlink: %v`, err) + } + return binPath, nil +} -- cgit v1.2.3 From 5eab7a41a3b5419fd0ee0e68116b50fd72b4cdec Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 1 May 2018 09:44:58 -0700 Subject: Remove stale TODO PiperOrigin-RevId: 194949678 Change-Id: I60a30c4bb7418e17583c66f437273fd17e9e99ba --- runsc/sandbox/sandbox.go | 2 -- 1 file changed, 2 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index b2fa1d58e..64810b4ea 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -535,8 +535,6 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common nss = append(nss, userns) setUIDGIDMappings(cmd, s.Spec) } else { - // TODO: Retrict capabilities since it's using current user - // namespace, i.e. root. 
log.Infof("Sandbox will be started in the current user namespace") } // When running in the caller's defined user namespace, apply the same -- cgit v1.2.3 From 3d3deef573a54e031cb98038b9f617f5fac31044 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 1 May 2018 22:11:07 -0700 Subject: Implement SO_TIMESTAMP PiperOrigin-RevId: 195047018 Change-Id: I6d99528a00a2125f414e1e51e067205289ec9d3d --- pkg/dhcp/client.go | 4 +- pkg/dhcp/dhcp_test.go | 2 +- pkg/dhcp/server.go | 2 +- pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/kernel/kernel.go | 9 +++ pkg/sentry/socket/BUILD | 1 + pkg/sentry/socket/control/control.go | 35 +++++++++++ pkg/sentry/socket/epsocket/epsocket.go | 69 +++++++++++++++------- pkg/sentry/socket/hostinet/socket.go | 10 ++-- pkg/sentry/socket/netlink/socket.go | 16 ++--- pkg/sentry/socket/rpcinet/socket.go | 20 +++---- pkg/sentry/socket/socket.go | 12 +++- pkg/sentry/socket/unix/unix.go | 14 ++--- pkg/sentry/strace/socket.go | 29 ++++++++- pkg/sentry/syscalls/linux/sys_socket.go | 21 ++++--- pkg/tcpip/adapters/gonet/gonet.go | 4 +- pkg/tcpip/adapters/gonet/gonet_test.go | 2 +- pkg/tcpip/network/arp/arp_test.go | 2 +- pkg/tcpip/network/ipv4/icmp_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 4 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 4 +- pkg/tcpip/stack/stack.go | 17 +++++- pkg/tcpip/stack/stack_test.go | 22 +++---- pkg/tcpip/stack/transport_test.go | 16 ++--- pkg/tcpip/tcpip.go | 48 +++++++++++++-- pkg/tcpip/transport/tcp/endpoint.go | 22 +++---- pkg/tcpip/transport/tcp/tcp_test.go | 46 +++++++-------- pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 4 +- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 37 ++++++++++-- pkg/tcpip/transport/udp/udp_test.go | 10 ++-- runsc/boot/loader.go | 7 ++- 32 files changed, 345 insertions(+), 150 deletions(-) (limited to 'runsc') diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 9a4fd7ae4..37deb69ff 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -162,7 +162,7 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) error // DHCPOFFER for { var addr tcpip.FullAddress - v, err := epin.Read(&addr) + v, _, err := epin.Read(&addr) if err == tcpip.ErrWouldBlock { select { case <-ch: @@ -216,7 +216,7 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) error // DHCPACK for { var addr tcpip.FullAddress - v, err := epin.Read(&addr) + v, _, err := epin.Read(&addr) if err == tcpip.ErrWouldBlock { select { case <-ch: diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index d56b93997..ed884fcb6 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -36,7 +36,7 @@ func TestDHCP(t *testing.T) { } }() - s := stack.New([]string{ipv4.ProtocolName}, []string{udp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{udp.ProtocolName}) const nicid tcpip.NICID = 1 if err := s.CreateNIC(nicid, id); err != nil { diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index d132d90b4..8816203a8 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -104,7 +104,7 @@ func (s *Server) reader(ctx context.Context) { for { var addr tcpip.FullAddress - v, err := s.ep.Read(&addr) + v, _, err := s.ep.Read(&addr) if err == tcpip.ErrWouldBlock { select { case <-ch: diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 80c46dcfa..9b73c5173 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -142,7 +142,7 @@ func 
TestSocketSendMsgLen0(t *testing.T) { defer sfile.DecRef() s := sfile.FileOperations.(socket.Socket) - n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, unix.ControlMessages{}) + n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, socket.ControlMessages{}) if n != 0 { t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n) } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0932965e0..25c8dd885 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -887,6 +887,15 @@ func (k *Kernel) SetExitError(err error) { } } +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (k *Kernel) NowNanoseconds() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Realtime) + if err != nil { + panic("Kernel.NowNanoseconds: " + err.Error()) + } + return now +} + // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by e. diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 87e32df37..5500a676e 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/syserr", + "//pkg/tcpip", "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index cb34cbc85..17ecdd11c 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -208,6 +208,31 @@ func putCmsg(buf []byte, msgType uint32, align uint, data []int32) []byte { return alignSlice(buf, align) } +func putCmsgStruct(buf []byte, msgType uint32, align uint, data interface{}) []byte { + if cap(buf)-len(buf) < linux.SizeOfControlMessageHeader { + return buf + } + ob := buf + + buf = putUint64(buf, uint64(linux.SizeOfControlMessageHeader)) + buf = putUint32(buf, linux.SOL_SOCKET) + buf = putUint32(buf, msgType) + + hdrBuf := buf + + buf = binary.Marshal(buf, usermem.ByteOrder, data) + + // Check if we went over. + if cap(buf) != cap(ob) { + return hdrBuf + } + + // Fix up length. + putUint64(ob, uint64(len(buf)-len(ob))) + + return alignSlice(buf, align) +} + // Credentials implements SCMCredentials.Credentials. func (c *scmCredentials) Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { // "When a process's user and group IDs are passed over a UNIX domain @@ -261,6 +286,16 @@ func alignSlice(buf []byte, align uint) []byte { return buf[:aligned] } +// PackTimestamp packs a SO_TIMESTAMP socket control message. +func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SO_TIMESTAMP, + t.Arch().Width(), + linux.NsecToTimeval(timestamp), + ) +} + // Parse parses a raw socket control message into portable objects. func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.ControlMessages, error) { var ( diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 3fc3ea58f..5701ecfac 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -109,6 +109,7 @@ type SocketOperations struct { // readMu protects access to readView, control, and sender. 
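The Kernel.NowNanoseconds method added above lets the kernel serve as the clock that stack.New now takes as its first argument (tests and samples in this change pass &tcpip.StdClock{} instead). A minimal sketch of another implementation backed by host time; it assumes only the NowNanoseconds method shown in this diff:

package main

import (
	"fmt"
	"time"
)

// hostClock reports host realtime in nanoseconds, the same contract as
// Kernel.NowNanoseconds above.
type hostClock struct{}

func (hostClock) NowNanoseconds() int64 {
	return time.Now().UnixNano()
}

func main() {
	var c hostClock
	fmt.Println("now (ns):", c.NowNanoseconds())
	// A netstack instance would then be created as, e.g.:
	//   stack.New(&c, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName})
	// mirroring the stack.New(&tcpip.StdClock{}, ...) calls in this change.
}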
readMu sync.Mutex `state:"nosave"` readView buffer.View + readCM tcpip.ControlMessages sender tcpip.FullAddress } @@ -210,12 +211,13 @@ func (s *SocketOperations) fetchReadView() *syserr.Error { s.readView = nil s.sender = tcpip.FullAddress{} - v, err := s.Endpoint.Read(&s.sender) + v, cms, err := s.Endpoint.Read(&s.sender) if err != nil { return syserr.TranslateNetstackError(err) } s.readView = v + s.readCM = cms return nil } @@ -230,7 +232,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS if dst.NumBytes() == 0 { return 0, nil } - n, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + n, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { return int64(n), syserror.ErrWouldBlock } @@ -552,6 +554,18 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, } return linux.NsecToTimeval(s.RecvTimeout()), nil + + case linux.SO_TIMESTAMP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TimestampOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil } case syscall.SOL_TCP: @@ -659,6 +673,14 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) s.SetRecvTimeout(v.ToNsecCapped()) return nil + + case linux.SO_TIMESTAMP: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TimestampOption(v))) } case syscall.SOL_TCP: @@ -823,7 +845,9 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq } // nonBlockingRead issues a non-blocking read. -func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, interface{}, uint32, *syserr.Error) { +// +// TODO: Support timestamps for stream sockets. +func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() // Fast path for regular reads from stream (e.g., TCP) endpoints. Note @@ -839,14 +863,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe s.readMu.Lock() n, err := s.coalescingRead(ctx, dst, trunc) s.readMu.Unlock() - return n, nil, 0, err + return n, nil, 0, socket.ControlMessages{}, err } s.readMu.Lock() defer s.readMu.Unlock() if err := s.fetchReadView(); err != nil { - return 0, nil, 0, err + return 0, nil, 0, socket.ControlMessages{}, err } if !isPacket && peek && trunc { @@ -854,14 +878,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // amount that could be read. 
var rql tcpip.ReceiveQueueSizeOption if err := s.Endpoint.GetSockOpt(&rql); err != nil { - return 0, nil, 0, syserr.TranslateNetstackError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } available := len(s.readView) + int(rql) bufLen := int(dst.NumBytes()) if available < bufLen { - return available, nil, 0, nil + return available, nil, 0, socket.ControlMessages{}, nil } - return bufLen, nil, 0, nil + return bufLen, nil, 0, socket.ControlMessages{}, nil } n, err := dst.CopyOut(ctx, s.readView) @@ -874,17 +898,18 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe if peek { if l := len(s.readView); trunc && l > n { // isPacket must be true. - return l, addr, addrLen, syserr.FromError(err) + return l, addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } if isPacket || err != nil { - return int(n), addr, addrLen, syserr.FromError(err) + return int(n), addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } // We need to peek beyond the first message. dst = dst.DropFirst(n) num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) { - n, err := s.Endpoint.Peek(dsts) + n, _, err := s.Endpoint.Peek(dsts) + // TODO: Handle peek timestamp. if err != nil { return int64(n), syserr.TranslateNetstackError(err).ToError() } @@ -895,7 +920,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // We got some data, so no need to return an error. err = nil } - return int(n), nil, 0, syserr.FromError(err) + return int(n), nil, 0, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } var msgLen int @@ -908,15 +933,15 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe } if trunc { - return msgLen, addr, addrLen, syserr.FromError(err) + return msgLen, addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } - return int(n), addr, addrLen, syserr.FromError(err) + return int(n), addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) { +func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 @@ -924,7 +949,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // Stream sockets ignore the sender address. 
senderRequested = false } - n, senderAddr, senderAddrLen, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return } @@ -936,25 +961,25 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags defer s.EventUnregister(&e) for { - n, senderAddr, senderAddrLen, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) if err != syserr.ErrWouldBlock { return } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { - // Reject control messages. - if !controlMessages.Empty() { +func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { + // Reject Unix control messages. + if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument } diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index defa3db2c..02fad1c60 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -57,6 +57,8 @@ type socketOperations struct { queue waiter.Queue } +var _ = socket.Socket(&socketOperations{}) + func newSocketFile(ctx context.Context, fd int, nonblock bool) (*fs.File, *syserr.Error) { s := &socketOperations{fd: fd} if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { @@ -339,14 +341,14 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME: We can't support MSG_ERRQUEUE because it uses ancillary // messages that netstack/tcpip/transport/unix doesn't understand. Kill the // Socket interface's dependence on netstack. 
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrInvalidArgument + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument } var senderAddr []byte @@ -411,11 +413,11 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } - return int(n), senderAddr, uint32(len(senderAddr)), unix.ControlMessages{}, syserr.FromError(err) + return int(n), senderAddr, uint32(len(senderAddr)), socket.ControlMessages{}, syserr.FromError(err) } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 2d0e59ceb..0b8f528d0 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -305,7 +305,7 @@ func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error } // RecvMsg implements socket.Socket.RecvMsg. -func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { +func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { from := linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: 0, @@ -323,7 +323,7 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, unix.ControlMessages{}, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } // We'll have to block. Register for notification and keep trying to @@ -337,14 +337,14 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, unix.ControlMessages{}, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } @@ -459,7 +459,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error } // sendMsg is the core of message send, used for SendMsg and Write. 
-func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { dstPort := int32(0) if len(to) != 0 { @@ -506,12 +506,12 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, } // SendMsg implements socket.Socket.SendMsg. -func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } // Write implements fs.FileOperations.Write. func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { - n, err := s.sendMsg(ctx, src, nil, 0, unix.ControlMessages{}) + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 574d99ba5..15047df01 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -402,7 +402,7 @@ func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResp } // RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ Fd: s.fd, Length: uint32(dst.NumBytes()), @@ -414,10 +414,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags res, err := rpcRecvMsg(t, req) if err == nil { n, e := dst.CopyOut(t, res.Data) - return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { - return 0, nil, 0, unix.ControlMessages{}, err + return 0, nil, 0, socket.ControlMessages{}, err } // We'll have to block. 
Register for notifications and keep trying to @@ -430,17 +430,17 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags res, err := rpcRecvMsg(t, req) if err == nil { n, e := dst.CopyOut(t, res.Data) - return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock { - return 0, nil, 0, unix.ControlMessages{}, err + return 0, nil, 0, socket.ControlMessages{}, err } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } @@ -459,14 +459,14 @@ func rpcSendMsg(t *kernel.Task, req *pb.SyscallRequest_Sendmsg) (uint32, *syserr } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument } - // Reject control messages. - if !controlMessages.Empty() { + // Reject Unix control messages. + if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index be3026bfa..bd4858a34 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -31,9 +31,17 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) +// ControlMessages represents the union of unix control messages and tcpip +// control messages. +type ControlMessages struct { + Unix unix.ControlMessages + IP tcpip.ControlMessages +} + // Socket is the interface containing socket syscalls used by the syscall layer // to redirect them to the appropriate implementation. type Socket interface { @@ -78,11 +86,11 @@ type Socket interface { // // senderAddrLen is the address length to be returned to the application, // not necessarily the actual length of the address. - RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) + RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) // SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take // ownership of the ControlMessage on error. 
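The new socket.ControlMessages type above simply pairs the existing unix control messages with the netstack ones so both can travel through RecvMsg/SendMsg together. A sketch of constructing and inspecting one, written against the field names as they appear in this diff (the timestamp value is illustrative):

package main

import (
	"fmt"
	"time"

	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
)

func main() {
	cms := socket.ControlMessages{
		Unix: unix.ControlMessages{},
		IP: tcpip.ControlMessages{
			HasTimestamp: true,
			Timestamp:    time.Now().UnixNano(),
		},
	}
	if cms.IP.HasTimestamp {
		fmt.Println("timestamp (ns):", cms.IP.Timestamp)
	}
	// Callers remain responsible for releasing any unix rights/credentials.
	cms.Unix.Release()
}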
- SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (n int, err *syserr.Error) + SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages ControlMessages) (n int, err *syserr.Error) // SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means // no timeout. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index a4b414851..f83156c8e 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -358,10 +358,10 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a unix.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Endpoint: s.ep, - Control: controlMessages, + Control: controlMessages.Unix, To: nil, } if len(to) > 0 { @@ -452,7 +452,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a unix.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) { +func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 @@ -490,7 +490,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, r.Control, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } // We'll have to block. 
Register for notification and keep trying to @@ -509,14 +509,14 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, r.Control, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 48c072e96..1a2e8573e 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -440,6 +440,7 @@ var SocketProtocol = map[int32]abi.ValueSet{ var controlMessageType = map[int32]string{ linux.SCM_RIGHTS: "SCM_RIGHTS", linux.SCM_CREDENTIALS: "SCM_CREDENTIALS", + linux.SO_TIMESTAMP: "SO_TIMESTAMP", } func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) string { @@ -477,7 +478,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) typ = fmt.Sprint(h.Type) } - if h.Length > uint64(len(buf)-i) { + if h.Length > uint64(len(buf)-i+linux.SizeOfControlMessageHeader) { strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content extends beyond buffer}", level, @@ -546,6 +547,32 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) i += control.AlignUp(length, width) + case linux.SO_TIMESTAMP: + if length < linux.SizeOfTimeval { + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, content too short}", + level, + typ, + h.Length, + )) + i += control.AlignUp(length, width) + break + } + + var tv linux.Timeval + binary.Unmarshal(buf[i:i+linux.SizeOfTimeval], usermem.ByteOrder, &tv) + + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, Sec: %d, Usec: %d}", + level, + typ, + h.Length, + tv.Sec, + tv.Usec, + )) + + i += control.AlignUp(length, width) + default: panic("unreachable") } diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 70c618398..6258a1539 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -731,10 +731,11 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // Fast path when no control message nor name buffers are provided. 
if msg.ControlLen == 0 && msg.NameLen == 0 { - n, _, _, _, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) + n, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) } + cms.Unix.Release() return uintptr(n), nil } @@ -745,17 +746,21 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Unix.Release() controlData := make([]byte, 0, msg.ControlLen) if cr, ok := s.(unix.Credentialer); ok && cr.Passcred() { - creds, _ := cms.Credentials.(control.SCMCredentials) + creds, _ := cms.Unix.Credentials.(control.SCMCredentials) controlData = control.PackCredentials(t, creds, controlData) } - if cms.Rights != nil { - controlData = control.PackRights(t, cms.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData) + if cms.IP.HasTimestamp { + controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData) + } + + if cms.Unix.Rights != nil { + controlData = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData) } // Copy the address to the caller. @@ -823,7 +828,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } n, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Unix.Release() if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -997,7 +1002,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme } // Call the syscall implementation. - n, e := s.SendMsg(t, src, to, int(flags), controlMessages) + n, e := s.SendMsg(t, src, to, int(flags), socket.ControlMessages{Unix: controlMessages}) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { controlMessages.Release() @@ -1048,7 +1053,7 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla } // Call the syscall implementation. - n, e := s.SendMsg(t, src, to, int(flags), control.New(t, s, nil)) + n, e := s.SendMsg(t, src, to, int(flags), socket.ControlMessages{Unix: control.New(t, s, nil)}) return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) } diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 96a2d670d..5aa6b1aa2 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -268,7 +268,7 @@ type opErrorer interface { // commonRead implements the common logic between net.Conn.Read and // net.PacketConn.ReadFrom. func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, addr *tcpip.FullAddress, errorer opErrorer) ([]byte, error) { - read, err := ep.Read(addr) + read, _, err := ep.Read(addr) if err == tcpip.ErrWouldBlock { // Create wait queue entry that notifies a channel. 
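[Editor's sketch] The hunks above stop passing a bare unix.ControlMessages through the syscall layer and instead wrap it in a socket.ControlMessages value that also carries IP-level control data (cms.Unix and cms.IP in the diff). The stand-alone Go sketch below only illustrates that shape and the receive-path flow implied by the hunks — release the Unix part, emit a timestamp control message if one is present; the stub types are invented for the illustration and are not the real gVisor definitions:

package main

import "fmt"

// unixControlMessages stands in for the transport-level messages that can
// hold resources such as donated file descriptors (SCM_RIGHTS) or
// credentials (SCM_CREDENTIALS).
type unixControlMessages struct{ Rights, Credentials interface{} }

// Release drops any resources held by the messages.
func (c *unixControlMessages) Release() { c.Rights, c.Credentials = nil, nil }

// ipControlMessages mirrors the new tcpip.ControlMessages: an optional
// receive timestamp in nanoseconds.
type ipControlMessages struct {
	HasTimestamp bool
	Timestamp    int64
}

// controlMessages mirrors the socket.ControlMessages wrapper used in the
// diff: one value grouping both kinds of control data.
type controlMessages struct {
	Unix unixControlMessages
	IP   ipControlMessages
}

func main() {
	cms := controlMessages{IP: ipControlMessages{HasTimestamp: true, Timestamp: 1234}}
	// The recvmsg path always releases the Unix part (it may hold FDs) and,
	// when a timestamp is present, packs an SO_TIMESTAMP control message.
	defer cms.Unix.Release()
	if cms.IP.HasTimestamp {
		fmt.Println("SO_TIMESTAMP, ns:", cms.IP.Timestamp)
	}
}
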
@@ -276,7 +276,7 @@ func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, a wq.EventRegister(&waitEntry, waiter.EventIn) defer wq.EventUnregister(&waitEntry) for { - read, err = ep.Read(addr) + read, _, err = ep.Read(addr) if err != tcpip.ErrWouldBlock { break } diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 2f86469eb..e3d0c6c84 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -47,7 +47,7 @@ func TestTimeouts(t *testing.T) { func newLoopbackStack() (*stack.Stack, *tcpip.Error) { // Create the stack and add a NIC. - s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName, udp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName, udp.ProtocolName}) if err := s.CreateNIC(NICID, loopback.New()); err != nil { return nil, err diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 91ffdce4b..47b10e64e 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -32,7 +32,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, []string{ipv4.PingProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, arp.ProtocolName}, []string{ipv4.PingProtocolName}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) diff --git a/pkg/tcpip/network/ipv4/icmp_test.go b/pkg/tcpip/network/ipv4/icmp_test.go index 378fba74b..c55aa1835 100644 --- a/pkg/tcpip/network/ipv4/icmp_test.go +++ b/pkg/tcpip/network/ipv4/icmp_test.go @@ -26,7 +26,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New([]string{ipv4.ProtocolName}, []string{ipv4.PingProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{ipv4.PingProtocolName}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, "") diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 332929c85..ef5c7ec60 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -113,7 +113,7 @@ func main() { // Create the stack with ipv4 and tcp protocols, then add a tun-based // NIC and ipv4 address. - s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) mtu, err := rawfile.GetMTU(tunName) if err != nil { @@ -183,7 +183,7 @@ func main() { // connection from its side. wq.EventRegister(&waitEntry, waiter.EventIn) for { - v, err := ep.Read(nil) + v, _, err := ep.Read(nil) if err != nil { if err == tcpip.ErrClosedForReceive { break diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 10cd701af..8c166f643 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -42,7 +42,7 @@ func echo(wq *waiter.Queue, ep tcpip.Endpoint) { defer wq.EventUnregister(&waitEntry) for { - v, err := ep.Read(nil) + v, _, err := ep.Read(nil) if err != nil { if err == tcpip.ErrWouldBlock { <-notifyCh @@ -99,7 +99,7 @@ func main() { // Create the stack with ip and tcp protocols, then add a tun-based // NIC and address. 
- s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}, []string{tcp.ProtocolName}) mtu, err := rawfile.GetMTU(tunName) if err != nil { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 558ecdb72..b480bf812 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -270,6 +270,9 @@ type Stack struct { // If not nil, then any new endpoints will have this probe function // invoked everytime they receive a TCP segment. tcpProbeFunc TCPProbeFunc + + // clock is used to generate user-visible times. + clock tcpip.Clock } // New allocates a new networking stack with only the requested networking and @@ -279,7 +282,7 @@ type Stack struct { // SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the // stack. Please refer to individual protocol implementations as to what options // are supported. -func New(network []string, transport []string) *Stack { +func New(clock tcpip.Clock, network []string, transport []string) *Stack { s := &Stack{ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), @@ -287,6 +290,7 @@ func New(network []string, transport []string) *Stack { nics: make(map[tcpip.NICID]*NIC), linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts), PortManager: ports.NewPortManager(), + clock: clock, } // Add specified network protocols. @@ -388,6 +392,11 @@ func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h f } } +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (s *Stack) NowNanoseconds() int64 { + return s.clock.NowNanoseconds() +} + // Stats returns a snapshot of the current stats. // // NOTE: The underlying stats are updated using atomic instructions as a result @@ -474,6 +483,12 @@ func (s *Stack) CreateDisabledNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) * return s.createNIC(id, "", linkEP, false) } +// CreateDisabledNamedNIC is a combination of CreateNamedNIC and +// CreateDisabledNIC. +func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { + return s.createNIC(id, name, linkEP, false) +} + // EnableNIC enables the given NIC so that the link-layer endpoint can start // delivering packets to it. func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error { diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index b416065d7..ea7dccdc2 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -176,7 +176,7 @@ func TestNetworkReceive(t *testing.T) { // Create a stack with the fake network protocol, one nic, and two // addresses attached to it: 1 & 2. id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -270,7 +270,7 @@ func TestNetworkSend(t *testing.T) { // address: 1. The route table sends all packets through the only // existing nic. 
id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("NewNIC failed: %v", err) } @@ -292,7 +292,7 @@ func TestNetworkSendMultiRoute(t *testing.T) { // Create a stack with the fake network protocol, two nics, and two // addresses per nic, the first nic has odd address, the second one has // even addresses. - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id1, linkEP1 := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id1); err != nil { @@ -371,7 +371,7 @@ func TestRoutes(t *testing.T) { // Create a stack with the fake network protocol, two nics, and two // addresses per nic, the first nic has odd address, the second one has // even addresses. - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id1, _ := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id1); err != nil { @@ -435,7 +435,7 @@ func TestRoutes(t *testing.T) { } func TestAddressRemoval(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -479,7 +479,7 @@ func TestAddressRemoval(t *testing.T) { } func TestDelayedRemovalDueToRoute(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -547,7 +547,7 @@ func TestDelayedRemovalDueToRoute(t *testing.T) { } func TestPromiscuousMode(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -607,7 +607,7 @@ func TestAddressSpoofing(t *testing.T) { srcAddr := tcpip.Address("\x01") dstAddr := tcpip.Address("\x02") - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, _ := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -648,7 +648,7 @@ func TestAddressSpoofing(t *testing.T) { // Set the subnet, then check that packet is delivered. func TestSubnetAcceptsMatchingPacket(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -682,7 +682,7 @@ func TestSubnetAcceptsMatchingPacket(t *testing.T) { // Set destination outside the subnet, then check it doesn't get delivered. func TestSubnetRejectsNonmatchingPacket(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -714,7 +714,7 @@ func TestSubnetRejectsNonmatchingPacket(t *testing.T) { } func TestNetworkOptions(t *testing.T) { - s := stack.New([]string{"fakeNet"}, []string{}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{}) // Try an unsupported network protocol. 
if err := s.SetNetworkProtocolOption(tcpip.NetworkProtocolNumber(99999), fakeNetGoodOption(false)); err != tcpip.ErrUnknownProtocol { diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 7e072e96e..b870ab375 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -46,8 +46,8 @@ func (*fakeTransportEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask return mask } -func (*fakeTransportEndpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { - return buffer.View{}, nil +func (*fakeTransportEndpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + return buffer.View{}, tcpip.ControlMessages{}, nil } func (f *fakeTransportEndpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { @@ -67,8 +67,8 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) return uintptr(len(v)), nil } -func (f *fakeTransportEndpoint) Peek([][]byte) (uintptr, *tcpip.Error) { - return 0, nil +func (f *fakeTransportEndpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil } // SetSockOpt sets a socket option. Currently not supported. @@ -210,7 +210,7 @@ func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error { func TestTransportReceive(t *testing.T) { id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -270,7 +270,7 @@ func TestTransportReceive(t *testing.T) { func TestTransportControlReceive(t *testing.T) { id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -336,7 +336,7 @@ func TestTransportControlReceive(t *testing.T) { func TestTransportSend(t *testing.T) { id, _ := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -373,7 +373,7 @@ func TestTransportSend(t *testing.T) { } func TestTransportOptions(t *testing.T) { - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) // Try an unsupported transport protocol. if err := s.SetTransportProtocolOption(tcpip.TransportProtocolNumber(99999), fakeTransportGoodOption(false)); err != tcpip.ErrUnknownProtocol { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index f3a94f353..f9df1d989 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -23,6 +23,7 @@ import ( "fmt" "strconv" "strings" + "time" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -80,6 +81,24 @@ var ( errSubnetAddressMasked = errors.New("subnet address has bits set outside the mask") ) +// A Clock provides the current time. +// +// Times returned by a Clock should always be used for application-visible +// time, but never for netstack internal timekeeping. +type Clock interface { + // NowNanoseconds returns the current real time as a number of + // nanoseconds since some epoch. 
+ NowNanoseconds() int64 +} + +// StdClock implements Clock with the time package. +type StdClock struct{} + +// NowNanoseconds implements Clock.NowNanoseconds. +func (*StdClock) NowNanoseconds() int64 { + return time.Now().UnixNano() +} + // Address is a byte slice cast as a string that represents the address of a // network node. Or, in the case of unix endpoints, it may represent a path. type Address string @@ -210,6 +229,16 @@ func (s SlicePayload) Size() int { return len(s) } +// A ControlMessages contains socket control messages for IP sockets. +type ControlMessages struct { + // HasTimestamp indicates whether Timestamp is valid/set. + HasTimestamp bool + + // Timestamp is the time (in ns) that the last packed used to create + // the read data was received. + Timestamp int64 +} + // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) // that exposes functionality like read, write, connect, etc. to users of the // networking stack. @@ -219,9 +248,13 @@ type Endpoint interface { Close() // Read reads data from the endpoint and optionally returns the sender. - // This method does not block if there is no data pending. - // It will also either return an error or data, never both. - Read(*FullAddress) (buffer.View, *Error) + // + // This method does not block if there is no data pending. It will also + // either return an error or data, never both. + // + // A timestamp (in ns) is optionally returned. A zero value indicates + // that no timestamp was available. + Read(*FullAddress) (buffer.View, ControlMessages, *Error) // Write writes data to the endpoint's peer. This method does not block if // the data cannot be written. @@ -238,7 +271,10 @@ type Endpoint interface { // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. - Peek([][]byte) (uintptr, *Error) + // + // A timestamp (in ns) is optionally returned. A zero value indicates + // that no timestamp was available. + Peek([][]byte) (uintptr, ControlMessages, *Error) // Connect connects the endpoint to its peer. Specifying a NIC is // optional. @@ -347,6 +383,10 @@ type ReuseAddressOption int // Only supported on Unix sockets. type PasscredOption int +// TimestampOption is used by SetSockOpt/GetSockOpt to specify whether +// SO_TIMESTAMP socket control messages are enabled. +type TimestampOption int + // TCPInfoOption is used by GetSockOpt to expose TCP statistics. // // TODO: Add and populate stat fields. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 5d62589d8..d84171b0c 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -374,7 +374,7 @@ func (e *endpoint) cleanup() { } // Read reads data from the endpoint. -func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { +func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.mu.RLock() // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. 
Also note that a RST being received @@ -383,9 +383,9 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { if s := e.state; s != stateConnected && s != stateClosed && e.rcvBufUsed == 0 { e.mu.RUnlock() if s == stateError { - return buffer.View{}, e.hardError + return buffer.View{}, tcpip.ControlMessages{}, e.hardError } - return buffer.View{}, tcpip.ErrInvalidEndpointState + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState } e.rcvListMu.Lock() @@ -394,7 +394,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { e.mu.RUnlock() - return v, err + return v, tcpip.ControlMessages{}, err } func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { @@ -498,7 +498,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. -func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { +func (e *endpoint) Peek(vec [][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() @@ -506,9 +506,9 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { // but has some pending unread data. if s := e.state; s != stateConnected && s != stateClosed { if s == stateError { - return 0, e.hardError + return 0, tcpip.ControlMessages{}, e.hardError } - return 0, tcpip.ErrInvalidEndpointState + return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState } e.rcvListMu.Lock() @@ -516,9 +516,9 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { if e.rcvBufUsed == 0 { if e.rcvClosed || e.state != stateConnected { - return 0, tcpip.ErrClosedForReceive + return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive } - return 0, tcpip.ErrWouldBlock + return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock } // Make a copy of vec so we can modify the slide headers. @@ -534,7 +534,7 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { for len(v) > 0 { if len(vec) == 0 { - return num, nil + return num, tcpip.ControlMessages{}, nil } if len(vec[0]) == 0 { vec = vec[1:] @@ -549,7 +549,7 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { } } - return num, nil + return num, tcpip.ControlMessages{}, nil } // zeroReceiveWindow checks if the receive window to be announced now would be diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 118d861ba..3c21a1ec3 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -147,7 +147,7 @@ func TestSimpleReceive(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -169,7 +169,7 @@ func TestSimpleReceive(t *testing.T) { } // Receive data. - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -199,7 +199,7 @@ func TestOutOfOrderReceive(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -226,7 +226,7 @@ func TestOutOfOrderReceive(t *testing.T) { // Wait 200ms and check that no data has been received. 
time.Sleep(200 * time.Millisecond) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -243,7 +243,7 @@ func TestOutOfOrderReceive(t *testing.T) { // Receive data. read := make([]byte, 0, 6) for len(read) < len(data) { - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { if err == tcpip.ErrWouldBlock { // Wait for receive to be notified. @@ -284,7 +284,7 @@ func TestOutOfOrderFlood(t *testing.T) { opt := tcpip.ReceiveBufferSizeOption(10) c.CreateConnected(789, 30000, &opt) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -361,7 +361,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -414,7 +414,7 @@ func TestFullWindowReceive(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - _, err := c.EP.Read(nil) + _, _, err := c.EP.Read(nil) if err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -449,7 +449,7 @@ func TestFullWindowReceive(t *testing.T) { ) // Receive data and check it. - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -487,7 +487,7 @@ func TestNoWindowShrinking(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - _, err := c.EP.Read(nil) + _, _, err := c.EP.Read(nil) if err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -551,7 +551,7 @@ func TestNoWindowShrinking(t *testing.T) { // Receive data and check it. read := make([]byte, 0, 10) for len(read) < len(data) { - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -954,7 +954,7 @@ func TestZeroScaledWindowReceive(t *testing.T) { } // Read some data. An ack should be sent in response to that. - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -1337,7 +1337,7 @@ func TestReceiveOnResetConnection(t *testing.T) { loop: for { - switch _, err := c.EP.Read(nil); err { + switch _, _, err := c.EP.Read(nil); err { case nil: t.Fatalf("Unexpected success.") case tcpip.ErrWouldBlock: @@ -2293,7 +2293,7 @@ func TestReadAfterClosedState(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -2345,7 +2345,7 @@ func TestReadAfterClosedState(t *testing.T) { // Check that peek works. peekBuf := make([]byte, 10) - n, err := c.EP.Peek([][]byte{peekBuf}) + n, _, err := c.EP.Peek([][]byte{peekBuf}) if err != nil { t.Fatalf("Unexpected error from Peek: %v", err) } @@ -2356,7 +2356,7 @@ func TestReadAfterClosedState(t *testing.T) { } // Receive data. 
- v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -2367,11 +2367,11 @@ func TestReadAfterClosedState(t *testing.T) { // Now that we drained the queue, check that functions fail with the // right error code. - if _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive { t.Fatalf("Unexpected return from Read: got %v, want %v", err, tcpip.ErrClosedForReceive) } - if _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive { + if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive { t.Fatalf("Unexpected return from Peek: got %v, want %v", err, tcpip.ErrClosedForReceive) } } @@ -2479,7 +2479,7 @@ func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { } func TestDefaultBufferSizes(t *testing.T) { - s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) @@ -2525,7 +2525,7 @@ func TestDefaultBufferSizes(t *testing.T) { } func TestMinMaxBufferSizes(t *testing.T) { - s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) @@ -2575,7 +2575,7 @@ func TestSelfConnect(t *testing.T) { // it checks that if an endpoint binds to say 127.0.0.1:1000 then // connects to 127.0.0.1:1000, then it will be connected to itself, and // is able to send and receive data through the same endpoint. - s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) id := loopback.New() if testing.Verbose() { @@ -2637,13 +2637,13 @@ func TestSelfConnect(t *testing.T) { // Read back what was written. wq.EventUnregister(&waitEntry) wq.EventRegister(&waitEntry, waiter.EventIn) - rd, err := ep.Read(nil) + rd, _, err := ep.Read(nil) if err != nil { if err != tcpip.ErrWouldBlock { t.Fatalf("Read failed: %v", err) } <-notifyCh - rd, err = ep.Read(nil) + rd, _, err = ep.Read(nil) if err != nil { t.Fatalf("Read failed: %v", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index d12081bb7..335262e43 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -95,7 +95,7 @@ func TestTimeStampEnabledConnect(t *testing.T) { // There should be 5 views to read and each of them should // contain the same data. for i := 0; i < 5; i++ { - got, err := c.EP.Read(nil) + got, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -296,7 +296,7 @@ func TestSegmentDropWhenTimestampMissing(t *testing.T) { } // Issue a read and we should data. 
- got, err := c.EP.Read(nil) + got, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 6a402d150..eb928553f 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -129,7 +129,7 @@ type Context struct { // New allocates and initializes a test context containing a new // stack and a link-layer endpoint. func New(t *testing.T, mtu uint32) *Context { - s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName}) // Allow minimum send/receive buffer sizes to be 1 during tests. if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{1, tcp.DefaultBufferSize, tcp.DefaultBufferSize * 10}); err != nil { diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 80fa88c4c..f86fc6d5a 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -19,6 +19,8 @@ type udpPacket struct { udpPacketEntry senderAddress tcpip.FullAddress data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + timestamp int64 + hasTimestamp bool // views is used as buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View `state:"nosave"` @@ -52,6 +54,7 @@ type endpoint struct { rcvBufSizeMax int `state:".(int)"` rcvBufSize int rcvClosed bool + rcvTimestamp bool // The following fields are protected by the mu mutex. mu sync.RWMutex `state:"nosave"` @@ -134,7 +137,7 @@ func (e *endpoint) Close() { // Read reads data from the endpoint. This method does not block if // there is no data pending. -func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, *tcpip.Error) { +func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.rcvMu.Lock() if e.rcvList.Empty() { @@ -143,12 +146,13 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, *tcpip.Error) { err = tcpip.ErrClosedForReceive } e.rcvMu.Unlock() - return buffer.View{}, err + return buffer.View{}, tcpip.ControlMessages{}, err } p := e.rcvList.Front() e.rcvList.Remove(p) e.rcvBufSize -= p.data.Size() + ts := e.rcvTimestamp e.rcvMu.Unlock() @@ -156,7 +160,12 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, *tcpip.Error) { *addr = p.senderAddress } - return p.data.ToView(), nil + if ts && !p.hasTimestamp { + // Linux uses the current time. + p.timestamp = e.stack.NowNanoseconds() + } + + return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: ts, Timestamp: p.timestamp}, nil } // prepareForWrite prepares the endpoint for sending data. In particular, it @@ -299,8 +308,8 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc } // Peek only returns data from a single datagram, so do nothing here. -func (e *endpoint) Peek([][]byte) (uintptr, *tcpip.Error) { - return 0, nil +func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil } // SetSockOpt sets a socket option. Currently not supported. 
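[Editor's sketch] Taken together, the UDP endpoint changes above and the SetSockOpt/GetSockOpt hunk below let a netstack user opt into receive timestamps. A minimal usage sketch, assuming NIC, address, and bind/connect setup happen elsewhere and eliding most error handling; only APIs visible in this patch (stack.New with a Clock, NewEndpoint, tcpip.TimestampOption, the three-value Read) are used:

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

func main() {
	// stack.New now takes a tcpip.Clock as its first argument.
	s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{udp.ProtocolName})

	var wq waiter.Queue
	ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
	if err != nil {
		panic(err)
	}
	defer ep.Close()

	// Enable SO_TIMESTAMP-style receive timestamps on the endpoint.
	if err := ep.SetSockOpt(tcpip.TimestampOption(1)); err != nil {
		panic(err)
	}

	// (NIC/address setup and Bind/Connect omitted.) Read now returns the
	// payload together with ControlMessages carrying the timestamp.
	var sender tcpip.FullAddress
	if v, cms, err := ep.Read(&sender); err == nil && cms.HasTimestamp {
		fmt.Printf("read %d bytes, received at %d ns\n", len(v), cms.Timestamp)
	}
}
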
@@ -322,6 +331,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } e.v6only = v != 0 + + case tcpip.TimestampOption: + e.rcvMu.Lock() + e.rcvTimestamp = v != 0 + e.rcvMu.Unlock() } return nil } @@ -370,6 +384,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } e.rcvMu.Unlock() return nil + + case *tcpip.TimestampOption: + e.rcvMu.Lock() + *o = 0 + if e.rcvTimestamp { + *o = 1 + } + e.rcvMu.Unlock() } return tcpip.ErrUnknownProtocolOption @@ -733,6 +755,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv e.rcvList.PushBack(pkt) e.rcvBufSize += vv.Size() + if e.rcvTimestamp { + pkt.timestamp = e.stack.NowNanoseconds() + pkt.hasTimestamp = true + } + e.rcvMu.Unlock() // Notify any waiters that there's data to be read now. diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 65c567952..1eb9ecb80 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -56,7 +56,7 @@ type headers struct { } func newDualTestContext(t *testing.T, mtu uint32) *testContext { - s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{udp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{udp.ProtocolName}) id, linkEP := channel.New(256, mtu, "") if testing.Verbose() { @@ -260,12 +260,12 @@ func testV4Read(c *testContext) { defer c.wq.EventUnregister(&we) var addr tcpip.FullAddress - v, err := c.ep.Read(&addr) + v, _, err := c.ep.Read(&addr) if err == tcpip.ErrWouldBlock { // Wait for data to become available. select { case <-ch: - v, err = c.ep.Read(&addr) + v, _, err = c.ep.Read(&addr) if err != nil { c.t.Fatalf("Read failed: %v", err) } @@ -355,12 +355,12 @@ func TestV6ReadOnV6(t *testing.T) { defer c.wq.EventUnregister(&we) var addr tcpip.FullAddress - v, err := c.ep.Read(&addr) + v, _, err := c.ep.Read(&addr) if err == tcpip.ErrWouldBlock { // Wait for data to become available. select { case <-ch: - v, err = c.ep.Read(&addr) + v, _, err = c.ep.Read(&addr) if err != nil { c.t.Fatalf("Read failed: %v", err) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a470cb054..d63a9028e 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -37,6 +37,7 @@ import ( slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" @@ -177,7 +178,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack := newEmptyNetworkStack(conf) + networkStack := newEmptyNetworkStack(conf, k) // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. 
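[Editor's sketch] The loader change above passes the Kernel object (which must satisfy tcpip.Clock for this to compile) into newEmptyNetworkStack, and the hunk below adjusts that function's signature to thread the clock into stack.New. As a small, hedged illustration of what satisfying the interface takes, here is an invented fixedClock type alongside the real tcpip.StdClock; only tcpip.Clock, NowNanoseconds, and StdClock come from the patch:

package main

import (
	"fmt"
	"time"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
)

// fixedClock is an example type invented here: a clock frozen at one
// instant, handy for deterministic tests of timestamp behavior.
type fixedClock struct{ at time.Time }

// NowNanoseconds implements tcpip.Clock.NowNanoseconds.
func (c *fixedClock) NowNanoseconds() int64 { return c.at.UnixNano() }

func main() {
	// Any type with NowNanoseconds() int64 can be handed to stack.New.
	var clk tcpip.Clock = &fixedClock{at: time.Unix(0, 42)}
	fmt.Println(clk.NowNanoseconds()) // 42

	// Outside of tests, the wall-clock implementation added by this patch
	// is the natural choice.
	clk = &tcpip.StdClock{}
	fmt.Println(clk.NowNanoseconds() > 0) // true
}
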
@@ -337,7 +338,7 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config) inet.Stack { +func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) inet.Stack { switch conf.Network { case NetworkHost: return hostinet.NewStack() @@ -346,7 +347,7 @@ func newEmptyNetworkStack(conf *Config) inet.Stack { // NetworkNone sets up loopback using netstack. netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName} - return &epsocket.Stack{stack.New(netProtos, protoNames)} + return &epsocket.Stack{stack.New(clock, netProtos, protoNames)} default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) -- cgit v1.2.3 From eb5414ee29f20b1805345820e6174afff84276c2 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 1 May 2018 22:50:55 -0700 Subject: Add support for ping sockets PiperOrigin-RevId: 195049322 Change-Id: I09f6dd58cf10a2e50e53d17d2823d540102913c5 --- pkg/sentry/socket/epsocket/BUILD | 1 + pkg/sentry/socket/epsocket/provider.go | 19 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/tcpip/network/arp/BUILD | 1 + pkg/tcpip/network/arp/arp_test.go | 3 +- pkg/tcpip/network/ipv4/BUILD | 17 +- pkg/tcpip/network/ipv4/icmp.go | 190 +-------- pkg/tcpip/network/ipv4/icmp_test.go | 124 ------ pkg/tcpip/transport/ping/BUILD | 50 +++ pkg/tcpip/transport/ping/endpoint.go | 665 +++++++++++++++++++++++++++++ pkg/tcpip/transport/ping/endpoint_state.go | 61 +++ pkg/tcpip/transport/ping/protocol.go | 106 +++++ pkg/tcpip/transport/udp/endpoint_state.go | 2 +- runsc/boot/BUILD | 1 + runsc/boot/loader.go | 3 +- 15 files changed, 914 insertions(+), 331 deletions(-) delete mode 100644 pkg/tcpip/network/ipv4/icmp_test.go create mode 100644 pkg/tcpip/transport/ping/BUILD create mode 100644 pkg/tcpip/transport/ping/endpoint.go create mode 100644 pkg/tcpip/transport/ping/endpoint_state.go create mode 100644 pkg/tcpip/transport/ping/protocol.go (limited to 'runsc') diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 0e463a92a..8430886cb 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 5616435b3..6c1e3b6b9 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -23,6 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" @@ -37,8 +38,8 @@ type provider struct { netProto tcpip.NetworkProtocolNumber } -// GetTransportProtocol figures out transport protocol. Currently only TCP and -// UDP are supported. +// GetTransportProtocol figures out transport protocol. Currently only TCP, +// UDP, and ICMP are supported. 
func GetTransportProtocol(stype unix.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { switch stype { case linux.SOCK_STREAM: @@ -48,14 +49,16 @@ func GetTransportProtocol(stype unix.SockType, protocol int) (tcpip.TransportPro return tcp.ProtocolNumber, nil case linux.SOCK_DGRAM: - if protocol != 0 && protocol != syscall.IPPROTO_UDP { - return 0, syserr.ErrInvalidArgument + switch protocol { + case 0, syscall.IPPROTO_UDP: + return udp.ProtocolNumber, nil + case syscall.IPPROTO_ICMP: + return header.ICMPv4ProtocolNumber, nil + case syscall.IPPROTO_ICMPV6: + return header.ICMPv6ProtocolNumber, nil } - return udp.ProtocolNumber, nil - - default: - return 0, syserr.ErrInvalidArgument } + return 0, syserr.ErrInvalidArgument } // Socket creates a new socket object for the AF_INET or AF_INET6 family. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 6258a1539..d6d5dba8a 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -791,7 +791,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CONFIRM) != 0 { return 0, syscall.EINVAL } diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index e6d0899a9..58d174965 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -30,5 +30,6 @@ go_test( "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/ping", ], ) diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 47b10e64e..6d61ff1d7 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -16,6 +16,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping" ) const ( @@ -32,7 +33,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, arp.ProtocolName}, []string{ipv4.PingProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, arp.ProtocolName}, []string{ping.ProtocolName4}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 9df113df1..02d55355c 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # BSD -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "ipv4", @@ -19,20 +19,5 @@ go_library( "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", - "//pkg/waiter", - ], -) - -go_test( - name = "ipv4_test", - size = "small", - srcs = ["icmp_test.go"], - deps = [ - ":ipv4", - "//pkg/tcpip", - "//pkg/tcpip/buffer", - "//pkg/tcpip/link/channel", - "//pkg/tcpip/link/sniffer", - "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index ffd761350..3c382fdc2 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -5,26 +5,14 @@ 
package ipv4 import ( - "context" "encoding/binary" - "time" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// PingProtocolName is a pseudo transport protocol used to handle ping replies. -// Use it when constructing a stack that intends to use ipv4.Ping. -const PingProtocolName = "icmpv4ping" - -// pingProtocolNumber is a fake transport protocol used to -// deliver incoming ICMP echo replies. The ICMP identifier -// number is used as a port number for multiplexing. -const pingProtocolNumber tcpip.TransportProtocolNumber = 256 + 11 - // handleControl handles the case when an ICMP packet contains the headers of // the original packet that caused the ICMP one to be sent. This information is // used to find out which transport endpoint must be notified about the ICMP @@ -78,7 +66,10 @@ func (e *endpoint) handleICMP(r *stack.Route, vv *buffer.VectorisedView) { } case header.ICMPv4EchoReply: - e.dispatcher.DeliverTransportPacket(r, pingProtocolNumber, vv) + if len(v) < header.ICMPv4EchoMinimumSize { + return + } + e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, vv) case header.ICMPv4DstUnreachable: if len(v) < header.ICMPv4DstUnreachableMinimumSize { @@ -104,179 +95,20 @@ type echoRequest struct { func (e *endpoint) echoReplier() { for req := range e.echoRequests { - sendICMPv4(&req.r, header.ICMPv4EchoReply, 0, req.v) + sendPing4(&req.r, 0, req.v) req.r.Release() } } -func sendICMPv4(r *stack.Route, typ header.ICMPv4Type, code byte, data buffer.View) *tcpip.Error { - hdr := buffer.NewPrependable(header.ICMPv4MinimumSize + int(r.MaxHeaderLength())) +func sendPing4(r *stack.Route, code byte, data buffer.View) *tcpip.Error { + hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) - icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) - icmpv4.SetType(typ) + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + icmpv4.SetType(header.ICMPv4EchoReply) icmpv4.SetCode(code) + copy(icmpv4[header.ICMPv4MinimumSize:], data) + data = data[header.ICMPv4EchoMinimumSize-header.ICMPv4MinimumSize:] icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) return r.WritePacket(&hdr, data, header.ICMPv4ProtocolNumber) } - -// A Pinger can send echo requests to an address. -type Pinger struct { - Stack *stack.Stack - NICID tcpip.NICID - Addr tcpip.Address - LocalAddr tcpip.Address // optional - Wait time.Duration // if zero, defaults to 1 second - Count uint16 // if zero, defaults to MaxUint16 -} - -// Ping sends echo requests to an ICMPv4 endpoint. -// Responses are streamed to the channel ch. 
-func (p *Pinger) Ping(ctx context.Context, ch chan<- PingReply) *tcpip.Error { - count := p.Count - if count == 0 { - count = 1<<16 - 1 - } - wait := p.Wait - if wait == 0 { - wait = 1 * time.Second - } - - r, err := p.Stack.FindRoute(p.NICID, p.LocalAddr, p.Addr, ProtocolNumber) - if err != nil { - return err - } - - netProtos := []tcpip.NetworkProtocolNumber{ProtocolNumber} - ep := &pingEndpoint{ - stack: p.Stack, - pktCh: make(chan buffer.View, 1), - } - id := stack.TransportEndpointID{ - LocalAddress: r.LocalAddress, - RemoteAddress: p.Addr, - } - - _, err = p.Stack.PickEphemeralPort(func(port uint16) (bool, *tcpip.Error) { - id.LocalPort = port - err := p.Stack.RegisterTransportEndpoint(p.NICID, netProtos, pingProtocolNumber, id, ep) - switch err { - case nil: - return true, nil - case tcpip.ErrPortInUse: - return false, nil - default: - return false, err - } - }) - if err != nil { - return err - } - defer p.Stack.UnregisterTransportEndpoint(p.NICID, netProtos, pingProtocolNumber, id) - - v := buffer.NewView(4) - binary.BigEndian.PutUint16(v[0:], id.LocalPort) - - start := time.Now() - - done := make(chan struct{}) - go func(count int) { - loop: - for ; count > 0; count-- { - select { - case v := <-ep.pktCh: - seq := binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize+2:]) - ch <- PingReply{ - Duration: time.Since(start) - time.Duration(seq)*wait, - SeqNumber: seq, - } - case <-ctx.Done(): - break loop - } - } - close(done) - }(int(count)) - defer func() { <-done }() - - t := time.NewTicker(wait) - defer t.Stop() - for seq := uint16(0); seq < count; seq++ { - select { - case <-t.C: - case <-ctx.Done(): - return nil - } - binary.BigEndian.PutUint16(v[2:], seq) - sent := time.Now() - if err := sendICMPv4(&r, header.ICMPv4Echo, 0, v); err != nil { - ch <- PingReply{ - Error: err, - Duration: time.Since(sent), - SeqNumber: seq, - } - } - } - return nil -} - -// PingReply summarizes an ICMP echo reply. -type PingReply struct { - Error *tcpip.Error // reports any errors sending a ping request - Duration time.Duration - SeqNumber uint16 -} - -type pingProtocol struct{} - -func (*pingProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return nil, tcpip.ErrNotSupported // endpoints are created directly -} - -func (*pingProtocol) Number() tcpip.TransportProtocolNumber { return pingProtocolNumber } - -func (*pingProtocol) MinimumPacketSize() int { return header.ICMPv4EchoMinimumSize } - -func (*pingProtocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { - ident := binary.BigEndian.Uint16(v[4:]) - return 0, ident, nil -} - -func (*pingProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *buffer.VectorisedView) bool { - return true -} - -// SetOption implements TransportProtocol.SetOption. -func (p *pingProtocol) SetOption(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// Option implements TransportProtocol.Option. 
-func (p *pingProtocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -func init() { - stack.RegisterTransportProtocolFactory(PingProtocolName, func() stack.TransportProtocol { - return &pingProtocol{} - }) -} - -type pingEndpoint struct { - stack *stack.Stack - pktCh chan buffer.View -} - -func (e *pingEndpoint) Close() { - close(e.pktCh) -} - -func (e *pingEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv *buffer.VectorisedView) { - select { - case e.pktCh <- vv.ToView(): - default: - } -} - -// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. -func (e *pingEndpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv *buffer.VectorisedView) { -} diff --git a/pkg/tcpip/network/ipv4/icmp_test.go b/pkg/tcpip/network/ipv4/icmp_test.go deleted file mode 100644 index c55aa1835..000000000 --- a/pkg/tcpip/network/ipv4/icmp_test.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2016 The Netstack Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package ipv4_test - -import ( - "context" - "testing" - "time" - - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/channel" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" -) - -const stackAddr = "\x0a\x00\x00\x01" - -type testContext struct { - t *testing.T - linkEP *channel.Endpoint - s *stack.Stack -} - -func newTestContext(t *testing.T) *testContext { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{ipv4.PingProtocolName}) - - const defaultMTU = 65536 - id, linkEP := channel.New(256, defaultMTU, "") - if testing.Verbose() { - id = sniffer.New(id) - } - if err := s.CreateNIC(1, id); err != nil { - t.Fatalf("CreateNIC failed: %v", err) - } - - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr); err != nil { - t.Fatalf("AddAddress failed: %v", err) - } - - s.SetRouteTable([]tcpip.Route{{ - Destination: "\x00\x00\x00\x00", - Mask: "\x00\x00\x00\x00", - Gateway: "", - NIC: 1, - }}) - - return &testContext{ - t: t, - s: s, - linkEP: linkEP, - } -} - -func (c *testContext) cleanup() { - close(c.linkEP.C) -} - -func (c *testContext) loopback() { - go func() { - for pkt := range c.linkEP.C { - v := make(buffer.View, len(pkt.Header)+len(pkt.Payload)) - copy(v, pkt.Header) - copy(v[len(pkt.Header):], pkt.Payload) - vv := v.ToVectorisedView([1]buffer.View{}) - c.linkEP.Inject(pkt.Proto, &vv) - } - }() -} - -func TestEcho(t *testing.T) { - c := newTestContext(t) - defer c.cleanup() - c.loopback() - - ch := make(chan ipv4.PingReply, 1) - p := ipv4.Pinger{ - Stack: c.s, - NICID: 1, - Addr: stackAddr, - Wait: 10 * time.Millisecond, - Count: 1, // one ping only - } - if err := p.Ping(context.Background(), ch); err != nil { - t.Fatalf("icmp.Ping failed: %v", err) - } - - ping := <-ch - if ping.Error != nil { - t.Errorf("bad ping response: %v", ping.Error) - } -} - -func TestEchoSequence(t *testing.T) { - c := newTestContext(t) - defer c.cleanup() - c.loopback() - - const numPings = 3 - ch := make(chan ipv4.PingReply, numPings) - p := ipv4.Pinger{ - Stack: c.s, - NICID: 1, - Addr: stackAddr, - Wait: 10 * time.Millisecond, - Count: numPings, - } - if err := p.Ping(context.Background(), ch); err != nil { - 
t.Fatalf("icmp.Ping failed: %v", err) - } - - for i := uint16(0); i < numPings; i++ { - ping := <-ch - if ping.Error != nil { - t.Errorf("i=%d bad ping response: %v", i, ping.Error) - } - if ping.SeqNumber != i { - t.Errorf("SeqNumber=%d, want %d", ping.SeqNumber, i) - } - } -} diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD new file mode 100644 index 000000000..a39a887b6 --- /dev/null +++ b/pkg/tcpip/transport/ping/BUILD @@ -0,0 +1,50 @@ +package(licenses = ["notice"]) # BSD + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "ping_state", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "ping_packet_list.go", + ], + out = "ping_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], + package = "ping", +) + +go_template_instance( + name = "ping_packet_list", + out = "ping_packet_list.go", + package = "ping", + prefix = "pingPacket", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*pingPacket", + }, +) + +go_library( + name = "ping", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "ping_packet_list.go", + "ping_state.go", + "protocol.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping", + visibility = ["//visibility:public"], + deps = [ + "//pkg/sleep", + "//pkg/state", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + "//pkg/waiter", + ], +) diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go new file mode 100644 index 000000000..609e7d947 --- /dev/null +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -0,0 +1,665 @@ +// Copyright 2016 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ping + +import ( + "encoding/binary" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sleep" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +type pingPacket struct { + pingPacketEntry + senderAddress tcpip.FullAddress + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + timestamp int64 + hasTimestamp bool + // views is used as buffer for data when its length is large + // enough to store a VectorisedView. + views [8]buffer.View `state:"nosave"` +} + +type endpointState int + +const ( + stateInitial endpointState = iota + stateBound + stateConnected + stateClosed +) + +// endpoint represents a ping endpoint. This struct serves as the interface +// between users of the endpoint and the protocol implementation; it is legal to +// have concurrent goroutines make calls into the endpoint, they are properly +// synchronized. +type endpoint struct { + // The following fields are initialized at creation time and do not + // change throughout the lifetime of the endpoint. + stack *stack.Stack `state:"manual"` + netProto tcpip.NetworkProtocolNumber + waiterQueue *waiter.Queue + + // The following fields are used to manage the receive queue, and are + // protected by rcvMu. + rcvMu sync.Mutex `state:"nosave"` + rcvReady bool + rcvList pingPacketList + rcvBufSizeMax int + rcvBufSize int + rcvClosed bool + rcvTimestamp bool + + // The following fields are protected by the mu mutex. 
+ mu sync.RWMutex `state:"nosave"` + sndBufSize int + id stack.TransportEndpointID + state endpointState + bindNICID tcpip.NICID + bindAddr tcpip.Address + regNICID tcpip.NICID + route stack.Route `state:"manual"` +} + +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { + return &endpoint{ + stack: stack, + netProto: netProto, + waiterQueue: waiterQueue, + rcvBufSizeMax: 32 * 1024, + sndBufSize: 32 * 1024, + } +} + +// Close puts the endpoint in a closed state and frees all resources +// associated with it. +func (e *endpoint) Close() { + e.mu.Lock() + defer e.mu.Unlock() + + switch e.state { + case stateBound, stateConnected: + e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, ProtocolNumber4, e.id) + } + + // Close the receive list and drain it. + e.rcvMu.Lock() + e.rcvClosed = true + e.rcvBufSize = 0 + for !e.rcvList.Empty() { + p := e.rcvList.Front() + e.rcvList.Remove(p) + } + e.rcvMu.Unlock() + + e.route.Release() + + // Update the state. + e.state = stateClosed +} + +// Read reads data from the endpoint. This method does not block if +// there is no data pending. +func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + e.rcvMu.Lock() + + if e.rcvList.Empty() { + err := tcpip.ErrWouldBlock + if e.rcvClosed { + err = tcpip.ErrClosedForReceive + } + e.rcvMu.Unlock() + return buffer.View{}, tcpip.ControlMessages{}, err + } + + p := e.rcvList.Front() + e.rcvList.Remove(p) + e.rcvBufSize -= p.data.Size() + ts := e.rcvTimestamp + + e.rcvMu.Unlock() + + if addr != nil { + *addr = p.senderAddress + } + + if ts && !p.hasTimestamp { + // Linux uses the current time. + p.timestamp = e.stack.NowNanoseconds() + } + + return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: ts, Timestamp: p.timestamp}, nil +} + +// prepareForWrite prepares the endpoint for sending data. In particular, it +// binds it if it's still in the initial state. To do so, it must first +// reacquire the mutex in exclusive mode. +// +// Returns true for retry if preparation should be retried. +func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) { + switch e.state { + case stateInitial: + case stateConnected: + return false, nil + + case stateBound: + if to == nil { + return false, tcpip.ErrDestinationRequired + } + return false, nil + default: + return false, tcpip.ErrInvalidEndpointState + } + + e.mu.RUnlock() + defer e.mu.RLock() + + e.mu.Lock() + defer e.mu.Unlock() + + // The state changed when we released the shared locked and re-acquired + // it in exclusive mode. Try again. + if e.state != stateInitial { + return true, nil + } + + // The state is still 'initial', so try to bind the endpoint. + if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { + return false, err + } + + return true, nil +} + +// Write writes data to the endpoint's peer. This method does not block +// if the data cannot be written. +func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { + // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) + if opts.More { + return 0, tcpip.ErrInvalidOptionValue + } + + to := opts.To + + e.mu.RLock() + defer e.mu.RUnlock() + + // Prepare for write. 
+ for { + retry, err := e.prepareForWrite(to) + if err != nil { + return 0, err + } + + if !retry { + break + } + } + + var route *stack.Route + if to == nil { + route = &e.route + + if route.IsResolutionRequired() { + // Promote lock to exclusive if using a shared route, given that it may + // need to change in Route.Resolve() call below. + e.mu.RUnlock() + defer e.mu.RLock() + + e.mu.Lock() + defer e.mu.Unlock() + + // Recheck state after lock was re-acquired. + if e.state != stateConnected { + return 0, tcpip.ErrInvalidEndpointState + } + } + } else { + // Reject destination address if it goes through a different + // NIC than the endpoint was bound to. + nicid := to.NIC + if e.bindNICID != 0 { + if nicid != 0 && nicid != e.bindNICID { + return 0, tcpip.ErrNoRoute + } + + nicid = e.bindNICID + } + + toCopy := *to + to = &toCopy + netProto, err := e.checkV4Mapped(to, true) + if err != nil { + return 0, err + } + + // Find the endpoint. + r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto) + if err != nil { + return 0, err + } + defer r.Release() + + route = &r + } + + if route.IsResolutionRequired() { + waker := &sleep.Waker{} + if err := route.Resolve(waker); err != nil { + if err == tcpip.ErrWouldBlock { + // Link address needs to be resolved. Resolution was triggered in the + // background. Better luck next time. + // + // TODO: queue up the request and send after link address + // is resolved. + route.RemoveWaker(waker) + return 0, tcpip.ErrNoLinkAddress + } + return 0, err + } + } + + v, err := p.Get(p.Size()) + if err != nil { + return 0, err + } + + switch e.netProto { + case header.IPv4ProtocolNumber: + err = sendPing4(route, e.id.LocalPort, v) + + case header.IPv6ProtocolNumber: + // TODO: Support IPv6. + } + + return uintptr(len(v)), err +} + +// Peek only returns data from a single datagram, so do nothing here. +func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil +} + +// SetSockOpt sets a socket option. Currently not supported. +func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + switch v := opt.(type) { + case tcpip.TimestampOption: + e.rcvMu.Lock() + e.rcvTimestamp = v != 0 + e.rcvMu.Unlock() + } + return nil +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + return nil + + case *tcpip.SendBufferSizeOption: + e.mu.Lock() + *o = tcpip.SendBufferSizeOption(e.sndBufSize) + e.mu.Unlock() + return nil + + case *tcpip.ReceiveBufferSizeOption: + e.rcvMu.Lock() + *o = tcpip.ReceiveBufferSizeOption(e.rcvBufSizeMax) + e.rcvMu.Unlock() + return nil + + case *tcpip.ReceiveQueueSizeOption: + e.rcvMu.Lock() + if e.rcvList.Empty() { + *o = 0 + } else { + p := e.rcvList.Front() + *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) + } + e.rcvMu.Unlock() + return nil + + case *tcpip.TimestampOption: + e.rcvMu.Lock() + *o = 0 + if e.rcvTimestamp { + *o = 1 + } + e.rcvMu.Unlock() + } + + return tcpip.ErrUnknownProtocolOption +} + +func sendPing4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { + if len(data) < header.ICMPv4EchoMinimumSize { + return tcpip.ErrInvalidEndpointState + } + + // Set the ident. Sequence number is provided by the user.
+ binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], ident) + + hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) + + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + copy(icmpv4, data) + data = data[header.ICMPv4EchoMinimumSize:] + + // Linux performs these basic checks. + if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { + return tcpip.ErrInvalidEndpointState + } + + icmpv4.SetChecksum(0) + icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) + + return r.WritePacket(&hdr, data, header.ICMPv4ProtocolNumber) +} + +func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { + netProto := e.netProto + if header.IsV4MappedAddress(addr.Addr) { + return 0, tcpip.ErrNoRoute + } + + // Fail if we're bound to an address length different from the one we're + // checking. + if l := len(e.id.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) { + return 0, tcpip.ErrInvalidEndpointState + } + + return netProto, nil +} + +// Connect connects the endpoint to its peer. Specifying a NIC is optional. +func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + nicid := addr.NIC + localPort := uint16(0) + switch e.state { + case stateBound, stateConnected: + localPort = e.id.LocalPort + if e.bindNICID == 0 { + break + } + + if nicid != 0 && nicid != e.bindNICID { + return tcpip.ErrInvalidEndpointState + } + + nicid = e.bindNICID + default: + return tcpip.ErrInvalidEndpointState + } + + netProto, err := e.checkV4Mapped(&addr, false) + if err != nil { + return err + } + + // Find a route to the desired destination. + r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto) + if err != nil { + return err + } + defer r.Release() + + id := stack.TransportEndpointID{ + LocalAddress: r.LocalAddress, + LocalPort: localPort, + RemoteAddress: r.RemoteAddress, + } + + // Even if we're connected, this endpoint can still be used to send + // packets on a different network protocol, so we register both even if + // v6only is set to false and this is an ipv6 endpoint. + netProtos := []tcpip.NetworkProtocolNumber{netProto} + + id, err = e.registerWithStack(nicid, netProtos, id) + if err != nil { + return err + } + + e.id = id + e.route = r.Clone() + e.regNICID = nicid + + e.state = stateConnected + + e.rcvMu.Lock() + e.rcvReady = true + e.rcvMu.Unlock() + + return nil +} + +// ConnectEndpoint is not supported. +func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Shutdown closes the read and/or write end of the endpoint connection +// to its peer. +func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + e.mu.RLock() + defer e.mu.RUnlock() + + if e.state != stateConnected { + return tcpip.ErrNotConnected + } + + if flags&tcpip.ShutdownRead != 0 { + e.rcvMu.Lock() + wasClosed := e.rcvClosed + e.rcvClosed = true + e.rcvMu.Unlock() + + if !wasClosed { + e.waiterQueue.Notify(waiter.EventIn) + } + } + + return nil +} + +// Listen is not supported by UDP, it just fails. +func (*endpoint) Listen(int) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Accept is not supported by UDP, it just fails. 
+func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { + return nil, nil, tcpip.ErrNotSupported +} + +func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { + if id.LocalPort != 0 { + // The endpoint already has a local port, just attempt to + // register it. + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber4, id, e) + return id, err + } + + // We need to find a port for the endpoint. + _, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { + id.LocalPort = p + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber4, id, e) + switch err { + case nil: + return true, nil + case tcpip.ErrPortInUse: + return false, nil + default: + return false, err + } + }) + + return id, err +} + +func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + // Don't allow binding once endpoint is not in the initial state + // anymore. + if e.state != stateInitial { + return tcpip.ErrInvalidEndpointState + } + + netProto, err := e.checkV4Mapped(&addr, false) + if err != nil { + return err + } + + // Expand netProtos to include v4 and v6 if the caller is binding to a + // wildcard (empty) address, and this is an IPv6 endpoint with v6only + // set to false. + netProtos := []tcpip.NetworkProtocolNumber{netProto} + + if len(addr.Addr) != 0 { + // A local address was specified, verify that it's valid. + if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { + return tcpip.ErrBadLocalAddress + } + } + + id := stack.TransportEndpointID{ + LocalPort: addr.Port, + LocalAddress: addr.Addr, + } + id, err = e.registerWithStack(addr.NIC, netProtos, id) + if err != nil { + return err + } + if commit != nil { + if err := commit(); err != nil { + // Unregister, the commit failed. + e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, ProtocolNumber4, id) + return err + } + } + + e.id = id + e.regNICID = addr.NIC + + // Mark endpoint as bound. + e.state = stateBound + + e.rcvMu.Lock() + e.rcvReady = true + e.rcvMu.Unlock() + + return nil +} + +// Bind binds the endpoint to a specific local address and port. +// Specifying a NIC is optional. +func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + err := e.bindLocked(addr, commit) + if err != nil { + return err + } + + e.bindNICID = addr.NIC + e.bindAddr = addr.Addr + + return nil +} + +// GetLocalAddress returns the address to which the endpoint is bound. +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + return tcpip.FullAddress{ + NIC: e.regNICID, + Addr: e.id.LocalAddress, + Port: e.id.LocalPort, + }, nil +} + +// GetRemoteAddress returns the address to which the endpoint is connected. +func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + if e.state != stateConnected { + return tcpip.FullAddress{}, tcpip.ErrNotConnected + } + + return tcpip.FullAddress{ + NIC: e.regNICID, + Addr: e.id.RemoteAddress, + Port: e.id.RemotePort, + }, nil +} + +// Readiness returns the current readiness of the endpoint. For example, if +// waiter.EventIn is set, the endpoint is immediately readable. +func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + // The endpoint is always writable. 
+ result := waiter.EventOut & mask + + // Determine if the endpoint is readable if requested. + if (mask & waiter.EventIn) != 0 { + e.rcvMu.Lock() + if !e.rcvList.Empty() || e.rcvClosed { + result |= waiter.EventIn + } + e.rcvMu.Unlock() + } + + return result +} + +// HandlePacket is called by the stack when new packets arrive to this transport +// endpoint. +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv *buffer.VectorisedView) { + e.rcvMu.Lock() + + // Drop the packet if our buffer is currently full. + if !e.rcvReady || e.rcvClosed || e.rcvBufSize >= e.rcvBufSizeMax { + e.rcvMu.Unlock() + return + } + + wasEmpty := e.rcvBufSize == 0 + + // Push new packet into receive list and increment the buffer size. + pkt := &pingPacket{ + senderAddress: tcpip.FullAddress{ + NIC: r.NICID(), + Addr: id.RemoteAddress, + }, + } + pkt.data = vv.Clone(pkt.views[:]) + e.rcvList.PushBack(pkt) + e.rcvBufSize += vv.Size() + + if e.rcvTimestamp { + pkt.timestamp = e.stack.NowNanoseconds() + pkt.hasTimestamp = true + } + + e.rcvMu.Unlock() + + // Notify any waiters that there's data to be read now. + if wasEmpty { + e.waiterQueue.Notify(waiter.EventIn) + } +} + +// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. +func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv *buffer.VectorisedView) { +} diff --git a/pkg/tcpip/transport/ping/endpoint_state.go b/pkg/tcpip/transport/ping/endpoint_state.go new file mode 100644 index 000000000..e1664f049 --- /dev/null +++ b/pkg/tcpip/transport/ping/endpoint_state.go @@ -0,0 +1,61 @@ +// Copyright 2016 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ping + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// saveData saves pingPacket.data field. +func (p *pingPacket) saveData() buffer.VectorisedView { + // We cannot save p.data directly as p.data.views may alias to p.views, + // which is not allowed by state framework (in-struct pointer). + return p.data.Clone(nil) +} + +// loadData loads pingPacket.data field. +func (p *pingPacket) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing p.views for data.views. + p.data = data +} + +// beforeSave is invoked by stateify. +func (e *endpoint) beforeSave() { + // Stop incoming packets from being handled (and mutate endpoint state). + e.rcvMu.Lock() +} + +// afterLoad is invoked by stateify. 
+func (e *endpoint) afterLoad() { + e.stack = stack.StackFromEnv + + if e.state != stateBound && e.state != stateConnected { + return + } + + var err *tcpip.Error + if e.state == stateConnected { + e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto) + if err != nil { + panic(*err) + } + + e.id.LocalAddress = e.route.LocalAddress + } else if len(e.id.LocalAddress) != 0 { // stateBound + if e.stack.CheckLocalAddress(e.regNICID, e.netProto, e.id.LocalAddress) == 0 { + panic(tcpip.ErrBadLocalAddress) + } + } + + e.id, err = e.registerWithStack(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.id) + if err != nil { + panic(*err) + } +} diff --git a/pkg/tcpip/transport/ping/protocol.go b/pkg/tcpip/transport/ping/protocol.go new file mode 100644 index 000000000..1459b4d60 --- /dev/null +++ b/pkg/tcpip/transport/ping/protocol.go @@ -0,0 +1,106 @@ +// Copyright 2016 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package ping contains the implementation of the ICMP and IPv6-ICMP transport +// protocols for use in ping. To use it in the networking stack, this package +// must be added to the project, and +// activated on the stack by passing ping.ProtocolName (or "ping") and/or +// ping.ProtocolName6 (or "ping6") as one of the transport protocols when +// calling stack.New(). Then endpoints can be created by passing +// ping.ProtocolNumber or ping.ProtocolNumber6 as the transport protocol number +// when calling Stack.NewEndpoint(). +package ping + +import ( + "encoding/binary" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // ProtocolName4 is the string representation of the ping protocol name. + ProtocolName4 = "ping4" + + // ProtocolNumber4 is the ICMP protocol number. + ProtocolNumber4 = header.ICMPv4ProtocolNumber + + // ProtocolName6 is the string representation of the ping protocol name. + ProtocolName6 = "ping6" + + // ProtocolNumber6 is the IPv6-ICMP protocol number. + ProtocolNumber6 = header.ICMPv6ProtocolNumber +) + +type protocol struct { + number tcpip.TransportProtocolNumber +} + +// Number returns the ICMP protocol number. +func (p *protocol) Number() tcpip.TransportProtocolNumber { + return p.number +} + +func (p *protocol) netProto() tcpip.NetworkProtocolNumber { + switch p.number { + case ProtocolNumber4: + return header.IPv4ProtocolNumber + case ProtocolNumber6: + return header.IPv6ProtocolNumber + } + panic(fmt.Sprint("unknown protocol number: ", p.number)) +} + +// NewEndpoint creates a new ping endpoint. +func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if netProto != p.netProto() { + return nil, tcpip.ErrUnknownProtocol + } + return newEndpoint(stack, netProto, waiterQueue), nil +} + +// MinimumPacketSize returns the minimum valid ping packet size. +func (p *protocol) MinimumPacketSize() int { + switch p.number { + case ProtocolNumber4: + return header.ICMPv4EchoMinimumSize + case ProtocolNumber6: + return header.ICMPv6EchoMinimumSize + } + panic(fmt.Sprint("unknown protocol number: ", p.number)) +} + +// ParsePorts returns the source and destination ports stored in the given udp +// packet. 
+func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { + return 0, binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize:]), nil +} + +// HandleUnknownDestinationPacket handles packets targeted at this protocol but +// that don't match any existing endpoint. +func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *buffer.VectorisedView) bool { + return true +} + +// SetOption implements TransportProtocol.SetOption. +func (p *protocol) SetOption(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// Option implements TransportProtocol.Option. +func (p *protocol) Option(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +func init() { + stack.RegisterTransportProtocolFactory(ProtocolName4, func() stack.TransportProtocol { + return &protocol{ProtocolNumber4} + }) + + // TODO: Support IPv6. +} diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 41b98424a..e20d59ca3 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -13,7 +13,7 @@ import ( // saveData saves udpPacket.data field. func (u *udpPacket) saveData() buffer.VectorisedView { - // We canoot save u.data directly as u.data.views may alias to u.views, + // We cannot save u.data directly as u.data.views may alias to u.views, // which is not allowed by state framework (in-struct pointer). return u.data.Clone(nil) } diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 88736cfa4..16522c668 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -64,6 +64,7 @@ go_library( "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/ping", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index d63a9028e..af577f571 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -43,6 +43,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp" "gvisor.googlesource.com/gvisor/runsc/boot/filter" @@ -346,7 +347,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) inet.Stack { case NetworkNone, NetworkSandbox: // NetworkNone sets up loopback using netstack. netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} - protoNames := []string{tcp.ProtocolName, udp.ProtocolName} + protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} return &epsocket.Stack{stack.New(clock, netProtos, protoNames)} default: -- cgit v1.2.3 From a61def1b368a9042e346787008e12770e4e67b35 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 2 May 2018 17:39:12 -0700 Subject: Remove detach for exec options Detachable exec commands are handled in the client entirely and the detach option is not used anymore. 
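(For context, the client-side pattern this change relies on looks roughly like the sketch below: the parent re-invokes the same binary without the detach flag and returns once the child has written its PID file. This is an illustrative, simplified sketch only; the package and helper names, timeout, and backoff values are assumptions, and the actual logic lives in runsc/cmd/exec.go in the diff that follows.)

package cmdutil // hypothetical package, for illustration only

import (
	"fmt"
	"os"
	"os/exec"
	"strings"
	"time"
)

// execDetached re-runs the current command line without its detach flag and
// waits for the child to create pidFile before returning.
func execDetached(binPath, pidFile string) error {
	// Drop any "detach" flag so the child performs the exec in the foreground.
	var args []string
	for _, a := range os.Args[1:] {
		if !strings.Contains(a, "detach") {
			args = append(args, a)
		}
	}
	cmd := exec.Command(binPath, args...)
	cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("starting child: %v", err)
	}

	// Poll for the PID file with exponential backoff, capped at 1s, giving up
	// after 10s. Once the file exists, the child is running and we can return.
	backoff := 10 * time.Millisecond
	for start := time.Now(); time.Since(start) < 10*time.Second; {
		if _, err := os.Stat(pidFile); err == nil {
			return nil // child wrote its PID file
		}
		time.Sleep(backoff)
		if backoff *= 2; backoff > time.Second {
			backoff = time.Second
		}
	}
	return fmt.Errorf("timed out waiting for %q", pidFile)
}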
PiperOrigin-RevId: 195181272 Change-Id: I6e82a2876d2c173709c099be59670f71702e5bf0 --- pkg/sentry/control/proc.go | 9 --------- runsc/cmd/exec.go | 3 +-- runsc/sandbox/sandbox_test.go | 8 +------- 3 files changed, 2 insertions(+), 18 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 7d06a1d04..d77b30c90 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -72,9 +72,6 @@ type ExecArgs struct { // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities - // Detach indicates whether Exec should detach once the process starts. - Detach bool - // FilePayload determines the files to give to the new process. urpc.FilePayload } @@ -135,12 +132,6 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { return err } - // If we're supposed to detach, don't wait for the process to exit. - if args.Detach { - *waitStatus = 0 - return nil - } - // Wait for completion. newTG.WaitExited() *waitStatus = newTG.ExitStatus().Status() diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 8379f552d..576031b5b 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -99,7 +99,6 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error parsing process spec: %v", err) } - e.Detach = ex.detach conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) @@ -123,7 +122,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // executed. If detach was specified, starts a child in non-detach mode, // write the child's PID to the pid file. So when the container returns, the // child process will also return and signal containerd. - if e.Detach { + if ex.detach { binPath, err := specutils.BinPath() if err != nil { Fatalf("error getting bin path: %v", err) diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index 6c71cac30..6e3125b7b 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -365,7 +365,6 @@ func TestExec(t *testing.T) { Envv: []string{"PATH=" + os.Getenv("PATH")}, WorkingDirectory: "/", KUID: uid, - Detach: false, } // Verify that "sleep 100" and "sleep 5" are running after exec. @@ -472,7 +471,6 @@ func TestCapabilities(t *testing.T) { KUID: uid, KGID: gid, Capabilities: &auth.TaskCapabilities{}, - Detach: true, } // "exe" should fail because we don't have the necessary permissions. @@ -484,14 +482,10 @@ func TestCapabilities(t *testing.T) { execArgs.Capabilities = &auth.TaskCapabilities{ EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), } - // First, start running exec. + // "exe" should not fail this time. if _, err := s.Execute(&execArgs); err != nil { t.Fatalf("sandbox failed to exec %v: %v", execArgs, err) } - - if err := waitForProcessList(s, expectedPL); err != nil { - t.Error(err) - } } // Test that an tty FD is sent over the console socket if one is provided. -- cgit v1.2.3 From 04b79137babed361fb227e3ad579adb2df4bb188 Mon Sep 17 00:00:00 2001 From: Cyrille Hemidy Date: Thu, 3 May 2018 14:05:25 -0700 Subject: Fix misspellings. 
PiperOrigin-RevId: 195307689 Change-Id: I499f19af49875a43214797d63376f20ae788d2f4 --- pkg/log/log.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/fsutil/file.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 4 ++-- pkg/sentry/fs/tty/line_discipline.go | 4 ++-- pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/socket/rpcinet/socket.go | 2 +- pkg/sentry/strace/syscalls.go | 2 +- pkg/tcpip/header/ipv6.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- runsc/sandbox/sandbox.go | 4 ++-- 14 files changed, 17 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/pkg/log/log.go b/pkg/log/log.go index 110e0e196..cdfc0601a 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -149,7 +149,7 @@ func (t TestEmitter) Emit(level Level, timestamp time.Time, format string, v ... // Logger is a high-level logging interface. It is in fact, not used within the // log package. Rather it is provided for others to provide contextual loggers // that may append some addition information to log statement. BasicLogger -// satifies this interface, and may be passed around as a Logger. +// satisfies this interface, and may be passed around as a Logger. type Logger interface { // Debugf logs a debug statement. Debugf(format string, v ...interface{}) diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index de2e80bf0..f2683bbd2 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -376,7 +376,7 @@ func (r *FileReader) Read(buf []byte) (int, error) { return int(n), err } -// ReadAt implementes io.Reader.ReadAt. +// ReadAt implements io.Reader.ReadAt. func (r *FileReader) ReadAt(buf []byte, offset int64) (int, error) { n, err := r.File.Preadv(r.Ctx, usermem.BytesIOSequence(buf), offset) return int(n), err diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index a7329f1c9..b17f11a5a 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -34,7 +34,7 @@ func (NoopRelease) Release() {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. // -// Currenly only seeking to 0 on a directory is supported. +// Currently only seeking to 0 on a directory is supported. // // FIXME: Lift directory seeking limitations. func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) { diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index bf4cd8dfd..19d5612ed 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -103,7 +103,7 @@ func (d *Dir) addChildLocked(name string, inode *fs.Inode) { } // Given we're now adding this inode to the directory we must also - // increase its link count. Similiarly we decremented it in removeChildLocked. + // increase its link count. Similarly we decremented it in removeChildLocked. inode.AddLink() } @@ -144,7 +144,7 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er inode.NotifyStatusChange(ctx) // Given we're now removing this inode to the directory we must also - // decrease its link count. Similiarly it is increased in addChildLocked. + // decrease its link count. Similarly it is increased in addChildLocked. 
inode.DropLink() return inode, nil diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index fde4e7941..a3aa95ece 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -244,8 +244,8 @@ func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, return int64(n), err } -// transformOutput does ouput processing for one end of the pty. See -// drivers/tty/n_tty.c:do_output_char for an analagous kernel function. +// transformOutput does output processing for one end of the pty. See +// drivers/tty/n_tty.c:do_output_char for an analogous kernel function. // // Precondition: l.termiosMu must be held. func (l *lineDiscipline) transformOutput(buf []byte) *bytes.Buffer { diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 19ad5d537..fb8c2f98c 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -298,7 +298,7 @@ func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { } // ExecuteOps attempts to execute a list of operations to the set. It only -// suceeds when all operations can be applied. No changes are made if it fails. +// succeeds when all operations can be applied. No changes are made if it fails. // // On failure, it may return an error (retries are hopeless) or it may return // a channel that can be waited on before attempting again. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 3d49ae350..d6604f37b 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -125,7 +125,7 @@ func (t *Task) killLocked() { Signo: int32(linux.SIGKILL), // Linux just sets SIGKILL in the pending signal bitmask without // enqueueing an actual siginfo, such that - // kernel/signal.c:collect_signal() initalizes si_code to SI_USER. + // kernel/signal.c:collect_signal() initializes si_code to SI_USER. Code: arch.SignalInfoUser, }) t.interrupt() diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index b6af48cb7..61aaa3195 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -243,7 +243,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange } // getVecVMAsLocked ensures that vmas exist for all addresses in ars, and -// support access to type of (at, ignorePermissions). It retuns the subset of +// support access to type of (at, ignorePermissions). It returns the subset of // ars for which vmas exist. If this is not equal to ars, it returns a non-nil // error explaining why. // diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 15047df01..2911d3fd6 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -530,7 +530,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protoc // Only accept TCP and UDP. // // Try to restrict the flags we will accept to minimize backwards - // incompatability with netstack. + // incompatibility with netstack. stype := int(stypeflags) & linux.SOCK_TYPE_MASK switch stype { case syscall.SOCK_STREAM: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index d0e661706..eccee733e 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -82,7 +82,7 @@ const ( // PipeFDs is an array of two FDs, formatted after syscall execution. PipeFDs - // Uname is a pointer to a struct uname, formatted after syscall exection. 
+ // Uname is a pointer to a struct uname, formatted after syscall execution. Uname // Stat is a pointer to a struct stat, formatted after syscall execution. diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index d8dc138b3..da0210539 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -60,7 +60,7 @@ const ( // IPv6ProtocolNumber is IPv6's network protocol number. IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd - // IPv6Version is the version of the ipv6 procotol. + // IPv6Version is the version of the ipv6 protocol. IPv6Version = 6 // IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460, diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index b480bf812..f0fbd8aad 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -165,7 +165,7 @@ type TCPSenderState struct { // window size from a segment. SndWndScale uint8 - // MaxSentAck is the highest acknowledgemnt number sent till now. + // MaxSentAck is the highest acknowledgement number sent till now. MaxSentAck seqnum.Value // FastRecovery holds the fast recovery state for the endpoint. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index ad94aecd8..6c363a929 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -152,7 +152,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint // updateMaxPayloadSize updates the maximum payload size based on the given // MTU. If this is in response to "packet too big" control packets (indicated -// by the count argument), it also reduces the number of oustanding packets and +// by the count argument), it also reduces the number of outstanding packets and // attempts to retransmit the first packet above the MTU size. func (s *sender) updateMaxPayloadSize(mtu, count int) { m := mtu - header.TCPMinimumSize diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 64810b4ea..954824ada 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -58,7 +58,7 @@ func validateID(id string) error { // // Within a root directory, we maintain subdirectories for each sandbox named // with the sandbox id. The sandbox metadata is is stored as json within the -// sandbox directoy in a file named "meta.json". This metadata format is +// sandbox directory in a file named "meta.json". This metadata format is // defined by us, and is not part of the OCI spec. // // Sandboxes must write this metadata file after any change to their internal @@ -199,7 +199,7 @@ func Load(rootDir, id string) (*Sandbox, error) { // If the status is "Running" or "Created", check that the process // still exists, and set it to Stopped if it does not. // - // This is inherintly racey. + // This is inherently racey. if s.Status == Running || s.Status == Created { // Send signal 0 to check if process exists. 
if err := s.Signal(0); err != nil { -- cgit v1.2.3 From c186ebb62a6005288d83feed0e43cca9f0577383 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 3 May 2018 21:08:38 -0700 Subject: Return error when child exits early PiperOrigin-RevId: 195365050 Change-Id: I8754dc7a3fc2975d422cae453762a455478a8e6a --- runsc/cmd/exec.go | 87 ++++++++++++++++++----------------- runsc/sandbox/sandbox.go | 23 +++++----- runsc/specutils/BUILD | 9 +++- runsc/specutils/specutils.go | 32 +++++++++++++ runsc/specutils/specutils_test.go | 96 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 193 insertions(+), 54 deletions(-) create mode 100644 runsc/specutils/specutils_test.go (limited to 'runsc') diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 576031b5b..052e00316 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -123,48 +123,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // write the child's PID to the pid file. So when the container returns, the // child process will also return and signal containerd. if ex.detach { - binPath, err := specutils.BinPath() - if err != nil { - Fatalf("error getting bin path: %v", err) - } - var args []string - for _, a := range os.Args[1:] { - if !strings.Contains(a, "detach") { - args = append(args, a) - } - } - cmd := exec.Command(binPath, args...) - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Start(); err != nil { - Fatalf("failure to start child exec process, err: %v", err) - } - - log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args) - - // Wait for PID file to ensure that child process has started. Otherwise, - // '--process' file is deleted as soon as this process returns and the child - // may fail to read it. - sleepTime := 10 * time.Millisecond - for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { - _, err := os.Stat(ex.pidFile) - if err == nil { - break - } - if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT { - Fatalf("unexpected error waiting for PID file, err: %v", err) - } - - log.Infof("Waiting for PID file to be created...") - time.Sleep(sleepTime) - sleepTime *= sleepTime * 2 - if sleepTime > 1*time.Second { - sleepTime = 1 * time.Second - } - } - *waitStatus = 0 - return subcommands.ExitSuccess + return ex.execAndWait(waitStatus) } if ex.pidFile != "" { @@ -191,6 +150,50 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitSuccess } +func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { + binPath, err := specutils.BinPath() + if err != nil { + Fatalf("error getting bin path: %v", err) + } + var args []string + for _, a := range os.Args[1:] { + if !strings.Contains(a, "detach") { + args = append(args, a) + } + } + cmd := exec.Command(binPath, args...) + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + Fatalf("failure to start child exec process, err: %v", err) + } + + log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args) + + // Wait for PID file to ensure that child process has started. Otherwise, + // '--process' file is deleted as soon as this process returns and the child + // may fail to read it. + ready := func() (bool, error) { + _, err := os.Stat(ex.pidFile) + if err == nil { + // File appeared, we're done! 
+ return true, nil + } + if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT { + return false, err + } + // No file yet, continue to wait... + return false, nil + } + if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil { + Fatalf("unexpected error waiting for PID file, err: %v", err) + } + + *waitStatus = 0 + return subcommands.ExitSuccess +} + // parseArgs parses exec information from the command line or a JSON file // depending on whether the --process flag was used. Returns an ExecArgs and // the ID of the sandbox to be used. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 954824ada..13bf5d800 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -560,19 +560,20 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common // running, at which point the sandbox is in Created state. func (s *Sandbox) waitForCreated(timeout time.Duration) error { log.Debugf("Waiting for sandbox %q creation", s.ID) - tchan := time.After(timeout) - for { - select { - case <-tchan: - return fmt.Errorf("timed out waiting for sandbox control server") - default: - if c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)); err == nil { - // It's alive! - c.Close() - return nil - } + + ready := func() (bool, error) { + c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) + if err != nil { + return false, nil } + // It's alive! + c.Close() + return true, nil } + if err := specutils.WaitForReady(s.Pid, timeout, ready); err != nil { + return fmt.Errorf("unexpected error waiting for sandbox %q, err: %v", s.ID, err) + } + return nil } // Wait waits for the containerized process to exit, and returns its WaitStatus. diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index ae89260d2..1b6d265bc 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", @@ -16,3 +16,10 @@ go_library( "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) + +go_test( + name = "specutils_test", + size = "small", + srcs = ["specutils_test.go"], + embed = [":specutils"], +) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index bed0f75eb..04ecb6ae3 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -23,6 +23,8 @@ import ( "os" "path/filepath" "strings" + "syscall" + "time" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -181,3 +183,33 @@ func BinPath() (string, error) { } return binPath, nil } + +// WaitForReady waits for a process to become ready. The process is ready when +// the 'ready' function returns true. It continues to wait if 'ready' returns +// false. It returns error on timeout, if the process stops or if 'ready' fails. +func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { + backoff := 1 * time.Millisecond + for start := time.Now(); time.Now().Sub(start) < timeout; { + if ok, err := ready(); err != nil { + return err + } else if ok { + return nil + } + + // Check if the process is still running. 
+ var ws syscall.WaitStatus + var ru syscall.Rusage + child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru) + if err != nil || child == pid { + return fmt.Errorf("process (%d) is not running, err: %v", pid, err) + } + + // Process continues to run, backoff and retry. + time.Sleep(backoff) + backoff *= 2 + if backoff > 1*time.Second { + backoff = 1 * time.Second + } + } + return fmt.Errorf("timed out waiting for process (%d)", pid) +} diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go new file mode 100644 index 000000000..ef293e608 --- /dev/null +++ b/runsc/specutils/specutils_test.go @@ -0,0 +1,96 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "os/exec" + "strings" + "testing" + "time" +) + +func TestWaitForReadyHappy(t *testing.T) { + cmd := exec.Command("/bin/sleep", "1000") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + var count int + err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { + if count < 3 { + count++ + return false, nil + } + return true, nil + }) + if err != nil { + t.Errorf("ProcessWaitReady got: %v, expected: nil", err) + } + cmd.Process.Kill() +} + +func TestWaitForReadyFail(t *testing.T) { + cmd := exec.Command("/bin/sleep", "1000") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + var count int + err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { + if count < 3 { + count++ + return false, nil + } + return false, fmt.Errorf("Fake error") + }) + if err == nil { + t.Errorf("ProcessWaitReady got: nil, expected: error") + } + cmd.Process.Kill() +} + +func TestWaitForReadyNotRunning(t *testing.T) { + cmd := exec.Command("/bin/true") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { + return false, nil + }) + if !strings.Contains(err.Error(), "not running") { + t.Errorf("ProcessWaitReady got: %v, expected: not running", err) + } +} + +func TestWaitForReadyTimeout(t *testing.T) { + cmd := exec.Command("/bin/sleep", "1000") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + err := WaitForReady(cmd.Process.Pid, 50*time.Millisecond, func() (bool, error) { + return false, nil + }) + if !strings.Contains(err.Error(), "timed out") { + t.Errorf("ProcessWaitReady got: %v, expected: timed out", err) + } + cmd.Process.Kill() +} -- cgit v1.2.3 From c90fefc1161c58af34856aff7b7012f19f5d1f1b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 4 May 2018 09:38:35 -0700 Subject: Fix runsc capabilities There was a typo and one new capability missing from the list PiperOrigin-RevId: 195427713 Change-Id: I6d9e1c6e77b48fe85ef10d9f54c70c8a7271f6e7 --- pkg/abi/linux/capability.go | 5 +++-- 
runsc/boot/capability.go | 3 ++- runsc/specutils/specutils.go | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go index 1a1bd0ce3..b470ce0a5 100644 --- a/pkg/abi/linux/capability.go +++ b/pkg/abi/linux/capability.go @@ -32,7 +32,7 @@ const ( CAP_SETPCAP = Capability(8) CAP_LINUX_IMMUTABLE = Capability(9) CAP_NET_BIND_SERVICE = Capability(10) - CAP_NET_BROAD_CAST = Capability(11) + CAP_NET_BROADCAST = Capability(11) CAP_NET_ADMIN = Capability(12) CAP_NET_RAW = Capability(13) CAP_IPC_LOCK = Capability(14) @@ -58,9 +58,10 @@ const ( CAP_SYSLOG = Capability(34) CAP_WAKE_ALARM = Capability(35) CAP_BLOCK_SUSPEND = Capability(36) + CAP_AUDIT_READ = Capability(37) // MaxCapability is the highest-numbered capability. - MaxCapability = Capability(36) // CAP_BLOCK_SUSPEND as of 3.11 + MaxCapability = CAP_AUDIT_READ ) // Ok returns true if cp is a supported capability. diff --git a/runsc/boot/capability.go b/runsc/boot/capability.go index 4c6a59245..efa28fb97 100644 --- a/runsc/boot/capability.go +++ b/runsc/boot/capability.go @@ -91,7 +91,7 @@ var capFromName = map[string]capability.Cap{ "CAP_SETPCAP": capability.CAP_SETPCAP, "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE, "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE, - "CAP_NET_BROAD_CAST": capability.CAP_NET_BROADCAST, + "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST, "CAP_NET_ADMIN": capability.CAP_NET_ADMIN, "CAP_NET_RAW": capability.CAP_NET_RAW, "CAP_IPC_LOCK": capability.CAP_IPC_LOCK, @@ -117,4 +117,5 @@ var capFromName = map[string]capability.Cap{ "CAP_SYSLOG": capability.CAP_SYSLOG, "CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM, "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND, + "CAP_AUDIT_READ": capability.CAP_AUDIT_READ, } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 04ecb6ae3..dcb4b20db 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -129,7 +129,7 @@ var capFromName = map[string]linux.Capability{ "CAP_SETPCAP": linux.CAP_SETPCAP, "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, - "CAP_NET_BROAD_CAST": linux.CAP_NET_BROAD_CAST, + "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST, "CAP_NET_ADMIN": linux.CAP_NET_ADMIN, "CAP_NET_RAW": linux.CAP_NET_RAW, "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, @@ -155,6 +155,7 @@ var capFromName = map[string]linux.Capability{ "CAP_SYSLOG": linux.CAP_SYSLOG, "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, + "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, } func capsFromNames(names []string) (auth.CapabilitySet, error) { -- cgit v1.2.3 From f47174f06b9904b830268d46a7e817053b6235c8 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 4 May 2018 14:15:24 -0700 Subject: Run gofmt -s on everything PiperOrigin-RevId: 195469901 Change-Id: I66d5c7a334bbb8b47e40d266a2661291c2d91c7f --- pkg/sentry/control/proc_test.go | 8 ++++---- pkg/sentry/kernel/semaphore/semaphore_test.go | 8 ++++---- runsc/fsgofer/fsgofer.go | 4 ++-- runsc/fsgofer/fsgofer_test.go | 8 ++++---- runsc/sandbox/sandbox.go | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index 18286496f..22c826236 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -38,7 +38,7 @@ func TestProcessListTable(t *testing.T) { }, { pl: []*Process{ - &Process{ + { UID: 0, PID: 0, 
PPID: 0, @@ -47,7 +47,7 @@ func TestProcessListTable(t *testing.T) { Time: "0", Cmd: "zero", }, - &Process{ + { UID: 1, PID: 1, PPID: 1, @@ -83,7 +83,7 @@ func TestProcessListJSON(t *testing.T) { }, { pl: []*Process{ - &Process{ + { UID: 0, PID: 0, PPID: 0, @@ -92,7 +92,7 @@ func TestProcessListJSON(t *testing.T) { Time: "0", Cmd: "zero", }, - &Process{ + { UID: 1, PID: 1, PPID: 1, diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 0386586ab..1c6a2e1e9 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -57,7 +57,7 @@ func TestBasic(t *testing.T) { ctx := contexttest.Context(t) set := &Set{ID: 123, sems: make([]sem, 1)} ops := []linux.Sembuf{ - linux.Sembuf{SemOp: 1}, + {SemOp: 1}, } executeOps(ctx, t, set, ops, false) @@ -78,7 +78,7 @@ func TestWaitForZero(t *testing.T) { ctx := contexttest.Context(t) set := &Set{ID: 123, sems: make([]sem, 1)} ops := []linux.Sembuf{ - linux.Sembuf{SemOp: 0}, + {SemOp: 0}, } executeOps(ctx, t, set, ops, false) @@ -117,7 +117,7 @@ func TestNoWait(t *testing.T) { ctx := contexttest.Context(t) set := &Set{ID: 123, sems: make([]sem, 1)} ops := []linux.Sembuf{ - linux.Sembuf{SemOp: 1}, + {SemOp: 1}, } executeOps(ctx, t, set, ops, false) @@ -146,7 +146,7 @@ func TestUnregister(t *testing.T) { } ops := []linux.Sembuf{ - linux.Sembuf{SemOp: -1}, + {SemOp: -1}, } chs := make([]chan struct{}, 0, 5) for i := 0; i < 5; i++ { diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 5ddc75a9d..be2ac5f3c 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -614,8 +614,8 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { if valid.ATime || valid.MTime { utimes := [2]syscall.Timespec{ - syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, - syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, + {Sec: 0, Nsec: linux.UTIME_OMIT}, + {Sec: 0, Nsec: linux.UTIME_OMIT}, } if valid.ATime { if valid.ATimeNotSystemTime { diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 7d834d596..58d04aefa 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -39,12 +39,12 @@ var ( allConfs []Config rwConfs = []Config{ - Config{ROMount: false, LazyOpenForWrite: false}, - Config{ROMount: false, LazyOpenForWrite: true}, + {ROMount: false, LazyOpenForWrite: false}, + {ROMount: false, LazyOpenForWrite: true}, } roConfs = []Config{ - Config{ROMount: true, LazyOpenForWrite: false}, - Config{ROMount: true, LazyOpenForWrite: true}, + {ROMount: true, LazyOpenForWrite: false}, + {ROMount: true, LazyOpenForWrite: true}, } ) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 13bf5d800..0354a64b9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -493,8 +493,8 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common // namespace for these. 
log.Infof("Sandbox will be started in empty IPC and UTS namespaces") nss := []specs.LinuxNamespace{ - specs.LinuxNamespace{Type: specs.IPCNamespace}, - specs.LinuxNamespace{Type: specs.UTSNamespace}, + {Type: specs.IPCNamespace}, + {Type: specs.UTSNamespace}, } if conf.Platform == boot.PlatformPtrace { -- cgit v1.2.3 From 7c8c3705ea5d891a3d6126090b1f49d8bae44177 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 7 May 2018 16:37:08 -0700 Subject: Fix misspellings PiperOrigin-RevId: 195742598 Change-Id: Ibd4a8e4394e268c87700b6d1e50b4b37dfce5182 --- pkg/cpuid/cpuid.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 20 ++++++++++---------- .../platform/ring0/pagetables/pagetables_amd64.go | 2 +- pkg/sentry/usage/memory.go | 2 +- pkg/tcpip/header/ipv4.go | 2 +- runsc/sandbox/sandbox_test.go | 2 +- 14 files changed, 23 insertions(+), 23 deletions(-) (limited to 'runsc') diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index aa248dd98..b486ab037 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -720,7 +720,7 @@ func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32) // HostFeatureSet uses cpuid to get host values and construct a feature set // that matches that of the host machine. Note that there are several places -// where there appear to be some unecessary assignments between register names +// where there appear to be some unnecessary assignments between register names // (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show // where the different feature blocks come from, to make the code easier to // inspect and read. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index a0d59575f..945cfaf08 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -94,7 +94,7 @@ func unwrapError(err error) error { // TryOpen uses a NonBlockingOpener to try to open a host pipe, respecting the fs.FileFlags. func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (*pipeOperations, error) { switch { - // Reject invalid configurations so they don't accidently succeed below. + // Reject invalid configurations so they don't accidentally succeed below. case !flags.Read && !flags.Write: return nil, syscall.EINVAL diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 0c6e622b9..c27c5946e 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -151,7 +151,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See // If this was a seek on a directory, we must update the cursor. if seekDir && whence == SeekSet && offset == 0 { - // Currenly only seeking to 0 on a directory is supported. + // Currently only seeking to 0 on a directory is supported. // FIXME: Lift directory seeking limitations. 
f.dirCursor = "" } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index b624f4182..6c8e6f188 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -26,7 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) -// Inode is a file system object that can be simulatenously referenced by different +// Inode is a file system object that can be simultaneously referenced by different // components of the VFS (Dirent, fs.File, etc). type Inode struct { // AtomicRefCount is our reference count. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 9f50cb800..a87be8590 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -316,7 +316,7 @@ func (i *Inotify) RmWatch(wd int32) error { // The watch is now isolated and we can safely drop the instance lock. We // need to do so because watch.destroy() acquires Watch.mu, which cannot be - // aquired with Inotify.mu held. + // acquired with Inotify.mu held. i.mu.Unlock() // Generate the event for the removal. diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index b832b28fe..f6fb05285 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -126,7 +126,7 @@ func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *T creds.InheritableCaps = capabilities.InheritableCaps // // TODO: Support ambient capabilities. } else { - // If no capabilities are specified, grant the same capabilites + // If no capabilities are specified, grant the same capabilities // that NewRootCredentials does. creds.PermittedCaps = AllCapabilities creds.EffectiveCaps = AllCapabilities diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 1656c6ff3..9a21df5b4 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -284,7 +284,7 @@ func (p *Pipe) rReadinessLocked() waiter.EventMask { ready |= waiter.EventIn } if !p.HasWriters() && p.hadWriter { - // POLLHUP must be supressed until the pipe has had at least one writer + // POLLHUP must be suppressed until the pipe has had at least one writer // at some point. Otherwise a reader thread may poll and immediately get // a POLLHUP before the writer ever opens the pipe, which the reader may // interpret as the writer opening then closing the pipe. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 94c281b72..4ed796493 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -118,7 +118,7 @@ func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch const ( // maxLoaderAttempts is the maximum number of attempts to try to load - // an interpreter scripts, to prevent loops. 6 (inital + 5 changes) is + // an interpreter scripts, to prevent loops. 6 (initial + 5 changes) is // what the Linux kernel allows (fs/exec.c:search_binary_handler). maxLoaderAttempts = 6 ) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 791f038b0..a4b9198cc 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -178,7 +178,7 @@ func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange // we create distinct mappings for each address space. Unfortunately, // there's not a better way to manage this here. The file underlying // this fd can change at any time, so we can't actually index the file - // and share between address space. Oh well. 
It's all refering to the + // and share between address space. Oh well. It's all referring to the // same physical pages, hopefully we don't run out of address space. if fd != int(as.filemem.File().Fd()) { // N.B. precommit is ignored for host files. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 3cbf0bfa5..ee7f27601 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -44,18 +44,18 @@ type PageTables struct { // root is the pagetable root. root *Node - // translater is the translater passed at creation. - translater Translater + // translator is the translator passed at creation. + translator Translator // archPageTables includes architecture-specific features. archPageTables - // allNodes is a set of nodes indexed by translater address. + // allNodes is a set of nodes indexed by translator address. allNodes map[uintptr]*Node } -// Translater translates to guest physical addresses. -type Translater interface { +// Translator translates to guest physical addresses. +type Translator interface { // TranslateToPhysical translates the given pointer object into a // "physical" address. We do not require that it translates back, the // reverse mapping is maintained internally. @@ -63,9 +63,9 @@ type Translater interface { } // New returns new PageTables. -func New(t Translater, opts Opts) *PageTables { +func New(t Translator, opts Opts) *PageTables { p := &PageTables{ - translater: t, + translator: t, allNodes: make(map[uintptr]*Node), } p.root = p.allocNode() @@ -80,7 +80,7 @@ func New(t Translater, opts Opts) *PageTables { // managing multiple sets of pagetables. func (p *PageTables) New() *PageTables { np := &PageTables{ - translater: p.translater, + translator: p.translator, allNodes: make(map[uintptr]*Node), } np.root = np.allocNode() @@ -90,7 +90,7 @@ func (p *PageTables) New() *PageTables { // setPageTable sets the given index as a page table. func (p *PageTables) setPageTable(n *Node, index int, child *Node) { - phys := p.translater.TranslateToPhysical(child.PTEs()) + phys := p.translator.TranslateToPhysical(child.PTEs()) p.allNodes[phys] = child pte := &n.PTEs()[index] pte.setPageTable(phys) @@ -188,6 +188,6 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType use // allocNode allocates a new page. func (p *PageTables) allocNode() *Node { n := new(Node) - n.physical = p.translater.TranslateToPhysical(n.PTEs()) + n.physical = p.translator.TranslateToPhysical(n.PTEs()) return n } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index b89665c96..a2050b99c 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -301,7 +301,7 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } // This level has 2-MB huge pages. If this - // region is contined in a single PMD entry? + // region is contained in a single PMD entry? // As above, we can skip allocating a new page. if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSuper() diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 5d1b3a595..4a1527b5f 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -95,7 +95,7 @@ type MemoryStats struct { // categories not backed by platform memory. 
For details about how this works, // see the memory accounting docs. // -// N.B. Please keep the struct in sync with the API. Noteably, changes to this +// N.B. Please keep the struct in sync with the API. Notably, changes to this // struct requires a version bump and addition of compatibility logic in the // control server. As a special-case, adding fields without re-ordering existing // ones do not require a version bump because the mapped page we use is diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index cb0d42093..6e2a3d6f4 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -81,7 +81,7 @@ const ( // IPv4ProtocolNumber is IPv4's network protocol number. IPv4ProtocolNumber tcpip.NetworkProtocolNumber = 0x0800 - // IPv4Version is the version of the ipv4 procotol. + // IPv4Version is the version of the ipv4 protocol. IPv4Version = 4 ) diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index 6e3125b7b..a46212173 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -131,7 +131,7 @@ func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error { return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected)) } -// TestLifecycle tests the basic Create/Start/Signal/Destory sandbox lifecycle. +// TestLifecycle tests the basic Create/Start/Signal/Destroy sandbox lifecycle. // It verifies after each step that the sandbox can be loaded from disk, and // has the correct status. func TestLifecycle(t *testing.T) { -- cgit v1.2.3 From e1b412d6609c848ff09356ead133b51cd0589731 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 8 May 2018 10:33:20 -0700 Subject: Error if container requires AppArmor, SELinux or seccomp Closes #35 PiperOrigin-RevId: 195840128 Change-Id: I31c1ad9b51ec53abb6f0b485d35622d4e9764b29 --- runsc/sandbox/sandbox.go | 19 +++++++++++++++++++ runsc/sandbox/sandbox_test.go | 22 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 0354a64b9..2a5eda6ae 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -53,6 +53,22 @@ func validateID(id string) error { return nil } +func validateSpec(spec *specs.Spec) error { + if spec.Process.SelinuxLabel != "" { + return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) + } + + // Docker uses AppArmor by default, so just log that it's being ignored. + if spec.Process.ApparmorProfile != "" { + log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) + } + // TODO: Apply seccomp to application inside sandbox. + if spec.Linux != nil && spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + return nil +} + // Sandbox wraps a child sandbox process, and is responsible for saving and // loading sandbox metadata to disk. 
// @@ -110,6 +126,9 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo if err := validateID(id); err != nil { return nil, err } + if err := validateSpec(spec); err != nil { + return nil, err + } sandboxRoot := filepath.Join(conf.RootDir, id) if exists(sandboxRoot) { diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index a46212173..1fac38a29 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -567,6 +567,28 @@ func TestConsoleSocket(t *testing.T) { } } +func TestSpecUnsupported(t *testing.T) { + spec := newSpecWithArgs("/bin/true") + spec.Process.SelinuxLabel = "somelabel" + + // These are normally set by docker and will just cause warnings to be logged. + spec.Process.ApparmorProfile = "someprofile" + spec.Linux = &specs.Linux{Seccomp: &specs.LinuxSeccomp{}} + + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + id := uniqueSandboxID() + _, err = sandbox.Create(id, spec, conf, bundleDir, "", "", nil) + if err == nil || !strings.Contains(err.Error(), "is not supported") { + t.Errorf("sandbox.Create() wrong error, got: %v, want: *is not supported, spec.Process: %+v", err, spec.Process) + } +} + // procListsEqual is used to check whether 2 Process lists are equal for all // implemented fields. func procListsEqual(got, want []*control.Process) bool { -- cgit v1.2.3 From 32cabad8dab689fd94d6f2d19d4db10285a94e60 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 8 May 2018 11:10:53 -0700 Subject: Use the containerd annotation instead of detecting the "pause" application. FIXED=72380268 PiperOrigin-RevId: 195846596 Change-Id: Ic87fed1433482a514631e1e72f5ee208e11290d1 --- runsc/sandbox/network.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 1b6a1d9a6..d0ff64067 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -31,6 +31,13 @@ import ( "gvisor.googlesource.com/gvisor/runsc/boot" ) +const ( + // Annotations used to indicate whether the container corresponds to a + // pod or a container within a pod. + crioContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType" + containerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" +) + // setupNetwork configures the network stack to mimic the local network // configuration. Docker uses network namespaces with vnets to configure the // network for the container. The untrusted app expects to see the same network @@ -68,17 +75,10 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // For now the following HACK disables networking for the "pause" // sandbox, allowing the second sandbox to start up successfully. // - // Cri-o helpfully adds the "ContainerType" annotation that we can use - // to detect whether we are a pod or container. Cri-containerd will - // support this eventually, but does not currently - // (https://github.com/kubernetes-incubator/cri-containerd/issues/512). - // - // Thus, to support cri-containerd, we check if the exec args is - // "/pause", which is pretty gross. - // // TODO: Remove this once multiple containers per sandbox // is properly supported. 
- if spec.Annotations["io.kubernetes.cri-o.ContainerType"] == "sandbox" || spec.Process.Args[0] == "/pause" { + if spec.Annotations[crioContainerTypeAnnotation] == "sandbox" || + spec.Annotations[containerdContainerTypeAnnotation] == "sandbox" { log.Warningf("HACK: Disabling network") conf.Network = boot.NetworkNone } -- cgit v1.2.3 From 1bdec86baed82fe8d79100693949f1259636e9db Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 9 May 2018 14:12:44 -0700 Subject: Return better errors from Docker when runsc fails to start. Two changes in this CL: First, make the "boot" process sleep when it encounters an error to give the controller time to send the error back to the "start" process. Otherwise the "boot" process exits immediately and the control connection errors with EOF. Secondly, open the log file with O_APPEND, not O_TRUNC. Docker uses the same log file for all runtime commands, and setting O_TRUNC causes them to get destroyed. Furthermore, containerd parses these log files in the event of an error, and it does not like the file being truncated out from underneath it. Now, when trying to run a binary that does not exist in the image, the error message is more reasonable: $ docker run alpine /not/found docker: Error response from daemon: OCI runtime start failed: /usr/local/google/docker/runtimes/runscd did not terminate sucessfully: error starting sandbox: error starting application [/not/found]: failed to create init process: no such file or directory Fixes #32 PiperOrigin-RevId: 196027084 Change-Id: Iabc24c0bdd8fc327237acc051a1655515f445e68 --- runsc/boot/controller.go | 2 +- runsc/boot/loader.go | 10 +++++++++- runsc/boot/loader_test.go | 19 ++++++++++++++++++- runsc/main.go | 5 ++++- runsc/sandbox/sandbox.go | 2 +- 5 files changed, 33 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 4d4ef7256..60c42fc19 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -68,7 +68,7 @@ func newController(fd int, k *kernel.Kernel) (*controller, error) { app := &application{ startChan: make(chan struct{}), - startResultChan: make(chan error, 1), + startResultChan: make(chan error), k: k, } srv.Register(app) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index af577f571..34a25241f 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -287,7 +287,15 @@ func createPlatform(conf *Config) (platform.Platform, error) { func (l *Loader) Run() error { err := l.run() l.ctrl.app.startResultChan <- err - return err + if err != nil { + // Give the controller some time to send the error to the + // runtime. If we return too quickly here the process will exit + // and the control connection will be closed before the error + // is returned. + gtime.Sleep(2 * gtime.Second) + return err + } + return nil } func (l *Loader) run() error { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 2fc16b241..c3d9887fa 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -16,6 +16,7 @@ package boot import ( "os" + "sync" "testing" "time" @@ -65,11 +66,27 @@ func TestRun(t *testing.T) { } defer s.Destroy() + // Start a goroutine to read the start chan result, otherwise Run will + // block forever. + var resultChanErr error + var wg sync.WaitGroup + wg.Add(1) + go func() { + resultChanErr = <-s.ctrl.app.startResultChan + wg.Done() + }() + // Run the application. 
if err := s.Run(); err != nil { t.Errorf("error running application: %v", err) } + // We should have not gotten an error on the startResultChan. + wg.Wait() + if resultChanErr != nil { + t.Errorf("error on startResultChan: %v", resultChanErr) + } + // Wait for the application to exit. It should succeed. if status := s.WaitExit(); status.Code != 0 || status.Signo != 0 { t.Errorf("application exited with status %+v, want 0", status) @@ -94,7 +111,7 @@ func TestStartSignal(t *testing.T) { waitFinished := make(chan struct{}) go func() { s.WaitForStartSignal() - // Pretent that Run() executed and returned no error. + // Pretend that Run() executed and returned no error. s.ctrl.app.startResultChan <- nil waitFinished <- struct{}{} }() diff --git a/runsc/main.go b/runsc/main.go index cf4b99d3f..883b8b1f4 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -126,7 +126,10 @@ func main() { var logFile io.Writer = os.Stderr if *logFilename != "" { - f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + // We must set O_APPEND and not O_TRUNC because Docker passes + // the same log file for all commands (and also parses these + // log files), so we can't destroy them on each command. + f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) if err != nil { cmd.Fatalf("error opening log file %q: %v", *logFilename, err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2a5eda6ae..34bd6ea67 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -289,7 +289,7 @@ func (s *Sandbox) Start(conf *boot.Config) error { // application. if err := c.Call(boot.ApplicationStart, nil, nil); err != nil { s.Destroy() - return fmt.Errorf("error starting sandbox: %v", err) + return fmt.Errorf("error starting application %v: %v", s.Spec.Process.Args, err) } // "If any poststart hook fails, the runtime MUST log a warning, but -- cgit v1.2.3 From 5a509c47a20e0b81b95bb4932e8b19dfc6a402e2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 10 May 2018 12:37:46 -0700 Subject: Open file as read-write when mount points to a file This is to allow files mapped directly, like /etc/hosts, to be writable. 
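For illustration, a minimal standalone sketch of the open-mode selection this change introduces in the gofer's Attach path: read-only mounts and directories keep O_RDONLY, while a file-backed attach point (such as a bind-mounted /etc/hosts) is opened O_RDWR. The helper name openMode and the main function below are illustrative only and not part of the patch; the actual change is in the diff that follows.

package main

import (
	"fmt"
	"os"
	"syscall"
)

// openMode mirrors the logic added to attachPoint.Attach in the diff below:
// read-only mounts and directories are opened O_RDONLY, regular files O_RDWR.
func openMode(roMount bool, fi os.FileInfo) int {
	if roMount || fi.IsDir() {
		return syscall.O_RDONLY
	}
	return syscall.O_RDWR
}

func main() {
	fi, err := os.Stat("/etc/hosts")
	if err != nil {
		fmt.Println("stat failed:", err)
		return
	}
	// For a regular file on a writable mount this prints true.
	fmt.Println("read-write?", openMode(false, fi) == syscall.O_RDWR)
}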
Closes #40 PiperOrigin-RevId: 196155920 Change-Id: Id2027e421cef5f94a0951c3e18b398a77c285bbd --- runsc/fsgofer/fsgofer.go | 12 ++++++++++- runsc/fsgofer/fsgofer_test.go | 50 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index be2ac5f3c..11dd28cef 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -98,7 +98,17 @@ func (a *attachPoint) Attach(appPath string) (p9.File, error) { } root := filepath.Join(a.prefix, appPath) - f, err := os.OpenFile(root, openFlags|syscall.O_RDONLY, 0) + fi, err := os.Stat(root) + if err != nil { + return nil, err + } + + mode := syscall.O_RDWR + if a.conf.ROMount || fi.IsDir() { + mode = syscall.O_RDONLY + } + + f, err := os.OpenFile(root, mode|openFlags, 0) if err != nil { return nil, fmt.Errorf("unable to open file %q, err: %v", root, err) } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 58d04aefa..249f67bf9 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -18,6 +18,7 @@ import ( "fmt" "io/ioutil" "os" + "path" "syscall" "testing" @@ -102,7 +103,7 @@ func setup(ft fileType) (string, string, error) { return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) } - // First attach with writable configuiration to setup tree. + // First attach with writable configuration to setup tree. a := NewAttachPoint(path, Config{}) root, err := a.Attach("/") if err != nil { @@ -574,3 +575,50 @@ func TestReaddir(t *testing.T) { } }) } + +// Test that attach point can be written to when it points to a file, e.g. +// /etc/hosts. +func TestAttachFile(t *testing.T) { + conf := Config{ROMount: false} + dir, err := ioutil.TempDir("", "root-") + if err != nil { + t.Fatalf("ioutil.TempDir() failed, err: %v", err) + } + defer os.RemoveAll(dir) + + path := path.Join(dir, "test") + if _, err := os.Create(path); err != nil { + t.Fatalf("os.Create(%q) failed, err: %v", path, err) + } + + a := NewAttachPoint(path, conf) + root, err := a.Attach("/") + if err != nil { + t.Fatalf("Attach(%q) failed, err: %v", "/", err) + } + + if _, _, _, err := root.Open(p9.ReadWrite); err != nil { + t.Fatalf("Open(ReadWrite) failed, err: %v", err) + } + defer root.Close() + + b := []byte("foobar") + w, err := root.WriteAt(b, 0) + if err != nil { + t.Fatalf("Write() failed, err: %v", err) + } + if w != len(b) { + t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(b)) + } + rBuf := make([]byte, len(b)) + r, err := root.ReadAt(rBuf, 0) + if err != nil { + t.Fatalf("ReadAt() failed, err: %v", err) + } + if r != len(rBuf) { + t.Fatalf("ReadAt() was partial, got: %d, expected: %d", r, len(rBuf)) + } + if string(rBuf) != "foobar" { + t.Fatalf("ReadAt() wrong data, got: %s, expected: %s", string(rBuf), "foobar") + } +} -- cgit v1.2.3 From ac01f245ff4515af2b69225e8b7fb2cf28808275 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 10 May 2018 14:58:51 -0700 Subject: Skip atime and mtime update when file is backed by host FD When file is backed by host FD, atime and mtime for the host file and the cached attributes in the Sentry must be close together. In this case, the call to update atime and mtime can be skipped. This is important when host filesystem is using overlay because updating atime and mtime explicitly forces a copy up for every file that is touched. 
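The diff below adds a skipSetAttr check for exactly this case. As a rough standalone sketch of the idea (the AttrMask type here is a simplified stand-in for the sentry's fs.AttrMask, and the hasHostFD flag stands in for the read-only/read-through/writeback handle checks in the real code):

package main

import "fmt"

// AttrMask is a simplified stand-in for fs.AttrMask, listing which attributes
// a SetAttr call wants to change.
type AttrMask struct {
	AccessTime       bool
	ModificationTime bool
	Perms            bool
	Size             bool
}

// Empty reports whether no attributes are being changed.
func (m AttrMask) Empty() bool {
	return m == (AttrMask{})
}

// skipSetAttr reports whether the update touches only atime/mtime and can be
// satisfied by the cached values when a host FD backs the file.
func skipSetAttr(mask AttrMask, hasHostFD bool) bool {
	if mask.Empty() {
		return true
	}
	cpy := mask
	cpy.AccessTime = false
	cpy.ModificationTime = false
	if !cpy.Empty() {
		// More than just atime and mtime is being set; must forward it.
		return false
	}
	return hasHostFD
}

func main() {
	fmt.Println(skipSetAttr(AttrMask{AccessTime: true}, true)) // true: skip the update
	fmt.Println(skipSetAttr(AttrMask{Size: true}, true))       // false: must forward to the gofer
}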
PiperOrigin-RevId: 196176413 Change-Id: I3933ea91637a071ba2ea9db9d8ac7cdba5dc0482 --- pkg/sentry/fs/gofer/inode.go | 31 ++++++++++++++++++++++++++++++- runsc/boot/fs.go | 6 +----- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 454242923..c00da5fec 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -206,7 +206,7 @@ func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.Blo // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { - if mask.Empty() { + if i.skipSetAttr(mask) { return nil } as, ans := attr.AccessTime.Unix() @@ -237,6 +237,35 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa }) } +// skipSetAttr checks if attribute change can be skipped. It can be skipped +// when: +// - Mask is empty +// - Mask contains only atime and/or mtime, and host FD exists +// +// Updates to atime and mtime can be skipped because cached value will be +// "close enough" to host value, given that operation went directly to host FD. +// Skipping atime updates is particularly important to reduce the number of +// operations sent to the Gofer for readonly files. +func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool { + if mask.Empty() { + return true + } + + cpy := mask + cpy.AccessTime = false + cpy.ModificationTime = false + if !cpy.Empty() { + // More than just atime and mtime is being set. + return false + } + + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return (i.readonly != nil && i.readonly.Host != nil) || + (i.readthrough != nil && i.readthrough.Host != nil) || + (i.writeback != nil && i.writeback.Host != nil) +} + // Sync implements fsutil.CachedFileObject.Sync. func (i *inodeFileState) Sync(ctx context.Context) error { i.handlesMu.RLock() diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 2073bd0b1..86cbe1169 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -141,10 +141,7 @@ func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, i // createRootMount creates the root filesystem. func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. - mf := fs.MountSourceFlags{ - ReadOnly: spec.Root.Readonly, - NoAtime: true, - } + mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} var ( rootInode *fs.Inode @@ -261,7 +258,6 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. // All writes go to upper, be paranoid and make lower readonly. 
mf.ReadOnly = true } - mf.NoAtime = true inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) if err != nil { -- cgit v1.2.3 From 7b6111b695840ab2938abe0d207514dd34081327 Mon Sep 17 00:00:00 2001 From: Chanwit Kaewkasi Date: Thu, 10 May 2018 16:56:49 -0700 Subject: Display the current git revision in the info block Change-Id: I9737cc680968033ba82c95bb04cc482fcaa12642 PiperOrigin-RevId: 196192683 --- runsc/BUILD | 1 + runsc/main.go | 3 +++ tools/bazel.rc | 1 + tools/workspace_status.sh | 17 +++++++++++++++++ 4 files changed, 22 insertions(+) create mode 100644 tools/bazel.rc create mode 100755 tools/workspace_status.sh (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index 3651c2d30..8f8e2ee35 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -8,6 +8,7 @@ go_binary( "main.go", ], pure = "on", + x_defs = {"main.gitRevision": "{GIT_REVISION}"}, deps = [ "//pkg/log", "//runsc/boot", diff --git a/runsc/main.go b/runsc/main.go index 883b8b1f4..3311514d2 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -61,6 +61,8 @@ var ( overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") ) +var gitRevision = "" + func main() { // Help and flags commands are generated automatically. subcommands.Register(subcommands.HelpCommand(), "") @@ -166,6 +168,7 @@ func main() { log.Infof("***************************") log.Infof("Args: %s", os.Args) + log.Infof("Git Revision: %s", gitRevision) log.Infof("PID: %d", os.Getpid()) log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) log.Infof("Configuration:") diff --git a/tools/bazel.rc b/tools/bazel.rc new file mode 100644 index 000000000..6e437306c --- /dev/null +++ b/tools/bazel.rc @@ -0,0 +1 @@ +build --workspace_status_command tools/workspace_status.sh diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh new file mode 100755 index 000000000..d89db1f99 --- /dev/null +++ b/tools/workspace_status.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo GIT_REVISION $(git describe --always --abbrev=40 --dirty) -- cgit v1.2.3 From 7cff8489de2254cf355ec81cebc2338e0035f2df Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 10 May 2018 17:12:21 -0700 Subject: Fix failure to rename directory os.Rename validates that the target doesn't exist, which is different from syscall.Rename which replace the target if both are directories. fsgofer needs the syscall behavior. 
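As a small self-contained demonstration of the semantic difference described above (paths are temporary and purely illustrative): os.Rename reports an error when the target is an existing directory, while syscall.Rename follows rename(2) and replaces an empty target directory.

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"syscall"
)

func main() {
	tmp, err := ioutil.TempDir("", "rename-demo")
	if err != nil {
		fmt.Println("TempDir failed:", err)
		return
	}
	defer os.RemoveAll(tmp)

	src := filepath.Join(tmp, "src")
	dst := filepath.Join(tmp, "dst")
	os.Mkdir(src, 0755)
	os.Mkdir(dst, 0755)

	// os.Rename refuses to replace an existing directory target.
	fmt.Println("os.Rename:", os.Rename(src, dst))

	// syscall.Rename follows rename(2) and replaces the empty target directory.
	fmt.Println("syscall.Rename:", syscall.Rename(src, dst))
}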
PiperOrigin-RevId: 196194630 Change-Id: I87d08cad88b5ef310b245cd91647c4f5194159d8 --- runsc/fsgofer/fsgofer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 11dd28cef..cd6224de3 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -708,7 +708,7 @@ func (l *localFile) Rename(directory p9.File, name string) error { // TODO: change to renameat(2) parent := directory.(*localFile) newPath := path.Join(parent.hostPath, name) - if err := os.Rename(l.hostPath, newPath); err != nil { + if err := syscall.Rename(l.hostPath, newPath); err != nil { return extractErrno(err) } -- cgit v1.2.3 From 205f1027e6beb84101439172b3c776c2671b5be8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 15 May 2018 10:17:19 -0700 Subject: Refactor the Sandbox package into Sandbox + Container. This is a necessary prerequisite for supporting multiple containers in a single sandbox. All the commands (in cmd package) now call operations on Containers (container package). When a Container first starts, it will create a Sandbox with the same ID. The Sandbox class is now simpler, as it only knows how to create boot/gofer processes, and how to forward commands into the running boot process. There are TODOs sprinkled around for additional support for multiple containers. Most notably, we need to detect when a container is intended to run in an existing sandbox (by reading the metadata), and then have some way to signal to the sandbox to start a new container. Other urpc calls into the sandbox need to pass the container ID, so the sandbox can run the operation on the given container. These are only half-plummed through right now. PiperOrigin-RevId: 196688269 Change-Id: I1ecf4abbb9dd8987a53ae509df19341aaf42b5b0 --- runsc/boot/config.go | 46 ++- runsc/cmd/BUILD | 2 +- runsc/cmd/cmd.go | 11 - runsc/cmd/create.go | 17 +- runsc/cmd/delete.go | 18 +- runsc/cmd/events.go | 8 +- runsc/cmd/exec.go | 20 +- runsc/cmd/kill.go | 10 +- runsc/cmd/list.go | 34 +- runsc/cmd/ps.go | 8 +- runsc/cmd/run.go | 6 +- runsc/cmd/start.go | 10 +- runsc/cmd/state.go | 16 +- runsc/container/BUILD | 45 +++ runsc/container/container.go | 380 ++++++++++++++++++++++ runsc/container/container_test.go | 669 ++++++++++++++++++++++++++++++++++++++ runsc/container/hook.go | 111 +++++++ runsc/container/status.go | 54 +++ runsc/main.go | 4 + runsc/sandbox/BUILD | 25 +- runsc/sandbox/hook.go | 111 ------- runsc/sandbox/sandbox.go | 430 ++++++------------------ runsc/sandbox/sandbox_test.go | 665 ------------------------------------- runsc/sandbox/status.go | 56 ---- runsc/specutils/specutils.go | 23 +- 25 files changed, 1499 insertions(+), 1280 deletions(-) create mode 100644 runsc/container/BUILD create mode 100644 runsc/container/container.go create mode 100644 runsc/container/container_test.go create mode 100644 runsc/container/hook.go create mode 100644 runsc/container/status.go delete mode 100644 runsc/sandbox/hook.go delete mode 100644 runsc/sandbox/sandbox_test.go delete mode 100644 runsc/sandbox/status.go (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index f3e33e89a..d5dd400d1 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -14,7 +14,11 @@ package boot -import "fmt" +import ( + "fmt" + "strconv" + "strings" +) // PlatformType tells which platform to use. type PlatformType int @@ -131,6 +135,19 @@ type Config struct { // RootDir is the runtime root directory. 
RootDir string + // Debug indicates that debug logging should be enabled. + Debug bool + + // LogFilename is the filename to log to, if not empty. + LogFilename string + + // LogFormat is the log format, "text" or "json". + LogFormat string + + // DebugLogDir is the directory to log debug information to, if not + // empty. + DebugLogDir string + // FileAccess indicates how the filesystem is accessed. FileAccess FileAccessType @@ -159,4 +176,31 @@ type Config struct { // DisableSeccomp indicates whether seccomp syscall filters should be // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool + + // TestModeNoFlags indicates that the ToFlags method should return + // empty. This should only be used in tests, since the test runner does + // not know about all the flags. + TestModeNoFlags bool +} + +// ToFlags returns a slice of flags that correspond to the given Config. +func (c *Config) ToFlags() []string { + if c.TestModeNoFlags { + return nil + } + return []string{ + "--root=" + c.RootDir, + "--debug=" + strconv.FormatBool(c.Debug), + "--log=" + c.LogFilename, + "--log-format=" + c.LogFormat, + "--debug-log-dir=" + c.DebugLogDir, + "--file-access=" + c.FileAccess.String(), + "--overlay=" + strconv.FormatBool(c.Overlay), + "--network=" + c.Network.String(), + "--log-packets=" + strconv.FormatBool(c.LogPackets), + "--platform=" + c.Platform.String(), + "--strace=" + strconv.FormatBool(c.Strace), + "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), + "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), + } } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 128c8f7e6..08aaee996 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -32,8 +32,8 @@ go_library( "//pkg/unet", "//pkg/urpc", "//runsc/boot", + "//runsc/container", "//runsc/fsgofer", - "//runsc/sandbox", "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index d4b834213..9f7fd6e25 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -20,7 +20,6 @@ import ( "os" "strconv" - "flag" "gvisor.googlesource.com/gvisor/pkg/log" ) @@ -35,16 +34,6 @@ func Fatalf(s string, args ...interface{}) { os.Exit(128) } -// commandLineFlags returns a slice of all top-level command line flags that -// have been set. -func commandLineFlags() []string { - var args []string - flag.CommandLine.Visit(func(f *flag.Flag) { - args = append(args, fmt.Sprintf("--%s=%s", f.Name, f.Value.String())) - }) - return args -} - // intFlags can be used with int flags that appear multiple times. type intFlags []int diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 83cb09eb0..94a889077 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -19,7 +19,7 @@ import ( "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -30,8 +30,8 @@ type Create struct { bundleDir string // pidFile is the filename that the sandbox pid will be written to. - // This file should only be created once the sandbox process is ready - // to use (i.e. control server has started and is listening). + // This file should only be created once the container process inside + // the sandbox is ready to use. 
pidFile string // consoleSocket is the path to an AF_UNIX socket which will receive a @@ -61,7 +61,7 @@ func (*Create) Usage() string { func (c *Create) SetFlags(f *flag.FlagSet) { f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") - f.StringVar(&c.pidFile, "pid-file", "", "filename that the sandbox pid will be written to") + f.StringVar(&c.pidFile, "pid-file", "", "filename that the container pid will be written to") } // Execute implements subcommands.Command.Execute. @@ -84,10 +84,11 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} } specutils.LogSpec(spec) - // Create the sandbox process, passing additional command line - // arguments to the sandbox process. - if _, err := sandbox.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, commandLineFlags()); err != nil { - Fatalf("error creating sandbox: %v", err) + // Create the container. A new sandbox will be created for the + // container unless the metadata specifies that it should be run in an + // existing container. + if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile); err != nil { + Fatalf("error creating container: %v", err) } return subcommands.ExitSuccess } diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index a497c034d..769a11c45 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -19,12 +19,12 @@ import ( "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" ) // Delete implements subcommands.Command for the "delete" command. type Delete struct { - // force indicates that the sandbox should be terminated if running. + // force indicates that the container should be terminated if running. force bool } @@ -45,7 +45,7 @@ func (*Delete) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (d *Delete) SetFlags(f *flag.FlagSet) { - f.BoolVar(&d.force, "force", false, "terminate sandbox if running") + f.BoolVar(&d.force, "force", false, "terminate container if running") } // Execute implements subcommands.Command.Execute. 
@@ -59,15 +59,15 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} for i := 0; i < f.NArg(); i++ { id := f.Arg(i) - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox %q: %v", id, err) + Fatalf("error loading container %q: %v", id, err) } - if !d.force && (s.Status == sandbox.Running) { - Fatalf("cannot stop running sandbox without --force flag") + if !d.force && (c.Status == container.Running) { + Fatalf("cannot stop running container without --force flag") } - if err := s.Destroy(); err != nil { - Fatalf("error destroying sandbox: %v", err) + if err := c.Destroy(); err != nil { + Fatalf("error destroying container: %v", err) } } return subcommands.ExitSuccess diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index afd42c2f2..f221ad3ae 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -24,7 +24,7 @@ import ( "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" ) // Events implements subcommands.Command for the "events" command. @@ -74,7 +74,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa id := f.Arg(0) conf := args[0].(*boot.Config) - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { Fatalf("error loading sandox: %v", err) } @@ -82,9 +82,9 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa // Repeatedly get stats from the container. for { // Get the event and print it as JSON. - ev, err := s.Event() + ev, err := c.Event() if err != nil { - log.Warningf("error getting events for sandbox: %v", err) + log.Warningf("error getting events for container: %v", err) } // err must be preserved because it is used below when breaking // out of the loop. diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 052e00316..235ed9bc6 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -34,7 +34,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -89,11 +89,11 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process") f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") f.StringVar(&ex.processPath, "process", "", "path to the process.json") - f.StringVar(&ex.pidFile, "pid-file", "", "filename that the sandbox pid will be written to") + f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") } // Execute implements subcommands.Command.Execute. It starts a process in an -// already created sandbox. +// already created container. 
func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { e, id, err := ex.parseArgs(f) if err != nil { @@ -102,17 +102,17 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { Fatalf("error loading sandox: %v", err) } if e.WorkingDirectory == "" { - e.WorkingDirectory = s.Spec.Process.Cwd + e.WorkingDirectory = c.Spec.Process.Cwd } if e.Envv == nil { - e.Envv, err = resolveEnvs(s.Spec.Process.Env, ex.env) + e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env) if err != nil { Fatalf("error getting environment variables: %v", err) } @@ -136,15 +136,15 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // inspect the environment PATH which is relative to the root path. // If the user is overriding environment variables, PATH may have been // overwritten. - rootPath := s.Spec.Root.Path + rootPath := c.Spec.Root.Path e.Filename, err = specutils.GetExecutablePath(e.Argv[0], rootPath, e.Envv) if err != nil { Fatalf("error getting executable path: %v", err) } - ws, err := s.Execute(e) + ws, err := c.Execute(e) if err != nil { - Fatalf("error getting processes for sandbox: %v", err) + Fatalf("error getting processes for container: %v", err) } *waitStatus = ws return subcommands.ExitSuccess @@ -196,7 +196,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // parseArgs parses exec information from the command line or a JSON file // depending on whether the --process flag was used. Returns an ExecArgs and -// the ID of the sandbox to be used. +// the ID of the container to be used. func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) { if ex.processPath == "" { // Requires at least a container ID and command. diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index f89e0077e..97a505fac 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -25,7 +25,7 @@ import ( "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" ) // Kill implements subcommands.Command for the "kill" command. @@ -38,7 +38,7 @@ func (*Kill) Name() string { // Synopsis implements subcommands.Command.Synopsis. func (*Kill) Synopsis() string { - return "sends a signal to the sandbox" + return "sends a signal to the container" } // Usage implements subcommands.Command.Usage. 
@@ -64,9 +64,9 @@ func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su id := f.Arg(0) conf := args[0].(*boot.Config) - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox: %v", err) + Fatalf("error loading container: %v", err) } // The OCI command-line spec says that the signal should be specified @@ -81,7 +81,7 @@ func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su if err != nil { Fatalf("%v", err) } - if err := s.Signal(sig); err != nil { + if err := c.Signal(sig); err != nil { Fatalf("%v", err) } return subcommands.ExitSuccess diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index bf7cb41bb..d554bf7cf 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -26,7 +26,7 @@ import ( "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" ) // List implements subcommands.Command for the "list" command for the "list" command. @@ -64,7 +64,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } conf := args[0].(*boot.Config) - ids, err := sandbox.List(conf.RootDir) + ids, err := container.List(conf.RootDir) if err != nil { Fatalf("%v", err) } @@ -76,14 +76,14 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitSuccess } - // Collect the sandboxes. - var sandboxes []*sandbox.Sandbox + // Collect the containers. + var containers []*container.Container for _, id := range ids { - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox %q: %v", id, err) + Fatalf("error loading container %q: %v", id, err) } - sandboxes = append(sandboxes, s) + containers = append(containers, c) } switch l.format { @@ -91,24 +91,24 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Print a nice table. w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0) fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") - for _, s := range sandboxes { + for _, c := range containers { fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", - s.ID, - s.Pid, - s.Status, - s.BundleDir, - s.CreatedAt.Format(time.RFC3339Nano), - s.Owner) + c.ID, + c.Pid(), + c.Status, + c.BundleDir, + c.CreatedAt.Format(time.RFC3339Nano), + c.Owner) } w.Flush() case "json": // Print just the states. var states []specs.State - for _, s := range sandboxes { - states = append(states, s.State()) + for _, c := range containers { + states = append(states, c.State()) } if err := json.NewEncoder(os.Stdout).Encode(states); err != nil { - Fatalf("error marshaling sandbox state: %v", err) + Fatalf("error marshaling container state: %v", err) } default: Fatalf("unknown list format %q", l.format) diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index a667ec04c..9f9f4d15e 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -22,7 +22,7 @@ import ( "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" ) // PS implements subcommands.Command for the "ps" command. 
@@ -60,13 +60,13 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) id := f.Arg(0) conf := args[0].(*boot.Config) - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { Fatalf("error loading sandox: %v", err) } - pList, err := s.Processes() + pList, err := c.Processes() if err != nil { - Fatalf("error getting processes for sandbox: %v", err) + Fatalf("error getting processes for container: %v", err) } switch ps.format { diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index a61a6c73e..681112f30 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -21,7 +21,7 @@ import ( "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -72,9 +72,9 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s Fatalf("error reading spec: %v", err) } - ws, err := sandbox.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, commandLineFlags()) + ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile) if err != nil { - Fatalf("error running sandbox: %v", err) + Fatalf("error running container: %v", err) } *waitStatus = ws diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index a8e132497..97ea91fff 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -19,7 +19,7 @@ import ( "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" ) // Start implements subcommands.Command for the "start" command. @@ -53,12 +53,12 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*boot.Config) - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox: %v", err) + Fatalf("error loading container: %v", err) } - if err := s.Start(conf); err != nil { - Fatalf("error starting sandbox: %v", err) + if err := c.Start(conf); err != nil { + Fatalf("error starting container: %v", err) } return subcommands.ExitSuccess } diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 0b47f290a..28752d95e 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -23,7 +23,7 @@ import ( "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/container" ) // State implements subcommands.Command for the "state" command. @@ -36,12 +36,12 @@ func (*State) Name() string { // Synopsis implements subcommands.Command.Synopsis. func (*State) Synopsis() string { - return "get the state of a sandbox" + return "get the state of a container" } // Usage implements subcommands.Command.Usage. func (*State) Usage() string { - return `state [flags] - get the state of a sandbox` + return `state [flags] - get the state of a container` } // SetFlags implements subcommands.Command.SetFlags. 
@@ -57,16 +57,16 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s id := f.Arg(0) conf := args[0].(*boot.Config) - s, err := sandbox.Load(conf.RootDir, id) + c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox: %v", err) + Fatalf("error loading container: %v", err) } - log.Debugf("Returning state %+v", s) + log.Debugf("Returning state for container %+v", c) // Write json-encoded state directly to stdout. - b, err := json.MarshalIndent(s.State(), "", " ") + b, err := json.MarshalIndent(c.State(), "", " ") if err != nil { - Fatalf("error marshaling sandbox state: %v", err) + Fatalf("error marshaling container state: %v", err) } os.Stdout.Write(b) return subcommands.ExitSuccess diff --git a/runsc/container/BUILD b/runsc/container/BUILD new file mode 100644 index 000000000..c558b4b0a --- /dev/null +++ b/runsc/container/BUILD @@ -0,0 +1,45 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "container", + srcs = [ + "container.go", + "hook.go", + "status.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/container", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/log", + "//pkg/sentry/control", + "//runsc/boot", + "//runsc/sandbox", + "//runsc/specutils", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) + +go_test( + name = "container_test", + size = "small", + srcs = ["container_test.go"], + pure = "on", + rundir = ".", + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/unet", + "//runsc/boot", + "//runsc/cmd", + "//runsc/container", + "@com_github_google_subcommands//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/container/container.go b/runsc/container/container.go new file mode 100644 index 000000000..97115cd6b --- /dev/null +++ b/runsc/container/container.go @@ -0,0 +1,380 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package container creates and manipulates containers. +package container + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "regexp" + "strconv" + "syscall" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// metadataFilename is the name of the metadata file relative to the container +// root directory that holds sandbox metadata. +const metadataFilename = "meta.json" + +// validateID validates the container id. +func validateID(id string) error { + // See libcontainer/factory_linux.go. 
+ idRegex := regexp.MustCompile(`^[\w+-\.]+$`) + if !idRegex.MatchString(id) { + return fmt.Errorf("invalid container id: %v", id) + } + return nil +} + +// Container represents a containerized application. When running, the +// container is associated with a single Sandbox. +// +// Container metadata can be saved and loaded to disk. Within a root directory, +// we maintain subdirectories for each container named with the container id. +// The container metadata is is stored as json within the container directory +// in a file named "meta.json". This metadata format is defined by us, and is +// not part of the OCI spec. +// +// Containers must write their metadata file after any change to their internal +// state. The entire container directory is deleted when the container is +// destroyed. +type Container struct { + // ID is the container ID. + ID string `json:"id"` + + // Spec is the OCI runtime spec that configures this container. + Spec *specs.Spec `json:"spec"` + + // BundleDir is the directory containing the container bundle. + BundleDir string `json:"bundleDir"` + + // Root is the directory containing the container metadata file. + Root string `json:"root"` + + // CreatedAt is the time the container was created. + CreatedAt time.Time `json:"createdAt"` + + // Owner is the container owner. + Owner string `json:"owner"` + + // ConsoleSocket is the path to a unix domain socket that will receive + // the console FD. It is only used during create, so we don't need to + // store it in the metadata. + ConsoleSocket string `json:"-"` + + // Status is the current container Status. + Status Status `json:"status"` + + // Sandbox is the sandbox this container is running in. It will be nil + // if the container is not in state Running or Created. + Sandbox *sandbox.Sandbox `json:"sandbox"` +} + +// Load loads a container with the given id from a metadata file. +func Load(rootDir, id string) (*Container, error) { + log.Debugf("Load container %q %q", rootDir, id) + if err := validateID(id); err != nil { + return nil, err + } + cRoot := filepath.Join(rootDir, id) + if !exists(cRoot) { + return nil, fmt.Errorf("container with id %q does not exist", id) + } + metaFile := filepath.Join(cRoot, metadataFilename) + if !exists(metaFile) { + return nil, fmt.Errorf("container with id %q does not have metadata file %q", id, metaFile) + } + metaBytes, err := ioutil.ReadFile(metaFile) + if err != nil { + return nil, fmt.Errorf("error reading container metadata file %q: %v", metaFile, err) + } + var c Container + if err := json.Unmarshal(metaBytes, &c); err != nil { + return nil, fmt.Errorf("error unmarshaling container metadata from %q: %v", metaFile, err) + } + + // If the status is "Running" or "Created", check that the sandbox + // process still exists, and set it to Stopped if it does not. + // + // This is inherently racey. + if c.Status == Running || c.Status == Created { + // Send signal 0 to check if container still exists. + if err := c.Signal(0); err != nil { + // Container no longer exists. + c.Status = Stopped + c.Sandbox = nil + } + } + + return &c, nil +} + +// List returns all container ids in the given root directory. 
+func List(rootDir string) ([]string, error) { + log.Debugf("List containers %q", rootDir) + fs, err := ioutil.ReadDir(rootDir) + if err != nil { + return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err) + } + var out []string + for _, f := range fs { + out = append(out, f.Name()) + } + return out, nil +} + +// Create creates the container in a new Sandbox process, unless the metadata +// indicates that an existing Sandbox should be used. +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (*Container, error) { + log.Debugf("Create container %q in root dir: %s", id, conf.RootDir) + if err := validateID(id); err != nil { + return nil, err + } + if err := specutils.ValidateSpec(spec); err != nil { + return nil, err + } + + containerRoot := filepath.Join(conf.RootDir, id) + if exists(containerRoot) { + return nil, fmt.Errorf("container with id %q already exists: %q ", id, containerRoot) + } + + c := &Container{ + ID: id, + Spec: spec, + ConsoleSocket: consoleSocket, + BundleDir: bundleDir, + Root: containerRoot, + Status: Creating, + Owner: os.Getenv("USER"), + } + + // TODO: If the metadata annotations indicates that this + // container should be started in another sandbox, we must do so. The + // metadata will indicate the ID of the sandbox, which is the same as + // the ID of the init container in the sandbox. We can look up that + // init container by ID to get the sandbox, then we need to expose a + // way to run a new container in the sandbox. + + // Start a new sandbox for this container. Any errors after this point + // must destroy the container. + s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) + if err != nil { + c.Destroy() + return nil, err + } + + c.Sandbox = s + c.Status = Created + + // Save the metadata file. + if err := c.save(); err != nil { + c.Destroy() + return nil, err + } + + // Write the pid file. Containerd considers the create complete after + // this file is created, so it must be the last thing we do. + if pidFile != "" { + if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.Pid())), 0644); err != nil { + s.Destroy() + return nil, fmt.Errorf("error writing pid file: %v", err) + } + } + + return c, nil +} + +// Start starts running the containerized process inside the sandbox. +func (c *Container) Start(conf *boot.Config) error { + log.Debugf("Start container %q", c.ID) + if c.Status != Created { + return fmt.Errorf("cannot start container in state %s", c.Status) + } + + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container". + if c.Spec.Hooks != nil { + if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { + c.Destroy() + return err + } + } + + if err := c.Sandbox.Start(c.ID, c.Spec, conf); err != nil { + c.Destroy() + return err + } + + // "If any poststart hook fails, the runtime MUST log a warning, but + // the remaining hooks and lifecycle continue as if the hook had + // succeeded". + if c.Spec.Hooks != nil { + executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) + } + + c.Status = Running + return c.save() +} + +// Run is a helper that calls Create + Start + Wait. 
+func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (syscall.WaitStatus, error) { + log.Debugf("Run container %q in root dir: %s", id, conf.RootDir) + c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile) + if err != nil { + return 0, fmt.Errorf("error creating container: %v", err) + } + if err := c.Start(conf); err != nil { + return 0, fmt.Errorf("error starting container: %v", err) + } + return c.Wait() +} + +// Execute runs the specified command in the container. +func (c *Container) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { + log.Debugf("Execute in container %q, args: %+v", c.ID, e) + if c.Status != Created && c.Status != Running { + return 0, fmt.Errorf("cannot exec in container in state %s", c.Status) + } + return c.Sandbox.Execute(c.ID, e) +} + +// Event returns events for the container. +func (c *Container) Event() (*boot.Event, error) { + log.Debugf("Getting events for container %q", c.ID) + if c.Status != Running && c.Status != Created { + return nil, fmt.Errorf("cannot get events for container in state: %s", c.Status) + } + return c.Sandbox.Event(c.ID) +} + +// Pid returns the Pid of the sandbox the container is running in, or -1 if the +// container is not running. +func (c *Container) Pid() int { + if c.Status != Running && c.Status != Created { + return -1 + } + return c.Sandbox.Pid +} + +// Wait waits for the container to exit, and returns its WaitStatus. +func (c *Container) Wait() (syscall.WaitStatus, error) { + log.Debugf("Wait on container %q", c.ID) + return c.Sandbox.Wait(c.ID) +} + +// Signal sends the signal to the container. +func (c *Container) Signal(sig syscall.Signal) error { + log.Debugf("Signal container %q", c.ID) + if c.Status == Stopped { + log.Warningf("container %q not running, not sending signal %v", c.ID, sig) + return nil + } + return c.Sandbox.Signal(c.ID, sig) +} + +// State returns the metadata of the container. +func (c *Container) State() specs.State { + return specs.State{ + Version: specs.Version, + ID: c.ID, + Status: c.Status.String(), + Pid: c.Pid(), + Bundle: c.BundleDir, + } +} + +// Processes retrieves the list of processes and associated metadata inside a +// container. +func (c *Container) Processes() ([]*control.Process, error) { + if c.Status != Running { + return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", c.ID, c.Status) + } + return c.Sandbox.Processes(c.ID) +} + +// Destroy frees all resources associated with the container. +func (c *Container) Destroy() error { + log.Debugf("Destroy container %q", c.ID) + + // First stop the container. + if err := c.Sandbox.Stop(c.ID); err != nil { + return err + } + + // Then destroy all the metadata. + if err := os.RemoveAll(c.Root); err != nil { + log.Warningf("Failed to delete container root directory %q, err: %v", c.Root, err) + } + + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded". + if c.Spec.Hooks != nil && (c.Status == Created || c.Status == Running) { + executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) + } + + if err := os.RemoveAll(c.Root); err != nil { + log.Warningf("Failed to delete container root directory %q, err: %v", c.Root, err) + } + + // If we are the first container in the sandbox, take the sandbox down + // as well. 
+ if c.Sandbox != nil && c.Sandbox.ID == c.ID { + if err := c.Sandbox.Destroy(); err != nil { + log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) + } + } + + c.Sandbox = nil + c.Status = Stopped + return nil +} + +// save saves the container metadata to a file. +func (c *Container) save() error { + log.Debugf("Save container %q", c.ID) + if err := os.MkdirAll(c.Root, 0711); err != nil { + return fmt.Errorf("error creating container root directory %q: %v", c.Root, err) + } + meta, err := json.Marshal(c) + if err != nil { + return fmt.Errorf("error marshaling container metadata: %v", err) + } + metaFile := filepath.Join(c.Root, metadataFilename) + if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { + return fmt.Errorf("error writing container metadata: %v", err) + } + return nil +} + +// exists returns true if the given file exists. +func exists(f string) bool { + if _, err := os.Stat(f); err == nil { + return true + } else if !os.IsNotExist(err) { + log.Warningf("error checking for file %q: %v", f, err) + } + return false +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go new file mode 100644 index 000000000..67efd2f9e --- /dev/null +++ b/runsc/container/container_test.go @@ -0,0 +1,669 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container_test + +import ( + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/signal" + "path/filepath" + "reflect" + "strings" + "syscall" + "testing" + "time" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cmd" + "gvisor.googlesource.com/gvisor/runsc/container" +) + +func init() { + log.SetLevel(log.Debug) +} + +// writeSpec writes the spec to disk in the given directory. +func writeSpec(dir string, spec *specs.Spec) error { + b, err := json.Marshal(spec) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) +} + +// newSpecWithArgs creates a simple spec with the given args suitable for use +// in tests. +func newSpecWithArgs(args ...string) *specs.Spec { + spec := &specs.Spec{ + // The host filesystem root is the container root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: args, + Env: []string{ + "PATH=" + os.Getenv("PATH"), + }, + }, + } + return spec +} + +// shutdownSignal will be sent to the sandbox in order to shut down cleanly. 
+const shutdownSignal = syscall.SIGUSR2 + +// setupContainer creates a bundle and root dir for the container, generates a +// test config, and writes the spec to config.json in the bundle dir. +func setupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { + rootDir, err = ioutil.TempDir("", "containers") + if err != nil { + return "", "", nil, fmt.Errorf("error creating root dir: %v", err) + } + + bundleDir, err = ioutil.TempDir("", "bundle") + if err != nil { + return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err) + } + + if err = writeSpec(bundleDir, spec); err != nil { + return "", "", nil, fmt.Errorf("error writing spec: %v", err) + } + + conf = &boot.Config{ + RootDir: rootDir, + Network: boot.NetworkNone, + // Don't add flags when calling subprocesses, since the test + // runner does not know about all the flags. We control the + // Config in the subprocess anyways, so it does not matter. + TestModeNoFlags: true, + } + + return rootDir, bundleDir, conf, nil +} + +// uniqueContainerID generates a unique container id for each test. +// +// The container id is used to create an abstract unix domain socket, which must +// be unique. While the container forbids creating two containers with the same +// name, sometimes between test runs the socket does not get cleaned up quickly +// enough, causing container creation to fail. +func uniqueContainerID() string { + return fmt.Sprintf("test-container-%d", time.Now().UnixNano()) +} + +// waitForProcessList waits for the given process list to show up in the container. +func waitForProcessList(s *container.Container, expected []*control.Process) error { + var got []*control.Process + for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { + var err error + got, err := s.Processes() + if err != nil { + return fmt.Errorf("error getting process data from container: %v", err) + } + if procListsEqual(got, expected) { + return nil + } + // Process might not have started, try again... + time.Sleep(10 * time.Millisecond) + } + return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(expected)) +} + +// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. +// It verifies after each step that the container can be loaded from disk, and +// has the correct status. +func TestLifecycle(t *testing.T) { + // The container will just sleep for a long time. We will kill it before + // it finishes sleeping. + spec := newSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := setupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + // Create the container. + id := uniqueContainerID() + if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { + t.Fatalf("error creating container: %v", err) + } + // Load the container from disk and check the status. + s, err := container.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Created; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // List should return the container id. 
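waitForProcessList above polls Container.Processes until the expected list appears or roughly ten seconds pass. Note that the loop's `got, err := s.Processes()` redeclares `got`, so the outer `got` printed in the timeout error is always empty. Here is a self-contained sketch of the same poll-until-deadline pattern without the shadowing; the helper name and signature are illustrative only.

package main

import (
	"fmt"
	"time"
)

// pollUntil calls check every interval until it reports success, returns an
// error, or the timeout expires.
func pollUntil(timeout, interval time.Duration, check func() (bool, error)) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		ok, err := check()
		if err != nil {
			return err
		}
		if ok {
			return nil
		}
		time.Sleep(interval)
	}
	return fmt.Errorf("condition not met within %v", timeout)
}

func main() {
	start := time.Now()
	err := pollUntil(2*time.Second, 10*time.Millisecond, func() (bool, error) {
		return time.Since(start) > 50*time.Millisecond, nil
	})
	fmt.Println(err) // <nil>
}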
+ ids, err := container.List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { + t.Errorf("container list got %v, want %v", got, want) + } + + // Start the container. + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + // Load the container from disk and check the status. + s, err = container.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } + + // Send the container a signal, which we catch and use to cleanly + // shutdown. + if err := s.Signal(shutdownSignal); err != nil { + t.Fatalf("error sending signal %v to container: %v", shutdownSignal, err) + } + // Wait for it to die. + if _, err := s.Wait(); err != nil { + t.Fatalf("error waiting on container: %v", err) + } + // Load the container from disk and check the status. + s, err = container.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Stopped; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Destroy the container. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + + // List should not return the container id. + ids, err = container.List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected container list to be empty, but got %v", ids) + } + + // Loading the container by id should fail. + if _, err = container.Load(rootDir, id); err == nil { + t.Errorf("expected loading destroyed container to fail, but it did not") + } +} + +// Test the we can execute the application with different path formats. +func TestExePath(t *testing.T) { + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + } { + spec := newSpecWithArgs(test.path) + rootDir, bundleDir, conf, err := setupContainer(spec) + if err != nil { + t.Fatalf("exec: %s, error setting up container: %v", test.path, err) + } + + ws, err := container.Run(uniqueContainerID(), spec, conf, bundleDir, "", "") + + os.RemoveAll(rootDir) + os.RemoveAll(bundleDir) + + if test.success { + if err != nil { + t.Errorf("exec: %s, error running container: %v", test.path, err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: %s, got: no error, want: error", test.path) + } + } + } +} + +// Test the we can retrieve the application exit status from the container. +func TestAppExitStatus(t *testing.T) { + // First container will succeed. 
+ succSpec := newSpecWithArgs("true") + + rootDir, bundleDir, conf, err := setupContainer(succSpec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + ws, err := container.Run(uniqueContainerID(), succSpec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error running container: %v", err) + } + if ws.ExitStatus() != 0 { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0) + } + + // Second container exits with non-zero status. + wantStatus := 123 + errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) + + rootDir2, bundleDir2, conf, err := setupContainer(errSpec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir2) + defer os.RemoveAll(bundleDir2) + + ws, err = container.Run(uniqueContainerID(), succSpec, conf, bundleDir2, "", "") + if err != nil { + t.Fatalf("error running container: %v", err) + } + if ws.ExitStatus() != wantStatus { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus) + } +} + +// TestExec verifies that a container can exec a new program. +func TestExec(t *testing.T) { + const uid = 343 + spec := newSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := setupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + s, err := container.Create(uniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Error(err) + } + + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + } + + // Verify that "sleep 100" and "sleep 5" are running after exec. + // First, start running exec (whick blocks). + status := make(chan error, 1) + go func() { + exitStatus, err := s.Execute(&execArgs) + if err != nil { + status <- err + } else if exitStatus != 0 { + status <- fmt.Errorf("failed with exit status: %v", exitStatus) + } else { + status <- nil + } + }() + + if err := waitForProcessList(s, expectedPL); err != nil { + t.Fatal(err) + } + + // Ensure that exec finished without error. + select { + case <-time.After(10 * time.Second): + t.Fatalf("container timed out waiting for exec to finish.") + case st := <-status: + if st != nil { + t.Errorf("container failed to exec %v: %v", execArgs, err) + } + } +} + +// TestCapabilities verifies that: +// - Running exec as non-root UID and GID will result in an error (because the +// executable file can't be read). +// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips +// this check. +func TestCapabilities(t *testing.T) { + const uid = 343 + const gid = 2401 + spec := newSpecWithArgs("sleep", "100") + + // We generate files in the host temporary directory. 
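TestExec above handles the blocking Execute call by running it in a goroutine that reports into a buffered channel, then selecting on that channel and a timer. Below is a self-contained sketch of that wait-with-timeout pattern; runWithTimeout is an illustrative name.

package main

import (
	"errors"
	"fmt"
	"time"
)

// runWithTimeout runs a blocking call in a goroutine and waits for either its
// result or the timeout. The channel is buffered so the goroutine can always
// send its result and exit, even after a timeout.
func runWithTimeout(call func() error, timeout time.Duration) error {
	status := make(chan error, 1)
	go func() { status <- call() }()
	select {
	case err := <-status:
		return err
	case <-time.After(timeout):
		return errors.New("timed out waiting for call to finish")
	}
}

func main() {
	err := runWithTimeout(func() error {
		time.Sleep(100 * time.Millisecond) // stand-in for s.Execute(&execArgs)
		return nil
	}, 10*time.Second)
	fmt.Println(err) // <nil>
}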
+ spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: os.TempDir(), + Source: os.TempDir(), + Type: "bind", + }) + + rootDir, bundleDir, conf, err := setupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + s, err := container.Create(uniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + }, + } + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Fatalf("Failed to wait for sleep to start, err: %v", err) + } + + // Create an executable that can't be run with the specified UID:GID. + // This shouldn't be callable within the container until we add the + // CAP_DAC_OVERRIDE capability to skip the access check. + exePath := filepath.Join(rootDir, "exe") + if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { + t.Fatalf("couldn't create executable: %v", err) + } + defer os.Remove(exePath) + + // Need to traverse the intermediate directory. + os.Chmod(rootDir, 0755) + + execArgs := control.ExecArgs{ + Filename: exePath, + Argv: []string{exePath}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + KGID: gid, + Capabilities: &auth.TaskCapabilities{}, + } + + // "exe" should fail because we don't have the necessary permissions. + if _, err := s.Execute(&execArgs); err == nil { + t.Fatalf("container executed without error, but an error was expected") + } + + // Now we run with the capability enabled and should succeed. + execArgs.Capabilities = &auth.TaskCapabilities{ + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + } + // "exe" should not fail this time. + if _, err := s.Execute(&execArgs); err != nil { + t.Fatalf("container failed to exec %v: %v", execArgs, err) + } +} + +// Test that an tty FD is sent over the console socket if one is provided. +func TestConsoleSocket(t *testing.T) { + spec := newSpecWithArgs("true") + rootDir, bundleDir, conf, err := setupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create a named socket and start listening. We use a relative path + // to avoid overflowing the unix path length limit (108 chars). + socketPath := filepath.Join(bundleDir, "socket") + cwd, err := os.Getwd() + if err != nil { + t.Fatalf("error getting cwd: %v", err) + } + socketRelPath, err := filepath.Rel(cwd, socketPath) + if err != nil { + t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + } + if len(socketRelPath) > len(socketPath) { + socketRelPath = socketPath + } + srv, err := unet.BindAndListen(socketRelPath, false) + if err != nil { + t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) + } + defer os.Remove(socketPath) + + // Create the container and pass the socket name. 
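TestConsoleSocket above binds its listener to a path relative to the working directory because a Unix domain socket path must fit in sun_path, roughly 108 bytes on Linux, and temporary bundle paths can exceed that. A small sketch of that workaround follows; shortestSocketPath is an illustrative helper that simply falls back to the original path when the relative form is no shorter.

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// shortestSocketPath returns the shorter of path and its form relative to the
// current working directory, so the result is more likely to fit the Unix
// socket path limit.
func shortestSocketPath(path string) (string, error) {
	cwd, err := os.Getwd()
	if err != nil {
		return "", err
	}
	rel, err := filepath.Rel(cwd, path)
	if err != nil || len(rel) > len(path) {
		return path, nil
	}
	return rel, nil
}

func main() {
	p, err := shortestSocketPath("/tmp/some/bundle/socket")
	if err != nil {
		panic(err)
	}
	fmt.Println(p)
}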
+ id := uniqueContainerID() + s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + + // Open the othe end of the socket. + sock, err := srv.Accept() + if err != nil { + t.Fatalf("error accepting socket connection: %v", err) + } + + // Allow 3 fds to be received. We only expect 1. + r := sock.Reader(true /* blocking */) + r.EnableFDs(1) + + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + t.Fatalf("error reading from socket connection: %v", err) + } + + // We should have gotten a control message. + fds, err := r.ExtractFDs() + if err != nil { + t.Fatalf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + t.Fatalf("got %d fds from socket, wanted 1", len(fds)) + } + + // Verify that the fd is a terminal. + if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) + } + + // Shut it down. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + + // Close socket. + if err := srv.Close(); err != nil { + t.Fatalf("error destroying container: %v", err) + } +} + +func TestSpecUnsupported(t *testing.T) { + spec := newSpecWithArgs("/bin/true") + spec.Process.SelinuxLabel = "somelabel" + + // These are normally set by docker and will just cause warnings to be logged. + spec.Process.ApparmorProfile = "someprofile" + spec.Linux = &specs.Linux{Seccomp: &specs.LinuxSeccomp{}} + + rootDir, bundleDir, conf, err := setupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + id := uniqueContainerID() + _, err = container.Create(id, spec, conf, bundleDir, "", "") + if err == nil || !strings.Contains(err.Error(), "is not supported") { + t.Errorf("container.Create() wrong error, got: %v, want: *is not supported, spec.Process: %+v", err, spec.Process) + } +} + +// procListsEqual is used to check whether 2 Process lists are equal for all +// implemented fields. +func procListsEqual(got, want []*control.Process) bool { + if len(got) != len(want) { + return false + } + for i := range got { + pd1 := got[i] + pd2 := want[i] + // Zero out unimplemented and timing dependant fields. + pd1.Time, pd2.Time = "", "" + pd1.STime, pd2.STime = "", "" + pd1.C, pd2.C = 0, 0 + if *pd1 != *pd2 { + return false + } + } + return true +} + +func procListToString(pl []*control.Process) string { + strs := make([]string, 0, len(pl)) + for _, p := range pl { + strs = append(strs, fmt.Sprintf("%+v", p)) + } + return fmt.Sprintf("[%s]", strings.Join(strs, ",")) +} + +// TestMain acts like runsc if it is called with the "boot" argument, otherwise +// it just runs the tests. This is required because creating a container will +// call "/proc/self/exe boot". Normally /proc/self/exe is the runsc binary, +// but for tests we have to fake it. +func TestMain(m *testing.M) { + // exit writes coverage data before exiting. + exit := func(status int) { + os.Exit(status) + } + + if !flag.Parsed() { + flag.Parse() + } + + // If we are passed one of the commands then run it. + subcommands.Register(new(cmd.Boot), "boot") + subcommands.Register(new(cmd.Gofer), "gofer") + switch flag.Arg(0) { + case "boot", "gofer": + // Run the command in a goroutine so we can block the main + // thread waiting for shutdownSignal. 
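TestConsoleSocket above checks that the received descriptor is a terminal by issuing a TCGETS ioctl (the test's error message reads "TGGETS", which appears to be a typo for TCGETS). A standalone sketch of that check using the same golang.org/x/sys/unix call:

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// isTerminal reports whether fd refers to a terminal: the TCGETS ioctl only
// succeeds on TTY file descriptors.
func isTerminal(fd int) bool {
	_, err := unix.IoctlGetTermios(fd, unix.TCGETS)
	return err == nil
}

func main() {
	fmt.Println("stdin is a terminal:", isTerminal(int(os.Stdin.Fd())))
}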
+ go func() { + conf := &boot.Config{ + RootDir: "unused-root-dir", + Network: boot.NetworkNone, + } + var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode != subcommands.ExitSuccess { + panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode)) + } + // Container exited normally. Shut down this process. + os.Exit(ws.ExitStatus()) + }() + + // Shutdown cleanly when the shutdownSignal is received. This + // allows us to write coverage data before exiting. + sigc := make(chan os.Signal, 1) + signal.Notify(sigc, shutdownSignal) + <-sigc + exit(0) + default: + // Otherwise run the tests. + exit(m.Run()) + } +} diff --git a/runsc/container/hook.go b/runsc/container/hook.go new file mode 100644 index 000000000..3d93ca0be --- /dev/null +++ b/runsc/container/hook.go @@ -0,0 +1,111 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "path/filepath" + "strings" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// This file implements hooks as defined in OCI spec: +// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22 +// +// "hooks":{ +// "prestart":[{ +// "path":"/usr/bin/dockerd", +// "args":[ +// "libnetwork-setkey", "arg2", +// ] +// }] +// }, + +// executeHooksBestEffort executes hooks and logs warning in case they fail. +// Runs all hooks, always. +func executeHooksBestEffort(hooks []specs.Hook, s specs.State) { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + log.Warningf("Failure to execute hook %+v, err: %v", h, err) + } + } +} + +// executeHooks executes hooks until the first one fails or they all execute. 
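executeHook below runs a hook binary and enforces the optional OCI timeout by waiting for the command in a goroutine while racing that wait against a timer, killing the process if the timer fires first. Here is a self-contained sketch of that run-with-deadline pattern; runWithDeadline is an illustrative name.

package main

import (
	"fmt"
	"os/exec"
	"time"
)

// runWithDeadline starts cmd and kills it if it has not finished within d.
func runWithDeadline(cmd *exec.Cmd, d time.Duration) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()
	select {
	case err := <-done:
		return err
	case <-time.After(d):
		cmd.Process.Kill()
		<-done // reap the killed process
		return fmt.Errorf("timeout after %v running %q", d, cmd.Path)
	}
}

func main() {
	err := runWithDeadline(exec.Command("sleep", "10"), 500*time.Millisecond)
	fmt.Println(err) // prints a timeout error after about half a second
}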
+func executeHooks(hooks []specs.Hook, s specs.State) error { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + return err + } + } + return nil +} + +func executeHook(h specs.Hook, s specs.State) error { + log.Debugf("Executing hook %+v, state: %+v", h, s) + + if strings.TrimSpace(h.Path) == "" { + return fmt.Errorf("empty path for hook") + } + if !filepath.IsAbs(h.Path) { + return fmt.Errorf("path for hook is not absolute: %q", h.Path) + } + + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + + c := make(chan error, 1) + go func() { + c <- cmd.Wait() + }() + + var timer <-chan time.Time + if h.Timeout != nil { + timer = time.After(time.Duration(*h.Timeout) * time.Second) + } + select { + case err := <-c: + if err != nil { + return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) + } + case <-timer: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) + } + + log.Debugf("Execute hook %q success!", h.Path) + return nil +} diff --git a/runsc/container/status.go b/runsc/container/status.go new file mode 100644 index 000000000..8da1b4e89 --- /dev/null +++ b/runsc/container/status.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +// Status enumerates container statuses. The statuses and their semantics are +// part of the runtime CLI spec. +type Status int + +const ( + // Creating indicates "the container is being created". + Creating Status = iota + + // Created indicates "the runtime has finished the create operation and + // the container process has neither exited nor executed the + // user-specified program". + Created + + // Running indicates "the container process has executed the + // user-specified program but has not exited". + Running + + // Stopped indicates "the container process has exited". + Stopped +) + +// String converts a Status to a string. These strings are part of the runtime +// CLI spec and should not be changed. +func (s Status) String() string { + switch s { + case Creating: + return "creating" + case Created: + return "created" + case Running: + return "running" + case Stopped: + return "stopped" + default: + return "unknown" + } + +} diff --git a/runsc/main.go b/runsc/main.go index 3311514d2..42c8ee315 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -109,6 +109,10 @@ func main() { // Create a new Config from the flags. 
conf := &boot.Config{ RootDir: *rootDir, + Debug: *debug, + LogFilename: *logFilename, + LogFormat: *logFormat, + DebugLogDir: *debugLogDir, FileAccess: fsAccess, Overlay: *overlay, Network: netType, diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index bdd95903e..e89b19552 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,16 +1,14 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", srcs = [ "console.go", - "hook.go", "namespace.go", "network.go", "sandbox.go", - "status.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/sandbox", visibility = [ @@ -30,24 +28,3 @@ go_library( "@org_golang_x_sys//unix:go_default_library", ], ) - -go_test( - name = "sandbox_test", - size = "small", - srcs = ["sandbox_test.go"], - pure = "on", - rundir = ".", - deps = [ - "//pkg/abi/linux", - "//pkg/log", - "//pkg/sentry/control", - "//pkg/sentry/kernel/auth", - "//pkg/unet", - "//runsc/boot", - "//runsc/cmd", - "//runsc/sandbox", - "@com_github_google_subcommands//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", - "@org_golang_x_sys//unix:go_default_library", - ], -) diff --git a/runsc/sandbox/hook.go b/runsc/sandbox/hook.go deleted file mode 100644 index 40b064cdc..000000000 --- a/runsc/sandbox/hook.go +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sandbox - -import ( - "bytes" - "encoding/json" - "fmt" - "os/exec" - "path/filepath" - "strings" - "time" - - specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/log" -) - -// This file implements hooks as defined in OCI spec: -// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22 -// -// "hooks":{ -// "prestart":[{ -// "path":"/usr/bin/dockerd", -// "args":[ -// "libnetwork-setkey", "arg2", -// ] -// }] -// }, - -// executeHooksBestEffort executes hooks and logs warning in case they fail. -// Runs all hooks, always. -func executeHooksBestEffort(hooks []specs.Hook, s specs.State) { - for _, h := range hooks { - if err := executeHook(h, s); err != nil { - log.Warningf("Failure to execute hook %+v, err: %v", h, err) - } - } -} - -// executeHooks executes hooks until the first one fails or they all execute. 
-func executeHooks(hooks []specs.Hook, s specs.State) error { - for _, h := range hooks { - if err := executeHook(h, s); err != nil { - return err - } - } - return nil -} - -func executeHook(h specs.Hook, s specs.State) error { - log.Debugf("Executing hook %+v, state: %+v", h, s) - - if strings.TrimSpace(h.Path) == "" { - return fmt.Errorf("empty path for hook") - } - if !filepath.IsAbs(h.Path) { - return fmt.Errorf("path for hook is not absolute: %q", h.Path) - } - - b, err := json.Marshal(s) - if err != nil { - return err - } - var stdout, stderr bytes.Buffer - cmd := exec.Cmd{ - Path: h.Path, - Args: h.Args, - Env: h.Env, - Stdin: bytes.NewReader(b), - Stdout: &stdout, - Stderr: &stderr, - } - if err := cmd.Start(); err != nil { - return err - } - - c := make(chan error, 1) - go func() { - c <- cmd.Wait() - }() - - var timer <-chan time.Time - if h.Timeout != nil { - timer = time.After(time.Duration(*h.Timeout) * time.Second) - } - select { - case err := <-c: - if err != nil { - return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) - } - case <-timer: - cmd.Process.Kill() - cmd.Wait() - return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) - } - - log.Debugf("Execute hook %q success!", h.Path) - return nil -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 34bd6ea67..5dfa4cf0b 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -16,13 +16,9 @@ package sandbox import ( - "encoding/json" "fmt" - "io/ioutil" "os" "os/exec" - "path/filepath" - "regexp" "strconv" "syscall" "time" @@ -38,308 +34,110 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) -// metadataFilename is the name of the metadata file relative to sandboxRoot -// that holds sandbox metadata. -const metadataFilename = "meta.json" - -// See libcontainer/factory_linux.go -var idRegex = regexp.MustCompile(`^[\w+-\.]+$`) - -// validateID validates the sandbox id. -func validateID(id string) error { - if !idRegex.MatchString(id) { - return fmt.Errorf("invalid sandbox id: %v", id) - } - return nil -} - -func validateSpec(spec *specs.Spec) error { - if spec.Process.SelinuxLabel != "" { - return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) - } - - // Docker uses AppArmor by default, so just log that it's being ignored. - if spec.Process.ApparmorProfile != "" { - log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) - } - // TODO: Apply seccomp to application inside sandbox. - if spec.Linux != nil && spec.Linux.Seccomp != nil { - log.Warningf("Seccomp spec is being ignored") - } - return nil -} - -// Sandbox wraps a child sandbox process, and is responsible for saving and -// loading sandbox metadata to disk. -// -// Within a root directory, we maintain subdirectories for each sandbox named -// with the sandbox id. The sandbox metadata is is stored as json within the -// sandbox directory in a file named "meta.json". This metadata format is -// defined by us, and is not part of the OCI spec. -// -// Sandboxes must write this metadata file after any change to their internal -// state. The entire sandbox directory is deleted when the sandbox is -// destroyed. +// Sandbox wraps a sandbox process. // -// TODO: Protect against concurrent changes to the sandbox metadata -// file. 
+// It is used to start/stop sandbox process (and associated processes like +// gofers), as well as for running and manipulating containers inside a running +// sandbox. type Sandbox struct { - // ID is the sandbox ID. + // ID is the id of the sandbox. By convention, this is the same ID as + // the first container run in the sandbox. ID string `json:"id"` - // Spec is the OCI runtime spec that configures this sandbox. - Spec *specs.Spec `json:"spec"` - - // BundleDir is the directory containing the sandbox bundle. - BundleDir string `json:"bundleDir"` - - // SandboxRoot is the directory containing the sandbox metadata file. - SandboxRoot string `json:"sandboxRoot"` - - // CreatedAt is the time the sandbox was created. - CreatedAt time.Time `json:"createdAt"` - - // Owner is the sandbox owner. - Owner string `json:"owner"` - - // ConsoleSocket is the path to a unix domain socket that will receive - // the console FD. It is only used during create, so we don't need to - // store it in the metadata. - ConsoleSocket string `json:"-"` - - // Pid is the pid of the running sandbox. Only valid if Status is - // Created or Running. + // Pid is the pid of the running sandbox. May be 0 is the sandbox is + // not running. Pid int `json:"pid"` - // GoferPid is the pid of the gofer running along side the sandbox. May be 0 - // if the gofer has been killed or it's not being used. + // GoferPid is the pid of the gofer running along side the sandbox. May + // be 0 if the gofer has been killed or it's not being used. GoferPid int `json:"goferPid"` - - // Status is the current sandbox Status. - Status Status `json:"status"` } -// Create creates the sandbox subprocess and writes the metadata file. Args -// are additional arguments that will be passed to the sandbox process. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (*Sandbox, error) { - log.Debugf("Create sandbox %q in root dir: %s", id, conf.RootDir) - if err := validateID(id); err != nil { - return nil, err - } - if err := validateSpec(spec); err != nil { - return nil, err - } - - sandboxRoot := filepath.Join(conf.RootDir, id) - if exists(sandboxRoot) { - return nil, fmt.Errorf("sandbox with id %q already exists: %q ", id, sandboxRoot) - } +// Create creates the sandbox process. +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, error) { + s := &Sandbox{ID: id} - s := &Sandbox{ - ID: id, - Spec: spec, - ConsoleSocket: consoleSocket, - BundleDir: bundleDir, - SandboxRoot: sandboxRoot, - Status: Creating, - Owner: os.Getenv("USER"), - } - - // Create sandbox process. If anything errors between now and the end of this - // function, we MUST clean up all sandbox resources. - if err := s.createProcesses(conf, args); err != nil { - s.Destroy() + binPath, err := specutils.BinPath() + if err != nil { return nil, err } - // Wait for the control server to come up (or timeout). The sandbox is - // not "created" until that happens. - if err := s.waitForCreated(10 * time.Second); err != nil { - s.Destroy() + // Create the gofer process. + ioFiles, err := s.createGoferProcess(spec, conf, bundleDir, binPath) + if err != nil { return nil, err } - s.Status = Created - s.CreatedAt = time.Now() - - // Save the metadata file. - if err := s.save(); err != nil { - s.Destroy() + // Create the sandbox process. 
+ if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles); err != nil { return nil, err } - // Write the pid file. Containerd consideres the create complete after - // this file is created, so it must be the last thing we do. - if pidFile != "" { - if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(s.Pid)), 0644); err != nil { - s.Destroy() - return nil, fmt.Errorf("error writing pid file: %v", err) - } - } - - return s, nil -} - -// Run is a helper that calls Create + Start + Wait. -func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (syscall.WaitStatus, error) { - s, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, args) - if err != nil { - return 0, fmt.Errorf("error creating sandbox: %v", err) - } - if err := s.Start(conf); err != nil { - return 0, fmt.Errorf("error starting sandbox: %v", err) - } - return s.Wait() -} - -// Load loads a sandbox from with the given id from a metadata file. -func Load(rootDir, id string) (*Sandbox, error) { - log.Debugf("Load sandbox %q %q", rootDir, id) - if err := validateID(id); err != nil { + // Wait for the control server to come up (or timeout). + if err := s.waitForCreated(10 * time.Second); err != nil { return nil, err } - sandboxRoot := filepath.Join(rootDir, id) - if !exists(sandboxRoot) { - return nil, fmt.Errorf("sandbox with id %q does not exist", id) - } - metaFile := filepath.Join(sandboxRoot, metadataFilename) - if !exists(metaFile) { - return nil, fmt.Errorf("sandbox with id %q does not have metadata file %q", id, metaFile) - } - metaBytes, err := ioutil.ReadFile(metaFile) - if err != nil { - return nil, fmt.Errorf("error reading sandbox metadata file %q: %v", metaFile, err) - } - var s Sandbox - if err := json.Unmarshal(metaBytes, &s); err != nil { - return nil, fmt.Errorf("error unmarshaling sandbox metadata from %q: %v", metaFile, err) - } - - // If the status is "Running" or "Created", check that the process - // still exists, and set it to Stopped if it does not. - // - // This is inherently racey. - if s.Status == Running || s.Status == Created { - // Send signal 0 to check if process exists. - if err := s.Signal(0); err != nil { - // Process no longer exists. - s.Status = Stopped - s.Pid = 0 - } - } - return &s, nil -} - -// List returns all sandbox ids in the given root directory. -func List(rootDir string) ([]string, error) { - log.Debugf("List sandboxes %q", rootDir) - fs, err := ioutil.ReadDir(rootDir) - if err != nil { - return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err) - } - var out []string - for _, f := range fs { - out = append(out, f.Name()) - } - return out, nil -} - -// State returns the metadata of the sandbox. -func (s *Sandbox) State() specs.State { - return specs.State{ - Version: specs.Version, - ID: s.ID, - Status: s.Status.String(), - Pid: s.Pid, - Bundle: s.BundleDir, - } + return s, nil } // Start starts running the containerized process inside the sandbox. -func (s *Sandbox) Start(conf *boot.Config) error { +func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error { log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid) - if s.Status != Created { - return fmt.Errorf("cannot start container in state %s", s.Status) - } - - // "If any prestart hook fails, the runtime MUST generate an error, - // stop and destroy the container". 
- if s.Spec.Hooks != nil { - if err := executeHooks(s.Spec.Hooks.Prestart, s.State()); err != nil { - s.Destroy() - return err - } - } - - c, err := s.connect() + conn, err := s.connect() if err != nil { - s.Destroy() return err } - defer c.Close() + defer conn.Close() // Configure the network. - if err := setupNetwork(c, s.Pid, s.Spec, conf); err != nil { - s.Destroy() + if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { return fmt.Errorf("error setting up network: %v", err) } // Send a message to the sandbox control server to start the // application. - if err := c.Call(boot.ApplicationStart, nil, nil); err != nil { - s.Destroy() - return fmt.Errorf("error starting application %v: %v", s.Spec.Process.Args, err) - } - - // "If any poststart hook fails, the runtime MUST log a warning, but - // the remaining hooks and lifecycle continue as if the hook had - // succeeded". - if s.Spec.Hooks != nil { - executeHooksBestEffort(s.Spec.Hooks.Poststart, s.State()) + // + // TODO: Pass in the container id (cid) here. The sandbox + // should start only that container. + if err := conn.Call(boot.ApplicationStart, nil, nil); err != nil { + return fmt.Errorf("error starting application %v: %v", spec.Process.Args, err) } - s.Status = Running - return s.save() + return nil } -// Processes retrieves the list of processes and associated metadata inside a -// sandbox. -func (s *Sandbox) Processes() ([]*control.Process, error) { - if s.Status != Running { - return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", s.ID, s.Status) - } - - c, err := s.connect() +// Processes retrieves the list of processes and associated metadata for a +// given container in this sandbox. +func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { + conn, err := s.connect() if err != nil { return nil, err } - defer c.Close() + defer conn.Close() var pl []*control.Process - if err := c.Call(boot.ApplicationProcesses, nil, &pl); err != nil { + // TODO: Pass in the container id (cid) here. The sandbox + // should return process info for only that container. + if err := conn.Call(boot.ApplicationProcesses, nil, &pl); err != nil { return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) } return pl, nil } -// Execute runs the specified command in the sandbox. -func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { - log.Debugf("Execute in sandbox %q, pid: %d, args: %+v", s.ID, s.Pid, e) - if s.Status != Created && s.Status != Running { - return 0, fmt.Errorf("cannot exec in container in state %s", s.Status) - } - - log.Debugf("Connecting to sandbox...") - c, err := s.connect() +// Execute runs the specified command in the container. +func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { + conn, err := s.connect() if err != nil { return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } - defer c.Close() + defer conn.Close() // Send a message to the sandbox control server to start the application. var waitStatus uint32 - if err := c.Call(boot.ApplicationExecute, e, &waitStatus); err != nil { + // TODO: Pass in the container id (cid) here. The sandbox + // should execute in the context of that container. 
+ if err := conn.Call(boot.ApplicationExecute, e, &waitStatus); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } @@ -347,60 +145,45 @@ func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { } // Event retrieves stats about the sandbox such as memory and CPU utilization. -func (s *Sandbox) Event() (*boot.Event, error) { - if s.Status != Running && s.Status != Created { - return nil, fmt.Errorf("cannot get events for container in state: %s", s.Status) - } - - c, err := s.connect() +func (s *Sandbox) Event(cid string) (*boot.Event, error) { + conn, err := s.connect() if err != nil { return nil, err } - defer c.Close() + defer conn.Close() var e boot.Event - if err := c.Call(boot.ApplicationEvent, nil, &e); err != nil { + // TODO: Pass in the container id (cid) here. The sandbox + // should return events only for that container. + if err := conn.Call(boot.ApplicationEvent, nil, &e); err != nil { return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err) } - e.ID = s.ID + e.ID = cid return &e, nil } func (s *Sandbox) connect() (*urpc.Client, error) { log.Debugf("Connecting to sandbox...") - c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) + conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) if err != nil { return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } - return c, nil -} - -func (s *Sandbox) createProcesses(conf *boot.Config, args []string) error { - binPath, err := specutils.BinPath() - if err != nil { - return err - } - - ioFiles, err := s.createGoferProcess(conf, binPath, args) - if err != nil { - return err - } - return s.createSandboxProcess(conf, binPath, args, ioFiles) + return conn, nil } -func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonArgs []string) ([]*os.File, error) { +func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) ([]*os.File, error) { if conf.FileAccess != boot.FileAccessProxy { // Don't start a gofer. The sandbox will access host FS directly. return nil, nil } - var args []string - args = append(args, commonArgs...) - args = append(args, "gofer", "--bundle", s.BundleDir) + // Start with the general config flags. + args := conf.ToFlags() + args = append(args, "gofer", "--bundle", bundleDir) - // Start with root mount and then add any other additional mount. + // Add root mount and then add any other additional mounts. mountCount := 1 - for _, m := range s.Spec.Mounts { + for _, m := range spec.Mounts { if specutils.Is9PMount(m) { mountCount++ } @@ -429,8 +212,8 @@ func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonAr // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the // users in the sandbox. - setUIDGIDMappings(cmd, s.Spec) - nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, s.Spec) + setUIDGIDMappings(cmd, spec) + nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) @@ -444,7 +227,7 @@ func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonAr // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. 
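Start, Processes, Execute and Event above all repeat the same sequence: connect to the sandbox's control server, issue one urpc call, and close the connection. Below is a sketch of that sequence factored into a single helper; it assumes the Sandbox type and connect method from this file, and that the boot.Application* method identifiers are plain strings, as their use here suggests.

// call connects to the control server, issues a single RPC and closes the
// connection.
func (s *Sandbox) call(method string, arg, result interface{}) error {
	conn, err := s.connect()
	if err != nil {
		return err
	}
	defer conn.Close()
	return conn.Call(method, arg, result)
}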
-func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, commonArgs []string, ioFiles []*os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -457,13 +240,13 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } - consoleEnabled := s.ConsoleSocket != "" + consoleEnabled := consoleSocket != "" - cmd := exec.Command(binPath, commonArgs...) + cmd := exec.Command(binPath, conf.ToFlags()...) cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.Args = append(cmd.Args, "boot", - "--bundle", s.BundleDir, + "--bundle", bundleDir, "--controller-fd="+strconv.Itoa(nextFD), fmt.Sprintf("--console=%t", consoleEnabled)) nextFD++ @@ -485,9 +268,9 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common if consoleEnabled { // setupConsole will send the master on the socket, and return // the slave. - tty, err := setupConsole(s.ConsoleSocket) + tty, err := setupConsole(consoleSocket) if err != nil { - return fmt.Errorf("error setting up control socket %q: %v", s.ConsoleSocket, err) + return fmt.Errorf("error setting up control socket %q: %v", consoleSocket, err) } defer tty.Close() @@ -535,7 +318,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the // namespace. - if ns, ok := getNS(specs.NetworkNamespace, s.Spec); ok && conf.Network != boot.NetworkNone { + if ns, ok := getNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) } else { @@ -549,10 +332,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common // - Gofer: when using a Gofer, the sandbox process can run isolated in an // empty namespace. if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { - if userns, ok := getNS(specs.UserNamespace, s.Spec); ok { + if userns, ok := getNS(specs.UserNamespace, spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) nss = append(nss, userns) - setUIDGIDMappings(cmd, s.Spec) + setUIDGIDMappings(cmd, spec) } else { log.Infof("Sandbox will be started in the current user namespace") } @@ -596,8 +379,10 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { } // Wait waits for the containerized process to exit, and returns its WaitStatus. -func (s *Sandbox) Wait() (syscall.WaitStatus, error) { - log.Debugf("Wait on sandbox %q with pid %d", s.ID, s.Pid) +func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { + // TODO: This waits on the sandbox process. We need a way + // to wait on an individual container in the sandbox. + p, err := os.FindProcess(s.Pid) if err != nil { // "On Unix systems, FindProcess always succeeds and returns a @@ -611,6 +396,13 @@ func (s *Sandbox) Wait() (syscall.WaitStatus, error) { return ps.Sys().(syscall.WaitStatus), nil } +// Stop stops the container in the sandbox. 
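createSandboxProcess above starts its nextFD counter at 3 because a child started through os/exec sees stdin, stdout and stderr as descriptors 0 through 2, and each entry of cmd.ExtraFiles then becomes descriptor 3, 4, and so on, in order. A runnable sketch of that convention, independent of the sandbox code:

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	f, err := os.Open("/dev/null")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// The child lists its own descriptors; the donated file shows up as FD 3.
	cmd := exec.Command("ls", "-l", "/proc/self/fd")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.ExtraFiles = []*os.File{f}
	if err := cmd.Run(); err != nil {
		fmt.Println("run failed:", err)
	}
}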
+func (s *Sandbox) Stop(cid string) error { + // TODO: This should stop the container with the given ID + // in the sandbox. + return nil +} + // Destroy frees all resources associated with the sandbox. func (s *Sandbox) Destroy() error { log.Debugf("Destroy sandbox %q", s.ID) @@ -625,60 +417,26 @@ func (s *Sandbox) Destroy() error { sendSignal(s.GoferPid, unix.SIGKILL) s.GoferPid = 0 } - if err := os.RemoveAll(s.SandboxRoot); err != nil { - log.Warningf("Failed to delete sandbox root directory %q, err: %v", s.SandboxRoot, err) - } - - // "If any poststop hook fails, the runtime MUST log a warning, but the - // remaining hooks and lifecycle continue as if the hook had succeeded". - if s.Spec.Hooks != nil && (s.Status == Created || s.Status == Running) { - executeHooksBestEffort(s.Spec.Hooks.Poststop, s.State()) - } - s.Status = Stopped return nil } -// Signal sends the signal to the sandbox. -func (s *Sandbox) Signal(sig syscall.Signal) error { +// Signal sends the signal to a container in the sandbox. +func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { log.Debugf("Signal sandbox %q", s.ID) - if s.Status == Stopped { - log.Warningf("sandbox %q not running, not sending signal %v to pid %d", s.ID, sig, s.Pid) - return nil - } + + // TODO: This sends a signal to the sandbox process, which + // will be forwarded to the first process in the sandbox. We need a way + // to send a signal to any container in the sandbox. + // to wait on an individual container in the sandbox. + return sendSignal(s.Pid, sig) } +// sendSignal sends a signal to the sandbox process. func sendSignal(pid int, sig syscall.Signal) error { if err := syscall.Kill(pid, sig); err != nil { return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) } return nil } - -// save saves the sandbox metadata to a file. -func (s *Sandbox) save() error { - log.Debugf("Save sandbox %q", s.ID) - if err := os.MkdirAll(s.SandboxRoot, 0711); err != nil { - return fmt.Errorf("error creating sandbox root directory %q: %v", s.SandboxRoot, err) - } - meta, err := json.Marshal(s) - if err != nil { - return fmt.Errorf("error marshaling sandbox metadata: %v", err) - } - metaFile := filepath.Join(s.SandboxRoot, metadataFilename) - if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { - return fmt.Errorf("error writing sandbox metadata: %v", err) - } - return nil -} - -// exists returns true if the given file exists. -func exists(f string) bool { - if _, err := os.Stat(f); err == nil { - return true - } else if !os.IsNotExist(err) { - log.Warningf("error checking for file %q: %v", f, err) - } - return false -} diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go deleted file mode 100644 index 1fac38a29..000000000 --- a/runsc/sandbox/sandbox_test.go +++ /dev/null @@ -1,665 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package sandbox_test - -import ( - "encoding/json" - "fmt" - "io" - "io/ioutil" - "os" - "os/signal" - "path/filepath" - "reflect" - "strings" - "syscall" - "testing" - "time" - - "context" - "flag" - "github.com/google/subcommands" - specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/control" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/cmd" - "gvisor.googlesource.com/gvisor/runsc/sandbox" -) - -func init() { - log.SetLevel(log.Debug) -} - -// writeSpec writes the spec to disk in the given directory. -func writeSpec(dir string, spec *specs.Spec) error { - b, err := json.Marshal(spec) - if err != nil { - return err - } - return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) -} - -// newSpecWithArgs creates a simple spec with the given args suitable for use -// in tests. -func newSpecWithArgs(args ...string) *specs.Spec { - spec := &specs.Spec{ - // The host filesystem root is the sandbox root. - Root: &specs.Root{ - Path: "/", - Readonly: true, - }, - Process: &specs.Process{ - Args: args, - Env: []string{ - "PATH=" + os.Getenv("PATH"), - }, - }, - } - return spec -} - -// shutdownSignal will be sent to the sandbox in order to shut down cleanly. -const shutdownSignal = syscall.SIGUSR2 - -// setupSandbox creates a bundle and root dir for the sandbox, generates a test -// config, and writes the spec to config.json in the bundle dir. -func setupSandbox(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { - rootDir, err = ioutil.TempDir("", "sandboxes") - if err != nil { - return "", "", nil, fmt.Errorf("error creating root dir: %v", err) - } - - bundleDir, err = ioutil.TempDir("", "bundle") - if err != nil { - return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err) - } - - if err = writeSpec(bundleDir, spec); err != nil { - return "", "", nil, fmt.Errorf("error writing spec: %v", err) - } - - conf = &boot.Config{ - RootDir: rootDir, - Network: boot.NetworkNone, - } - - return rootDir, bundleDir, conf, nil -} - -// uniqueSandboxID generates a unique sandbox id for each test. -// -// The sandbox id is used to create an abstract unix domain socket, which must -// be unique. While the sandbox forbids creating two sandboxes with the same -// name, sometimes between test runs the socket does not get cleaned up quickly -// enough, causing sandbox creation to fail. -func uniqueSandboxID() string { - return fmt.Sprintf("test-sandbox-%d", time.Now().UnixNano()) -} - -// waitForProcessList waits for the given process list to show up in the sandbox. -func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error { - var got []*control.Process - for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { - var err error - got, err := s.Processes() - if err != nil { - return fmt.Errorf("error getting process data from sandbox: %v", err) - } - if procListsEqual(got, expected) { - return nil - } - // Process might not have started, try again... - time.Sleep(10 * time.Millisecond) - } - return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected)) -} - -// TestLifecycle tests the basic Create/Start/Signal/Destroy sandbox lifecycle. 
-// It verifies after each step that the sandbox can be loaded from disk, and -// has the correct status. -func TestLifecycle(t *testing.T) { - // The sandbox will just sleep for a long time. We will kill it before - // it finishes sleeping. - spec := newSpecWithArgs("sleep", "100") - - rootDir, bundleDir, conf, err := setupSandbox(spec) - if err != nil { - t.Fatalf("error setting up sandbox: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // expectedPL lists the expected process state of the sandbox. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } - // Create the sandbox. - id := uniqueSandboxID() - if _, err := sandbox.Create(id, spec, conf, bundleDir, "", "", nil); err != nil { - t.Fatalf("error creating sandbox: %v", err) - } - // Load the sandbox from disk and check the status. - s, err := sandbox.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading sandbox: %v", err) - } - if got, want := s.Status, sandbox.Created; got != want { - t.Errorf("sandbox status got %v, want %v", got, want) - } - - // List should return the sandbox id. - ids, err := sandbox.List(rootDir) - if err != nil { - t.Fatalf("error listing sandboxes: %v", err) - } - if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { - t.Errorf("sandbox list got %v, want %v", got, want) - } - - // Start the sandbox. - if err := s.Start(conf); err != nil { - t.Fatalf("error starting sandbox: %v", err) - } - // Load the sandbox from disk and check the status. - s, err = sandbox.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading sandbox: %v", err) - } - if got, want := s.Status, sandbox.Running; got != want { - t.Errorf("sandbox status got %v, want %v", got, want) - } - - // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL); err != nil { - t.Error(err) - } - - // Send the sandbox a signal, which we catch and use to cleanly - // shutdown. - if err := s.Signal(shutdownSignal); err != nil { - t.Fatalf("error sending signal %v to sandbox: %v", shutdownSignal, err) - } - // Wait for it to die. - if _, err := s.Wait(); err != nil { - t.Fatalf("error waiting on sandbox: %v", err) - } - // Load the sandbox from disk and check the status. - s, err = sandbox.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading sandbox: %v", err) - } - if got, want := s.Status, sandbox.Stopped; got != want { - t.Errorf("sandbox status got %v, want %v", got, want) - } - - // Destroy the sandbox. - if err := s.Destroy(); err != nil { - t.Fatalf("error destroying sandbox: %v", err) - } - - // List should not return the sandbox id. - ids, err = sandbox.List(rootDir) - if err != nil { - t.Fatalf("error listing sandboxes: %v", err) - } - if len(ids) != 0 { - t.Errorf("expected sandbox list to be empty, but got %v", ids) - } - - // Loading the sandbox by id should fail. - if _, err = sandbox.Load(rootDir, id); err == nil { - t.Errorf("expected loading destroyed sandbox to fail, but it did not") - } -} - -// Test the we can execute the application with different path formats. 
-func TestExePath(t *testing.T) { - for _, test := range []struct { - path string - success bool - }{ - {path: "true", success: true}, - {path: "bin/true", success: true}, - {path: "/bin/true", success: true}, - {path: "thisfiledoesntexit", success: false}, - {path: "bin/thisfiledoesntexit", success: false}, - {path: "/bin/thisfiledoesntexit", success: false}, - } { - spec := newSpecWithArgs(test.path) - rootDir, bundleDir, conf, err := setupSandbox(spec) - if err != nil { - t.Fatalf("exec: %s, error setting up sandbox: %v", test.path, err) - } - - ws, err := sandbox.Run(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) - - os.RemoveAll(rootDir) - os.RemoveAll(bundleDir) - - if test.success { - if err != nil { - t.Errorf("exec: %s, error running sandbox: %v", test.path, err) - } - if ws.ExitStatus() != 0 { - t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) - } - } else { - if err == nil { - t.Errorf("exec: %s, got: no error, want: error", test.path) - } - } - } -} - -// Test the we can retrieve the application exit status from the sandbox. -func TestAppExitStatus(t *testing.T) { - // First sandbox will succeed. - succSpec := newSpecWithArgs("true") - - rootDir, bundleDir, conf, err := setupSandbox(succSpec) - if err != nil { - t.Fatalf("error setting up sandbox: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - ws, err := sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir, "", "", nil) - if err != nil { - t.Fatalf("error running sandbox: %v", err) - } - if ws.ExitStatus() != 0 { - t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0) - } - - // Second sandbox exits with non-zero status. - wantStatus := 123 - errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) - - rootDir2, bundleDir2, conf, err := setupSandbox(errSpec) - if err != nil { - t.Fatalf("error setting up sandbox: %v", err) - } - defer os.RemoveAll(rootDir2) - defer os.RemoveAll(bundleDir2) - - ws, err = sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir2, "", "", nil) - if err != nil { - t.Fatalf("error running sandbox: %v", err) - } - if ws.ExitStatus() != wantStatus { - t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus) - } -} - -// TestExec verifies that a sandbox can exec a new program. -func TestExec(t *testing.T) { - const uid = 343 - spec := newSpecWithArgs("sleep", "100") - - rootDir, bundleDir, conf, err := setupSandbox(spec) - if err != nil { - t.Fatalf("error setting up sandbox: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create and start the sandbox. - s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) - if err != nil { - t.Fatalf("error creating sandbox: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting sandbox: %v", err) - } - - // expectedPL lists the expected process state of the sandbox. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } - - // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL[:1]); err != nil { - t.Error(err) - } - - execArgs := control.ExecArgs{ - Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - } - - // Verify that "sleep 100" and "sleep 5" are running after exec. - // First, start running exec (whick blocks). 
- status := make(chan error, 1) - go func() { - exitStatus, err := s.Execute(&execArgs) - if err != nil { - status <- err - } else if exitStatus != 0 { - status <- fmt.Errorf("failed with exit status: %v", exitStatus) - } else { - status <- nil - } - }() - - if err := waitForProcessList(s, expectedPL); err != nil { - t.Fatal(err) - } - - // Ensure that exec finished without error. - select { - case <-time.After(10 * time.Second): - t.Fatalf("sandbox timed out waiting for exec to finish.") - case st := <-status: - if st != nil { - t.Errorf("sandbox failed to exec %v: %v", execArgs, err) - } - } -} - -// TestCapabilities verifies that: -// - Running exec as non-root UID and GID will result in an error (because the -// executable file can't be read). -// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips -// this check. -func TestCapabilities(t *testing.T) { - const uid = 343 - const gid = 2401 - spec := newSpecWithArgs("sleep", "100") - - // We generate files in the host temporary directory. - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: os.TempDir(), - Source: os.TempDir(), - Type: "bind", - }) - - rootDir, bundleDir, conf, err := setupSandbox(spec) - if err != nil { - t.Fatalf("error setting up sandbox: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create and start the sandbox. - s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) - if err != nil { - t.Fatalf("error creating sandbox: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting sandbox: %v", err) - } - - // expectedPL lists the expected process state of the sandbox. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "exe", - }, - } - if err := waitForProcessList(s, expectedPL[:1]); err != nil { - t.Fatalf("Failed to wait for sleep to start, err: %v", err) - } - - // Create an executable that can't be run with the specified UID:GID. - // This shouldn't be callable within the sandbox until we add the - // CAP_DAC_OVERRIDE capability to skip the access check. - exePath := filepath.Join(rootDir, "exe") - if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { - t.Fatalf("couldn't create executable: %v", err) - } - defer os.Remove(exePath) - - // Need to traverse the intermediate directory. - os.Chmod(rootDir, 0755) - - execArgs := control.ExecArgs{ - Filename: exePath, - Argv: []string{exePath}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - KGID: gid, - Capabilities: &auth.TaskCapabilities{}, - } - - // "exe" should fail because we don't have the necessary permissions. - if _, err := s.Execute(&execArgs); err == nil { - t.Fatalf("sandbox executed without error, but an error was expected") - } - - // Now we run with the capability enabled and should succeed. - execArgs.Capabilities = &auth.TaskCapabilities{ - EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), - } - // "exe" should not fail this time. - if _, err := s.Execute(&execArgs); err != nil { - t.Fatalf("sandbox failed to exec %v: %v", execArgs, err) - } -} - -// Test that an tty FD is sent over the console socket if one is provided. 
-func TestConsoleSocket(t *testing.T) { - spec := newSpecWithArgs("true") - rootDir, bundleDir, conf, err := setupSandbox(spec) - if err != nil { - t.Fatalf("error setting up sandbox: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create a named socket and start listening. We use a relative path - // to avoid overflowing the unix path length limit (108 chars). - socketPath := filepath.Join(bundleDir, "socket") - cwd, err := os.Getwd() - if err != nil { - t.Fatalf("error getting cwd: %v", err) - } - socketRelPath, err := filepath.Rel(cwd, socketPath) - if err != nil { - t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) - } - if len(socketRelPath) > len(socketPath) { - socketRelPath = socketPath - } - srv, err := unet.BindAndListen(socketRelPath, false) - if err != nil { - t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) - } - defer os.Remove(socketPath) - - // Create the sandbox and pass the socket name. - id := uniqueSandboxID() - s, err := sandbox.Create(id, spec, conf, bundleDir, socketRelPath, "", nil) - if err != nil { - t.Fatalf("error creating sandbox: %v", err) - } - - // Open the othe end of the socket. - sock, err := srv.Accept() - if err != nil { - t.Fatalf("error accepting socket connection: %v", err) - } - - // Allow 3 fds to be received. We only expect 1. - r := sock.Reader(true /* blocking */) - r.EnableFDs(1) - - // The socket is closed right after sending the FD, so EOF is - // an allowed error. - b := [][]byte{{}} - if _, err := r.ReadVec(b); err != nil && err != io.EOF { - t.Fatalf("error reading from socket connection: %v", err) - } - - // We should have gotten a control message. - fds, err := r.ExtractFDs() - if err != nil { - t.Fatalf("error extracting fds from socket connection: %v", err) - } - if len(fds) != 1 { - t.Fatalf("got %d fds from socket, wanted 1", len(fds)) - } - - // Verify that the fd is a terminal. - if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { - t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) - } - - // Shut it down. - if err := s.Destroy(); err != nil { - t.Fatalf("error destroying sandbox: %v", err) - } - - // Close socket. - if err := srv.Close(); err != nil { - t.Fatalf("error destroying sandbox: %v", err) - } -} - -func TestSpecUnsupported(t *testing.T) { - spec := newSpecWithArgs("/bin/true") - spec.Process.SelinuxLabel = "somelabel" - - // These are normally set by docker and will just cause warnings to be logged. - spec.Process.ApparmorProfile = "someprofile" - spec.Linux = &specs.Linux{Seccomp: &specs.LinuxSeccomp{}} - - rootDir, bundleDir, conf, err := setupSandbox(spec) - if err != nil { - t.Fatalf("error setting up sandbox: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - id := uniqueSandboxID() - _, err = sandbox.Create(id, spec, conf, bundleDir, "", "", nil) - if err == nil || !strings.Contains(err.Error(), "is not supported") { - t.Errorf("sandbox.Create() wrong error, got: %v, want: *is not supported, spec.Process: %+v", err, spec.Process) - } -} - -// procListsEqual is used to check whether 2 Process lists are equal for all -// implemented fields. -func procListsEqual(got, want []*control.Process) bool { - if len(got) != len(want) { - return false - } - for i := range got { - pd1 := got[i] - pd2 := want[i] - // Zero out unimplemented and timing dependant fields. 
- pd1.Time, pd2.Time = "", "" - pd1.STime, pd2.STime = "", "" - pd1.C, pd2.C = 0, 0 - if *pd1 != *pd2 { - return false - } - } - return true -} - -func procListToString(pl []*control.Process) string { - strs := make([]string, 0, len(pl)) - for _, p := range pl { - strs = append(strs, fmt.Sprintf("%+v", p)) - } - return fmt.Sprintf("[%s]", strings.Join(strs, ",")) -} - -// TestMain acts like runsc if it is called with the "boot" argument, otherwise -// it just runs the tests. This is required because creating a sandbox will -// call "/proc/self/exe boot". Normally /proc/self/exe is the runsc binary, -// but for tests we have to fake it. -func TestMain(m *testing.M) { - // exit writes coverage data before exiting. - exit := func(status int) { - os.Exit(status) - } - - if !flag.Parsed() { - flag.Parse() - } - - // If we are passed one of the commands then run it. - subcommands.Register(new(cmd.Boot), "boot") - subcommands.Register(new(cmd.Gofer), "gofer") - switch flag.Arg(0) { - case "boot", "gofer": - // Run the command in a goroutine so we can block the main - // thread waiting for shutdownSignal. - go func() { - conf := &boot.Config{ - RootDir: "unused-root-dir", - Network: boot.NetworkNone, - } - var ws syscall.WaitStatus - subcmdCode := subcommands.Execute(context.Background(), conf, &ws) - if subcmdCode != subcommands.ExitSuccess { - panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode)) - } - // Sandbox exited normally. Shut down this process. - os.Exit(ws.ExitStatus()) - }() - - // Shutdown cleanly when the shutdownSignal is received. This - // allows us to write coverage data before exiting. - sigc := make(chan os.Signal, 1) - signal.Notify(sigc, shutdownSignal) - <-sigc - exit(0) - default: - // Otherwise run the tests. - exit(m.Run()) - } -} diff --git a/runsc/sandbox/status.go b/runsc/sandbox/status.go deleted file mode 100644 index 6fc936aba..000000000 --- a/runsc/sandbox/status.go +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sandbox - -// Status enumerates sandbox statuses. The statuses and their semantics are -// part of the runtime CLI spec. -// -// TODO: Get precise about the transitions between statuses. -type Status int - -const ( - // Creating indicates "the container is being created". - Creating Status = iota - - // Created indicates "the runtime has finished the create operation and - // the container process has neither exited nor executed the - // user-specified program". - Created - - // Running indicates "the container process has executed the - // user-specified program but has not exited". - Running - - // Stopped indicates "the container process has exited". - Stopped -) - -// String converts a Status to a string. These strings are part of the runtime -// CLI spec and should not be changed. 
-func (s Status) String() string { - switch s { - case Creating: - return "creating" - case Created: - return "created" - case Running: - return "running" - case Stopped: - return "stopped" - default: - return "unknown" - } - -} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index dcb4b20db..5f455dec4 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -41,9 +41,28 @@ func LogSpec(spec *specs.Spec) { log.Debugf("Spec.Root: %+v", spec.Root) } +// ValidateSpec validates that the spec is compatible with runsc. +func ValidateSpec(spec *specs.Spec) error { + if spec.Process == nil { + return fmt.Errorf("Process must be defined") + } + if spec.Process.SelinuxLabel != "" { + return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) + } + + // Docker uses AppArmor by default, so just log that it's being ignored. + if spec.Process.ApparmorProfile != "" { + log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) + } + + // TODO: Apply seccomp to application inside sandbox. + if spec.Linux != nil && spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + return nil +} + // ReadSpec reads an OCI runtime spec from the given bundle directory. -// -// TODO: This should validate the spec. func ReadSpec(bundleDir string) (*specs.Spec, error) { // The spec file must be in "config.json" inside the bundle directory. specFile := filepath.Join(bundleDir, "config.json") -- cgit v1.2.3 From 31386185fe7c2079ee412a411e536a5bf9e9eb25 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 17 May 2018 11:54:36 -0700 Subject: Push signal-delivery and wait into the sandbox. This is another step towards multi-container support. Previously, we delivered signals directly to the sandbox process (which then forwarded the signal to PID 1 inside the sandbox). Similarly, we waited on a container by waiting on the sandbox process itself. This approach will not work when there are multiple containers inside the sandbox, and we need to signal/wait on individual containers. This CL adds two new messages, ContainerSignal and ContainerWait. These messages include the id of the container to signal/wait. The controller inside the sandbox receives these messages and signals/waits on the appropriate process inside the sandbox. The container id is plumbed into the sandbox, but it currently is not used. We still end up signaling/waiting on PID 1 in all cases. Once we actually have multiple containers inside the sandbox, we will need to keep some sort of map of container id -> pid (or possibly pid namespace), and signal/kill the appropriate process for the container. 
PiperOrigin-RevId: 197028366 Change-Id: I07b4d5dc91ecd2affc1447e6b4bdd6b0b7360895 --- runsc/boot/BUILD | 1 + runsc/boot/controller.go | 112 ++++++++++++++++++++++++++------------ runsc/boot/events.go | 7 ++- runsc/boot/loader.go | 21 ++++--- runsc/boot/loader_test.go | 14 ++--- runsc/cmd/boot.go | 10 ++-- runsc/container/container.go | 10 +++- runsc/container/container_test.go | 72 +++++++++++++----------- runsc/sandbox/sandbox.go | 80 ++++++++++++++++----------- 9 files changed, 200 insertions(+), 127 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 16522c668..1746df988 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -25,6 +25,7 @@ go_library( "//pkg/control/server", "//pkg/cpuid", "//pkg/log", + "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/control", "//pkg/sentry/fs", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 60c42fc19..8fc0a9076 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -18,30 +18,39 @@ import ( "fmt" "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" ) const ( - // ApplicationStart is the URPC endpoint for starting a sandboxed app. - ApplicationStart = "application.Start" + // ContainerEvent is the URPC endpoint for getting stats about the + // container used by "runsc events". + ContainerEvent = "containerManager.Event" - // ApplicationProcesses is the URPC endpoint for getting the list of - // processes running in a sandbox. - ApplicationProcesses = "application.Processes" + // ContainerExecute is the URPC endpoint for executing a command in a + // container.. + ContainerExecute = "containerManager.Execute" - // ApplicationExecute is the URPC endpoint for executing a command in a - // sandbox. - ApplicationExecute = "application.Execute" + // ContainerProcesses is the URPC endpoint for getting the list of + // processes running in a container. + ContainerProcesses = "containerManager.Processes" - // ApplicationEvent is the URPC endpoint for getting stats about the - // container used by "runsc events". - ApplicationEvent = "application.Event" + // ContainerSignal is used to send a signal to a container. + ContainerSignal = "containerManager.Signal" + + // ContainerWait is used to wait on the init process of the container + // and return its ExitStatus. + ContainerWait = "containerManager.Wait" // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links // and routes in a network stack. NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" + + // RootContainerStart is the URPC endpoint for starting a new sandbox + // with root container. + RootContainerStart = "containerManager.StartRoot" ) // ControlSocketAddr generates an abstract unix socket name for the given id. @@ -55,8 +64,8 @@ type controller struct { // srv is the contorl server. srv *server.Server - // app holds the application methods. - app *application + // manager holds the containerManager methods. + manager *containerManager } // newController creates a new controller and starts it listening. 
@@ -66,12 +75,12 @@ func newController(fd int, k *kernel.Kernel) (*controller, error) { return nil, err } - app := &application{ + manager := &containerManager{ startChan: make(chan struct{}), startResultChan: make(chan error), k: k, } - srv.Register(app) + srv.Register(manager) if eps, ok := k.NetworkStack().(*epsocket.Stack); ok { net := &Network{ @@ -85,44 +94,79 @@ func newController(fd int, k *kernel.Kernel) (*controller, error) { } return &controller{ - srv: srv, - app: app, + srv: srv, + manager: manager, }, nil } -// application contains methods that control the sandboxed application. -type application struct { - // startChan is used to signal when the application process should be - // started. +// containerManager manages sandboes containers. +type containerManager struct { + // startChan is used to signal when the root container process should + // be started. startChan chan struct{} - // startResultChan is used to signal when the application has started. Any - // errors encountered during startup will be sent to the channel. A nil value - // indicates success. + // startResultChan is used to signal when the root container has + // started. Any errors encountered during startup will be sent to the + // channel. A nil value indicates success. startResultChan chan error // k is the emulated linux kernel on which the sandboxed - // application runs. + // containers run. k *kernel.Kernel } -// Start will start the application process. -func (a *application) Start(_, _ *struct{}) error { - // Tell the application to start and wait for the result. - a.startChan <- struct{}{} - return <-a.startResultChan +// StartRoot will start the root container process. +func (cm *containerManager) StartRoot(_, _ *struct{}) error { + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + return <-cm.startResultChan } // Processes retrieves information about processes running in the sandbox. -func (a *application) Processes(_, out *[]*control.Process) error { - return control.Processes(a.k, out) +func (cm *containerManager) Processes(_, out *[]*control.Process) error { + return control.Processes(cm.k, out) } // Execute runs a command on a created or running sandbox. -func (a *application) Execute(e *control.ExecArgs, waitStatus *uint32) error { - proc := control.Proc{Kernel: a.k} +func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { + proc := control.Proc{Kernel: cm.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) } return nil } + +// Wait waits for the init process in the given container. +func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { + // TODO: Use the cid and wait on the init process in that + // container. Currently we just wait on PID 1 in the sandbox. + tg := cm.k.TaskSet().Root.ThreadGroupWithID(1) + if tg == nil { + return fmt.Errorf("cannot wait: no thread group with id 1") + } + tg.WaitExited() + *waitStatus = tg.ExitStatus().Status() + return nil +} + +// SignalArgs are arguments to the Signal method. +type SignalArgs struct { + // CID is the container id. + CID string + + // Signo is the signal to send to the process. + Signo int32 +} + +// Signal sends a signal to the init process of the container. +func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + // TODO: Use the cid and send the signal to the init + // process in theat container. Currently we just signal PID 1 in the + // sandbox. 
+ si := arch.SignalInfo{Signo: args.Signo} + t := cm.k.TaskSet().Root.TaskWithID(1) + if t == nil { + return fmt.Errorf("cannot signal: no task with id 1") + } + return t.SendSignal(&si) +} diff --git a/runsc/boot/events.go b/runsc/boot/events.go index ef6459b01..0eb75c14c 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -59,10 +59,11 @@ type Memory struct { Raw map[string]uint64 `json:"raw,omitempty"` } -func (a *application) Event(_ *struct{}, out *Event) error { +// Event gets the events from the container. +func (cm *containerManager) Event(_ *struct{}, out *Event) error { stats := &Stats{} - stats.populateMemory(a.k) - stats.populatePIDs(a.k) + stats.populateMemory(cm.k) + stats.populatePIDs(cm.k) *out = Event{Type: "stats", Data: stats} return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 34a25241f..0ff54d349 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package boot loads the kernel and runs the application. +// Package boot loads the kernel and runs a container.. package boot import ( @@ -57,7 +57,7 @@ import ( _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" ) -// Loader keeps state needed to start the kernel and run the application. +// Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. k *kernel.Kernel @@ -73,10 +73,10 @@ type Loader struct { watchdog *watchdog.Watchdog // stopSignalForwarding disables forwarding of signals to the sandboxed - // app. It should be called when a sandbox is destroyed. + // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() - // procArgs refers to the initial application task. + // procArgs refers to the root container task. procArgs kernel.CreateProcessArgs } @@ -283,10 +283,10 @@ func createPlatform(conf *Config) (platform.Platform, error) { } } -// Run runs the application. +// Run runs the root container.. func (l *Loader) Run() error { err := l.run() - l.ctrl.app.startResultChan <- err + l.ctrl.manager.startResultChan <- err if err != nil { // Give the controller some time to send the error to the // runtime. If we return too quickly here the process will exit @@ -321,7 +321,7 @@ func (l *Loader) run() error { } } - // Create the initial application task. + // Create the root container init task. if _, err := l.k.CreateProcess(l.procArgs); err != nil { return fmt.Errorf("failed to create init process: %v", err) } @@ -335,13 +335,12 @@ func (l *Loader) run() error { // WaitForStartSignal waits for a start signal from the control server. func (l *Loader) WaitForStartSignal() { - <-l.ctrl.app.startChan + <-l.ctrl.manager.startChan } -// WaitExit waits for the application to exit, and returns the application's -// exit status. +// WaitExit waits for the root container to exit, and returns its exit status. func (l *Loader) WaitExit() kernel.ExitStatus { - // Wait for application. + // Wait for container. l.k.WaitExited() return l.k.GlobalInit().ExitStatus() diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index c3d9887fa..d2e5fe74e 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -72,13 +72,13 @@ func TestRun(t *testing.T) { var wg sync.WaitGroup wg.Add(1) go func() { - resultChanErr = <-s.ctrl.app.startResultChan + resultChanErr = <-s.ctrl.manager.startResultChan wg.Done() }() - // Run the application. 
+ // Run the container.. if err := s.Run(); err != nil { - t.Errorf("error running application: %v", err) + t.Errorf("error running container: %v", err) } // We should have not gotten an error on the startResultChan. @@ -112,7 +112,7 @@ func TestStartSignal(t *testing.T) { go func() { s.WaitForStartSignal() // Pretend that Run() executed and returned no error. - s.ctrl.app.startResultChan <- nil + s.ctrl.manager.startResultChan <- nil waitFinished <- struct{}{} }() @@ -126,9 +126,9 @@ func TestStartSignal(t *testing.T) { // OK. } - // Trigger the control server Start method. - if err := s.ctrl.app.Start(nil, nil); err != nil { - t.Errorf("error calling Start: %v", err) + // Trigger the control server StartRoot method. + if err := s.ctrl.manager.StartRoot(nil, nil); err != nil { + t.Errorf("error calling StartRoot: %v", err) } // Now WaitForStartSignal should return (within a short amount of diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 0dad6da79..3bdc2ced0 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -111,21 +111,21 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - s, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) + l, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) if err != nil { Fatalf("error creating loader: %v", err) } - defer s.Destroy() + defer l.Destroy() // Wait for the start signal from runsc. - s.WaitForStartSignal() + l.WaitForStartSignal() // Run the application and wait for it to finish. - if err := s.Run(); err != nil { + if err := l.Run(); err != nil { Fatalf("error running sandbox: %v", err) } - ws := s.WaitExit() + ws := l.WaitExit() log.Infof("application exiting with %+v", ws) *waitStatus = syscall.WaitStatus(ws.Status()) return subcommands.ExitSuccess diff --git a/runsc/container/container.go b/runsc/container/container.go index 97115cd6b..ae86e40c9 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -120,9 +120,13 @@ func Load(rootDir, id string) (*Container, error) { // // This is inherently racey. if c.Status == Running || c.Status == Created { - // Send signal 0 to check if container still exists. - if err := c.Signal(0); err != nil { - // Container no longer exists. + // Check if the sandbox process is still running. + if c.Sandbox.IsRunning() { + // TODO: Send a message into the sandbox to + // see if this particular container is still running. + } else { + // Sandbox no longer exists, so this container + // definitly does not exist. c.Status = Stopped c.Sandbox = nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 67efd2f9e..e4467ccba 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -20,10 +20,10 @@ import ( "io" "io/ioutil" "os" - "os/signal" "path/filepath" "reflect" "strings" + "sync" "syscall" "testing" "time" @@ -75,9 +75,6 @@ func newSpecWithArgs(args ...string) *specs.Spec { return spec } -// shutdownSignal will be sent to the sandbox in order to shut down cleanly. -const shutdownSignal = syscall.SIGUSR2 - // setupContainer creates a bundle and root dir for the container, generates a // test config, and writes the spec to config.json in the bundle dir. func setupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { @@ -201,15 +198,35 @@ func TestLifecycle(t *testing.T) { t.Error(err) } - // Send the container a signal, which we catch and use to cleanly - // shutdown. 
- if err := s.Signal(shutdownSignal); err != nil { - t.Fatalf("error sending signal %v to container: %v", shutdownSignal, err) + // Wait on the container. + var wg sync.WaitGroup + wg.Add(1) + go func() { + ws, err := s.Wait() + if err != nil { + t.Errorf("error waiting on container: %v", err) + } + if got, want := ws.Signal(), syscall.SIGTERM; got != want { + t.Errorf("got signal %v, want %v", got, want) + } + wg.Done() + }() + + // Send the container a SIGTERM which will cause it to stop. + if err := s.Signal(syscall.SIGTERM); err != nil { + t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) } // Wait for it to die. - if _, err := s.Wait(); err != nil { - t.Fatalf("error waiting on container: %v", err) - } + wg.Wait() + + // The sandbox process should have exited by now, but it is a zombie. + // In normal runsc usage, it will be parented to init, and init will + // reap the sandbox. However, in this case the test runner is the + // parent and will not reap the sandbox process, so we must do it + // ourselves. + p, _ := os.FindProcess(s.Sandbox.Pid) + p.Wait() + // Load the container from disk and check the status. s, err = container.Load(rootDir, id) if err != nil { @@ -640,28 +657,17 @@ func TestMain(m *testing.M) { subcommands.Register(new(cmd.Gofer), "gofer") switch flag.Arg(0) { case "boot", "gofer": - // Run the command in a goroutine so we can block the main - // thread waiting for shutdownSignal. - go func() { - conf := &boot.Config{ - RootDir: "unused-root-dir", - Network: boot.NetworkNone, - } - var ws syscall.WaitStatus - subcmdCode := subcommands.Execute(context.Background(), conf, &ws) - if subcmdCode != subcommands.ExitSuccess { - panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode)) - } - // Container exited normally. Shut down this process. - os.Exit(ws.ExitStatus()) - }() - - // Shutdown cleanly when the shutdownSignal is received. This - // allows us to write coverage data before exiting. - sigc := make(chan os.Signal, 1) - signal.Notify(sigc, shutdownSignal) - <-sigc - exit(0) + conf := &boot.Config{ + RootDir: "unused-root-dir", + Network: boot.NetworkNone, + } + var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode != subcommands.ExitSuccess { + panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode)) + } + // Container exited. Shut down this process. + exit(ws.ExitStatus()) default: // Otherwise run the tests. exit(m.Run()) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 5dfa4cf0b..a9486cfdc 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -95,13 +95,12 @@ func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error { return fmt.Errorf("error setting up network: %v", err) } - // Send a message to the sandbox control server to start the - // application. + // Send a message to the sandbox control server to start the root + // container.. // - // TODO: Pass in the container id (cid) here. The sandbox - // should start only that container. - if err := conn.Call(boot.ApplicationStart, nil, nil); err != nil { - return fmt.Errorf("error starting application %v: %v", spec.Process.Args, err) + // TODO: We need a way to start non-root containers. 
+ if err := conn.Call(boot.RootContainerStart, nil, nil); err != nil { + return fmt.Errorf("error starting root container %v: %v", spec.Process.Args, err) } return nil @@ -110,6 +109,7 @@ func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error { // Processes retrieves the list of processes and associated metadata for a // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { + log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) conn, err := s.connect() if err != nil { return nil, err @@ -119,7 +119,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { var pl []*control.Process // TODO: Pass in the container id (cid) here. The sandbox // should return process info for only that container. - if err := conn.Call(boot.ApplicationProcesses, nil, &pl); err != nil { + if err := conn.Call(boot.ContainerProcesses, nil, &pl); err != nil { return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) } return pl, nil @@ -127,17 +127,18 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { // Execute runs the specified command in the container. func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { + log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) conn, err := s.connect() if err != nil { return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } defer conn.Close() - // Send a message to the sandbox control server to start the application. + // Send a message to the sandbox control server to start the container.. var waitStatus uint32 // TODO: Pass in the container id (cid) here. The sandbox // should execute in the context of that container. - if err := conn.Call(boot.ApplicationExecute, e, &waitStatus); err != nil { + if err := conn.Call(boot.ContainerExecute, e, &waitStatus); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } @@ -146,6 +147,7 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, // Event retrieves stats about the sandbox such as memory and CPU utilization. func (s *Sandbox) Event(cid string) (*boot.Event, error) { + log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) conn, err := s.connect() if err != nil { return nil, err @@ -155,7 +157,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { var e boot.Event // TODO: Pass in the container id (cid) here. The sandbox // should return events only for that container. - if err := conn.Call(boot.ApplicationEvent, nil, &e); err != nil { + if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil { return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err) } e.ID = cid @@ -163,7 +165,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { } func (s *Sandbox) connect() (*urpc.Client, error) { - log.Debugf("Connecting to sandbox...") + log.Debugf("Connecting to sandbox %q", s.ID) conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) if err != nil { return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) @@ -380,20 +382,18 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { // Wait waits for the containerized process to exit, and returns its WaitStatus. func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { - // TODO: This waits on the sandbox process. We need a way - // to wait on an individual container in the sandbox. 
- - p, err := os.FindProcess(s.Pid) + log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) + var ws syscall.WaitStatus + conn, err := s.connect() if err != nil { - // "On Unix systems, FindProcess always succeeds and returns a - // Process for the given pid." - panic(err) + return ws, err } - ps, err := p.Wait() - if err != nil { - return 0, err + defer conn.Close() + + if err := conn.Call(boot.ContainerWait, &cid, &ws); err != nil { + return ws, fmt.Errorf("err waiting on container %q: %v", cid, err) } - return ps.Sys().(syscall.WaitStatus), nil + return ws, nil } // Stop stops the container in the sandbox. @@ -409,12 +409,12 @@ func (s *Sandbox) Destroy() error { if s.Pid != 0 { // TODO: Too harsh? log.Debugf("Killing sandbox %q", s.ID) - sendSignal(s.Pid, unix.SIGKILL) + killProcess(s.Pid, unix.SIGKILL) s.Pid = 0 } if s.GoferPid != 0 { log.Debugf("Killing gofer for sandbox %q", s.ID) - sendSignal(s.GoferPid, unix.SIGKILL) + killProcess(s.GoferPid, unix.SIGKILL) s.GoferPid = 0 } @@ -424,17 +424,35 @@ func (s *Sandbox) Destroy() error { // Signal sends the signal to a container in the sandbox. func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { log.Debugf("Signal sandbox %q", s.ID) + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() - // TODO: This sends a signal to the sandbox process, which - // will be forwarded to the first process in the sandbox. We need a way - // to send a signal to any container in the sandbox. - // to wait on an individual container in the sandbox. + args := boot.SignalArgs{ + CID: cid, + Signo: int32(sig), + } + if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { + return fmt.Errorf("err signaling container %q: %v", cid, err) + } + return nil +} - return sendSignal(s.Pid, sig) +// IsRunning returns true iff the sandbox process is running. +func (s *Sandbox) IsRunning() bool { + // Send a signal 0 to the sandbox process. + if err := killProcess(s.Pid, 0); err != nil { + return false + } + return true } -// sendSignal sends a signal to the sandbox process. -func sendSignal(pid int, sig syscall.Signal) error { +// killProcess sends a signal to the host process (i.e. a sandbox or gofer +// process). Sandbox.Signal should be used to send a signal to a process +// running inside the sandbox. +func killProcess(pid int, sig syscall.Signal) error { if err := syscall.Kill(pid, sig); err != nil { return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) } -- cgit v1.2.3 From 8878a66a565733493e702199b284cd7855f80bf0 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 17 May 2018 15:05:15 -0700 Subject: Implement sysv shm. 
PiperOrigin-RevId: 197058289 Change-Id: I3946c25028b7e032be4894d61acb48ac0c24d574 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/shm.go | 75 +++++ pkg/refs/refcounter.go | 8 +- pkg/sentry/context/context.go | 20 ++ pkg/sentry/fs/dirent_refs_test.go | 62 ++-- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/ipc_namespace.go | 15 +- pkg/sentry/kernel/shm/BUILD | 40 +++ pkg/sentry/kernel/shm/device.go | 20 ++ pkg/sentry/kernel/shm/shm.go | 630 +++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/task.go | 3 + pkg/sentry/kernel/task_clone.go | 4 +- pkg/sentry/mm/BUILD | 2 + pkg/sentry/mm/shm.go | 66 ++++ pkg/sentry/syscalls/linux/BUILD | 2 + pkg/sentry/syscalls/linux/linux64.go | 8 +- pkg/sentry/syscalls/linux/sys_shm.go | 155 +++++++++ runsc/boot/loader.go | 2 +- 18 files changed, 1072 insertions(+), 42 deletions(-) create mode 100644 pkg/abi/linux/shm.go create mode 100644 pkg/sentry/kernel/shm/BUILD create mode 100644 pkg/sentry/kernel/shm/device.go create mode 100644 pkg/sentry/kernel/shm/shm.go create mode 100644 pkg/sentry/mm/shm.go create mode 100644 pkg/sentry/syscalls/linux/sys_shm.go (limited to 'runsc') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index a428e61a3..693ce0fdd 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -51,6 +51,7 @@ go_library( "sched.go", "seccomp.go", "sem.go", + "shm.go", "signal.go", "socket.go", "time.go", diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go new file mode 100644 index 000000000..9149ed094 --- /dev/null +++ b/pkg/abi/linux/shm.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// shmat(2) flags. Source: include/uapi/linux/shm.h +const ( + SHM_RDONLY = 010000 // Read-only access. + SHM_RND = 020000 // Round attach address to SHMLBA boundary. + SHM_REMAP = 040000 // Take-over region on attach. + SHM_EXEC = 0100000 // Execution access. +) + +// IPCPerm.Mode upper byte flags. Source: include/linux/shm.h +const ( + SHM_DEST = 01000 // Segment will be destroyed on last detach. + SHM_LOCKED = 02000 // Segment will not be swapped. + SHM_HUGETLB = 04000 // Segment will use huge TLB pages. + SHM_NORESERVE = 010000 // Don't check for reservations. +) + +// Additional Linux-only flags for shmctl(2). Source: include/uapi/linux/shm.h +const ( + SHM_LOCK = 11 + SHM_UNLOCK = 12 + SHM_STAT = 13 + SHM_INFO = 14 +) + +// ShmidDS is equivalent to struct shmid64_ds. Source: +// include/uapi/asm-generic/shmbuf.h +type ShmidDS struct { + ShmPerm IPCPerm + ShmSegsz uint64 + ShmAtime TimeT + ShmDtime TimeT + ShmCtime TimeT + ShmCpid int32 + ShmLpid int32 + ShmNattach uint64 + + Unused4 uint64 + Unused5 uint64 +} + +// ShmParams is equivalent to struct shminfo. Source: include/uapi/linux/shm.h +type ShmParams struct { + ShmMax uint64 + ShmMin uint64 + ShmMni uint64 + ShmSeg uint64 + ShmAll uint64 +} + +// ShmInfo is equivalent to struct shm_info. Source: include/uapi/linux/shm.h +type ShmInfo struct { + UsedIDs int32 // Number of currently existing segments. 
+ _ [4]byte + ShmTot uint64 // Total number of shared memory pages. + ShmRss uint64 // Number of resident shared memory pages. + ShmSwp uint64 // Number of swapped shared memory pages. + SwapAttempts uint64 // Unused since Linux 2.4. + SwapSuccesses uint64 // Unused since Linux 2.4. +} diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 1036553c7..3162001e1 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -194,9 +194,11 @@ type AtomicRefCount struct { weakRefs ilist.List `state:"nosave"` } -// TestReadRefs returns the current reference count of r. Use only for tests. -func (r *AtomicRefCount) TestReadRefs() int64 { - return atomic.LoadInt64(&r.refCount) +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *AtomicRefCount) ReadRefs() int64 { + // Account for the internal -1 offset on refcounts. + return atomic.LoadInt64(&r.refCount) + 1 } // IncRef increments this object's reference count. While the count is kept diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index e0dffafba..598c5b4ff 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -20,6 +20,26 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" ) +type contextID int + +// Globally accessible values from a context. These keys are defined in the +// context package to resolve dependency cycles by not requiring the caller to +// import packages usually required to get these information. +const ( + // CtxThreadGroupID is the current thread group ID when a context represents + // a task context. The value is represented as an int32. + CtxThreadGroupID contextID = iota +) + +// ThreadGroupIDFromContext returns the current thread group ID when ctx +// represents a task context. +func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { + if tgid := ctx.Value(CtxThreadGroupID); tgid != nil { + return tgid.(int32), true + } + return 0, false +} + // A Context represents a thread of execution (hereafter "goroutine" to reflect // Go idiosyncrasy). It carries state associated with the goroutine across API // boundaries. 
diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 8ce9ba02d..f9dcba316 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -33,8 +33,8 @@ func TestWalkPositive(t *testing.T) { ctx := contexttest.Context(t) root := NewDirent(newMockDirInode(ctx, nil), "root") - if got := root.TestReadRefs(); got != 0 { - t.Fatalf("root has a ref count of %d, want %d", got, 0) + if got := root.ReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) } name := "d" @@ -43,22 +43,22 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) } - if got := root.TestReadRefs(); got != 1 { - t.Fatalf("root has a ref count of %d, want %d", got, 1) + if got := root.ReadRefs(); got != 2 { + t.Fatalf("root has a ref count of %d, want %d", got, 2) } - if got := d.TestReadRefs(); got != 0 { - t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) + if got := d.ReadRefs(); got != 1 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1) } d.DecRef() - if got := root.TestReadRefs(); got != 0 { - t.Fatalf("root has a ref count of %d, want %d", got, 0) + if got := root.ReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) } - if got := d.TestReadRefs(); got != -1 { - t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, -1) + if got := d.ReadRefs(); got != 0 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) } root.flush() @@ -76,8 +76,8 @@ func TestWalkNegative(t *testing.T) { root := NewDirent(NewEmptyDir(ctx, nil), "root") mn := root.Inode.InodeOperations.(*mockInodeOperationsLookupNegative) - if got := root.TestReadRefs(); got != 0 { - t.Fatalf("root has a ref count of %d, want %d", got, 0) + if got := root.ReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) } name := "d" @@ -88,7 +88,7 @@ func TestWalkNegative(t *testing.T) { } } - if got := root.TestReadRefs(); got != 0 { + if got := root.ReadRefs(); got != 1 { t.Fatalf("root has a ref count of %d, want %d", got, 1) } @@ -110,14 +110,14 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("root found positive child at %q, want negative", name) } - if got := child.(*Dirent).TestReadRefs(); got != 1 { - t.Fatalf("child has a ref count of %d, want %d", got, 1) + if got := child.(*Dirent).ReadRefs(); got != 2 { + t.Fatalf("child has a ref count of %d, want %d", got, 2) } child.DecRef() - if got := child.(*Dirent).TestReadRefs(); got != 0 { - t.Fatalf("child has a ref count of %d, want %d", got, 0) + if got := child.(*Dirent).ReadRefs(); got != 1 { + t.Fatalf("child has a ref count of %d, want %d", got, 1) } if got := len(root.children); got != 1 { @@ -126,7 +126,7 @@ func TestWalkNegative(t *testing.T) { root.DecRef() - if got := root.TestReadRefs(); got != -1 { + if got := root.ReadRefs(); got != 0 { t.Fatalf("root has a ref count of %d, want %d", got, 0) } @@ -184,12 +184,12 @@ func TestHashNegativeToPositive(t *testing.T) { t.Fatalf("got negative Dirent, want positive") } - if got := d.TestReadRefs(); got != 0 { - t.Fatalf("child %q has a ref count of %d, want %d", name, got, 0) + if got := d.ReadRefs(); got != 1 { + t.Fatalf("child %q has a ref count of %d, want %d", name, got, 1) } - if got := root.TestReadRefs(); got != 1 { - t.Fatalf("root has a ref count of %d, want %d", got, 1) + if got := root.ReadRefs(); got != 2 { + t.Fatalf("root has a ref count of %d, want %d", got, 2) } if got 
:= len(root.children); got != 1 { @@ -291,12 +291,12 @@ func TestCreateExtraRefs(t *testing.T) { { desc: "Create caching", root: NewDirent(NewEmptyDir(ctx, NewDirentCache(1)), "root"), - refs: 1, + refs: 2, }, { desc: "Create not caching", root: NewDirent(NewEmptyDir(ctx, nil), "root"), - refs: 0, + refs: 1, }, } { t.Run(test.desc, func(t *testing.T) { @@ -307,7 +307,7 @@ func TestCreateExtraRefs(t *testing.T) { } d := f.Dirent - if got := d.TestReadRefs(); got != test.refs { + if got := d.ReadRefs(); got != test.refs { t.Errorf("dirent has a ref count of %d, want %d", got, test.refs) } }) @@ -347,8 +347,8 @@ func TestRemoveExtraRefs(t *testing.T) { t.Fatalf("root.Remove(root, %q) failed: %v", name, err) } - if got := d.TestReadRefs(); got != 0 { - t.Fatalf("dirent has a ref count of %d, want %d", got, 0) + if got := d.ReadRefs(); got != 1 { + t.Fatalf("dirent has a ref count of %d, want %d", got, 1) } d.DecRef() @@ -406,11 +406,11 @@ func TestRenameExtraRefs(t *testing.T) { newParent.flush() // Expect to have only active references. - if got := renamed.TestReadRefs(); got != 0 { - t.Errorf("renamed has ref count %d, want only active references %d", got, 0) + if got := renamed.ReadRefs(); got != 1 { + t.Errorf("renamed has ref count %d, want only active references %d", got, 1) } - if got := replaced.TestReadRefs(); got != 0 { - t.Errorf("replaced has ref count %d, want only active references %d", got, 0) + if got := replaced.ReadRefs(); got != 1 { + t.Errorf("replaced has ref count %d, want only active references %d", got, 1) } }) } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 62794cff5..377c94e4c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -184,6 +184,7 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/shm", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/loader", diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 78737f58f..3049fead4 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,18 +15,26 @@ package kernel import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" ) // IPCNamespace represents an IPC namespace. type IPCNamespace struct { + // User namespace which owns this IPC namespace. Immutable. + userNS *auth.UserNamespace + semaphores *semaphore.Registry + shms *shm.Registry } // NewIPCNamespace creates a new IPC namespace. -func NewIPCNamespace() *IPCNamespace { +func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { return &IPCNamespace{ + userNS: userNS, semaphores: semaphore.NewRegistry(), + shms: shm.NewRegistry(userNS), } } @@ -35,6 +43,11 @@ func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { return i.semaphores } +// ShmRegistry returns the shm segment registry for this namespace. +func (i *IPCNamespace) ShmRegistry() *shm.Registry { + return i.shms +} + // IPCNamespace returns the task's IPC namespace. 
func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD new file mode 100644 index 000000000..182cc1c76 --- /dev/null +++ b/pkg/sentry/kernel/shm/BUILD @@ -0,0 +1,40 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "shm_state", + srcs = [ + "shm.go", + ], + out = "shm_autogen_state.go", + package = "shm", +) + +go_library( + name = "shm", + srcs = [ + "device.go", + "shm.go", + "shm_autogen_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go new file mode 100644 index 000000000..b0dacdbe0 --- /dev/null +++ b/pkg/sentry/kernel/shm/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package shm + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// shmDevice is the kernel shm device. +var shmDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go new file mode 100644 index 000000000..7217e8103 --- /dev/null +++ b/pkg/sentry/kernel/shm/shm.go @@ -0,0 +1,630 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package shm implements sysv shared memory segments. +// +// Known missing features: +// +// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement +// memory locking in general. +// +// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy +// way to implement hugetlb support on a per-map basis, and it has no impact +// on correctness. +// +// - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap +// so it's meaningless to reserve space for swap. +// +// - No per-process segment size enforcement. This feature probably isn't used +// much anyways, since Linux sets the per-process limits to the system-wide +// limits by default. 
+// +// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock +package shm + +import ( + "fmt" + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Various limits for shared memory segments. +const ( + // shmsTotalMaxPages is the system-wide limit on all shared memory segments, measured + // in number of pages. + shmsTotalMaxPages = math.MaxInt64 // SHMALL + + // shmMaxSize is the maximum size of a single segment, in bytes. + shmMaxSize = math.MaxInt64 // SHMMAX + + // shmMinSize is the minimum specifiable size of a segment, effectively + // yielding a size rounded up to the next page size. Measured in bytes. + shmMinSize = 1 // SHMMIN + + // shmsTotalMax is the maximum number of segments on the system. + shmsTotalMax = 4096 // SHMMNI +) + +// Registry tracks all shared memory segments in an IPC namespace. The registry +// provides the mechanisms for creating and finding segments, and reporting +// global shm parameters. +type Registry struct { + // userNS owns the IPC namespace this registry belong to. Immutable. + userNS *auth.UserNamespace + + mu sync.Mutex `state:"nosave"` + + // shms maps segment ids to segments. Protected by mu. + shms map[int32]*Shm + + // Sum of the sizes of all existing segments rounded up to page size, in + // units of page size. Protected by mu. + totalPages uint64 + + // lastIDUsed is protected by mu. + lastIDUsed int32 +} + +// NewRegistry creates a new shm registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + shms: make(map[int32]*Shm), + } +} + +// FindByID looks up a segment given an ID. +func (r *Registry) FindByID(id int32) *Shm { + r.mu.Lock() + defer r.mu.Unlock() + return r.shms[id] +} + +// Precondition: Caller must hold r.mu. +func (r *Registry) findByKey(key int32) *Shm { + for _, v := range r.shms { + if v.key == key { + return v + } + } + return nil +} + +// FindOrCreate looks up or creates a segment in the registry. It's functionally +// analogous to open(2). +func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { + if create && (size < shmMinSize || size > shmMaxSize) { + // "A new segment was to be created and size is less than SHMMIN or + // greater than SHMMAX." - man shmget(2) + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if len(r.shms) >= shmsTotalMax { + // "All possible shared memory IDs have been taken (SHMMNI) ..." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + if !private { + // Look up an existing segment. + if shm := r.findByKey(key); shm != nil { + shm.mu.Lock() + defer shm.mu.Unlock() + + // Check that caller can access the segment. 
+ if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { + // "The user does not have permission to access the shared + // memory segment, and does not have the CAP_IPC_OWNER + // capability in the user namespace that governs its IPC + // namespace." - man shmget(2) + return nil, syserror.EACCES + } + + if size > shm.size { + // "A segment for the given key exists, but size is greater than + // the size of that segment." - man shmget(2) + return nil, syserror.EINVAL + } + + if create && exclusive { + // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a + // shared memory segment already exists for key." + // - man shmget(2) + return nil, syserror.EEXIST + } + + return shm, nil + } + + if !create { + // "No segment exists for the given key, and IPC_CREAT was not + // specified." - man shmget(2) + return nil, syserror.ENOENT + } + } + + var sizeAligned uint64 + if val, ok := usermem.Addr(size).RoundUp(); ok { + sizeAligned = uint64(val) + } else { + return nil, syserror.EINVAL + } + + if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > shmsTotalMaxPages { + // "... allocating a segment of the requested size would cause the + // system to exceed the system-wide limit on shared memory (SHMALL)." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + // Need to create a new segment. + creator := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newShm(ctx, pid, key, creator, perms, size) +} + +// newShm creates a new segment in the registry. +func (r *Registry) newShm(ctx context.Context, pid, key int32, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { + p := platform.FromContext(ctx) + if p == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + } + + effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) + fr, err := p.Memory().Allocate(effectiveSize, usage.Anonymous) + if err != nil { + return nil, err + } + + shm := &Shm{ + p: p, + registry: r, + creator: creator, + size: size, + effectiveSize: effectiveSize, + fr: fr, + key: key, + perms: perms, + owner: creator, + creatorPID: pid, + changeTime: ktime.NowFromContext(ctx), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.shms[id] == nil { + r.lastIDUsed = id + r.shms[id] = shm + shm.ID = id + + r.totalPages += effectiveSize / usermem.PageSize + + return shm, nil + } + } + + log.Warningf("Shm ids exhuasted, they may be leaking") + return nil, syserror.ENOSPC +} + +// IPCInfo reports global parameters for sysv shared memory segments on this +// system. See shmctl(IPC_INFO). +func (r *Registry) IPCInfo() *linux.ShmParams { + return &linux.ShmParams{ + ShmMax: shmMaxSize, + ShmMin: shmMinSize, + ShmMni: shmsTotalMax, + ShmSeg: shmsTotalMax, // Linux also sets this to SHMMNI. + ShmAll: shmsTotalMaxPages, + } +} + +// ShmInfo reports linux-specific global parameters for sysv shared memory +// segments on this system. See shmctl(SHM_INFO). +func (r *Registry) ShmInfo() *linux.ShmInfo { + r.mu.Lock() + defer r.mu.Unlock() + + return &linux.ShmInfo{ + UsedIDs: int32(r.lastIDUsed), + ShmTot: r.totalPages, + ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. + ShmSwp: 0, // No reclaim at the moment. + } +} + +// remove unregisters a segment from this registry, preventing it from being +// discovered in the future. 
Caller is responsible for ensuring s is destroyed. +// +// Precondition: To preserve lock ordering, caller must not hold s.mu. +func (r *Registry) remove(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + delete(r.shms, s.ID) + r.totalPages -= s.effectiveSize / usermem.PageSize +} + +// Shm represents a single shared memory segment. +// +// Shm segment are backed directly by an allocation from platform +// memory. Segments are always mapped as a whole, greatly simplifying how +// mappings are tracked. However note that mremap and munmap calls may cause the +// vma for a segment to become fragmented; which requires special care when +// unmapping a segment. See mm/shm.go. +// +// Segments persist until they are explicitly marked for destruction via +// shmctl(SHM_RMID). +// +// Shm implements memmap.Mappable and memmap.MappingIdentity. +type Shm struct { + // AtomicRefCount tracks the number of references to this segment from + // maps. A segment always holds a reference to itself, until it's marked for + // destruction. + refs.AtomicRefCount + + p platform.Platform + + // registry points to the shm registry containing this segment. Immutable. + registry *Registry + + // ID is the kernel identifier for this segment. Immutable. + ID int32 + + // creator is the user that created the segment. Immutable. + creator fs.FileOwner + + // size is the requested size of the segment at creation, in + // bytes. Immutable. + size uint64 + + // effectiveSize of the segment, rounding up to the next page + // boundary. Immutable. + // + // Invariant: effectiveSize must be a multiple of usermem.PageSize. + effectiveSize uint64 + + // fr is the offset into platform.Memory() that backs this contents of this + // segment. Immutable. + fr platform.FileRange + + // key is the public identifier for this segment. + key int32 + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // perms is the access permissions for the segment. + perms fs.FilePermissions + + // owner of this segment. + owner fs.FileOwner + // attachTime is updated on every successful shmat. + attachTime ktime.Time + // detachTime is updated on every successful shmdt. + detachTime ktime.Time + // changeTime is updated on every successful changes to the segment via + // shmctl(IPC_SET). + changeTime ktime.Time + + // creatorPID is the PID of the process that created the segment. + creatorPID int32 + // lastAttachDetachPID is the pid of the process that issued the last shmat + // or shmdt syscall. + lastAttachDetachPID int32 + + // pendingDestruction indicates the segment was marked as destroyed through + // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found + // in the registry and can no longer be attached. When the last user + // detaches from the segment, it is destroyed. Protected by mu. + pendingDestruction bool +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (s *Shm) MappedName(ctx context.Context) string { + return fmt.Sprintf("SYSV%08d", s.key) +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (s *Shm) DeviceID() uint64 { + return shmDevice.DeviceID() +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (s *Shm) InodeID() uint64 { + // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use + // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() + return uint64(s.ID) +} + +// DecRef overrides refs.RefCount.DecRef with a destructor. 
+func (s *Shm) DecRef() { + s.DecRefWithDestructor(s.destroy) +} + +// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm +// segments. +func (s *Shm) Msync(context.Context, memmap.MappableRange) error { + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (s *Shm) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + s.mu.Lock() + defer s.mu.Unlock() + s.attachTime = ktime.NowFromContext(ctx) + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + // AddMapping is called during a syscall, so ctx should always be a task + // context. + log.Warningf("Adding mapping to shm %+v but couldn't get the current pid; not updating the last attach pid", s) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (s *Shm) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + s.mu.Lock() + defer s.mu.Unlock() + // TODO: RemoveMapping may be called during task exit, when ctx + // is context.Background. Gracefully handle missing clocks. Failing to + // update the detach time in these cases is ok, since no one can observe the + // omission. + if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + s.detachTime = clock.Now() + } + + // If called from a non-task context we also won't have a threadgroup + // id. Silently skip updating the lastAttachDetachPid in that case. + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + log.Debugf("Couldn't obtain pid when removing mapping to shm %+v, not updating the last detach pid.", s) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (s *Shm) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > s.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: s.p.Memory(), + Offset: s.fr.Start + source.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (s *Shm) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// AttachOpts describes various flags passed to shmat(2). +type AttachOpts struct { + Execute bool + Readonly bool + Remap bool +} + +// ConfigureAttach creates an mmap configuration for the segment with the +// requested attach options. +// +// ConfigureAttach returns with a ref on s on success. The caller should drop +// this once the map is installed. This reference prevents s from being +// destroyed before the returned configuration is used. 
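The reference-counting contract described for ConfigureAttach is easiest to read from the caller's side. Below is a minimal sketch of the intended pairing, modeled on the shmat handler added later in this change; the exact types are inferred from the surrounding diff rather than authoritative.

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// attachSegment illustrates the pairing: ConfigureAttach returns MMapOpts that
// hold a reference on the segment, and the caller drops that reference once
// MMap has either installed the mapping (taking its own reference through the
// MappingIdentity) or failed.
func attachSegment(t *kernel.Task, seg *shm.Shm, addr usermem.Addr, opts shm.AttachOpts) (usermem.Addr, error) {
	mopts, err := seg.ConfigureAttach(t, addr, opts)
	if err != nil {
		return 0, err
	}
	defer seg.DecRef() // drop the reference taken by ConfigureAttach
	return t.MemoryManager().MMap(t, mopts)
}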
+func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.pendingDestruction && s.ReadRefs() == 0 { + return memmap.MMapOpts{}, syserror.EIDRM + } + + if !s.checkPermissions(ctx, fs.PermMask{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }) { + // "The calling process does not have the required permissions for the + // requested attach type, and does not have the CAP_IPC_OWNER capability + // in the user namespace that governs its IPC namespace." - man shmat(2) + return memmap.MMapOpts{}, syserror.EACCES + } + s.IncRef() + return memmap.MMapOpts{ + Length: s.size, + Offset: 0, + Addr: addr, + Fixed: opts.Remap, + Perms: usermem.AccessType{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }, + MaxPerms: usermem.AnyAccess, + Mappable: s, + MappingIdentity: s, + }, nil +} + +// EffectiveSize returns the size of the underlying shared memory segment. This +// may be larger than the requested size at creation, due to rounding to page +// boundaries. +func (s *Shm) EffectiveSize() uint64 { + return s.effectiveSize +} + +// IPCStat returns information about a shm. See shmctl(IPC_STAT). +func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The caller must have read permission on the shared memory segment." + // - man shmctl(2) + if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { + // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow + // read access for shmid, and the calling process does not have the + // CAP_IPC_OWNER capability in the user namespace that governs its IPC + // namespace." - man shmctl(2) + return nil, syserror.EACCES + } + + var mode uint16 + if s.pendingDestruction { + mode |= linux.SHM_DEST + } + creds := auth.CredentialsFromContext(ctx) + + nattach := uint64(s.ReadRefs()) + // Don't report the self-reference we keep prior to being marked for + // destruction. However, also don't report a count of -1 for segments marked + // as destroyed, with no mappings. + if !s.pendingDestruction { + nattach-- + } + + ds := &linux.ShmidDS{ + ShmPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: mode | uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequences not supported. + }, + ShmSegsz: s.size, + ShmAtime: s.attachTime.TimeT(), + ShmDtime: s.detachTime.TimeT(), + ShmCtime: s.changeTime.TimeT(), + ShmCpid: s.creatorPID, + ShmLpid: s.lastAttachDetachPID, + ShmNattach: nattach, + } + + return ds, nil +} + +// Set modifies attributes for a segment. See shmctl(IPC_SET). +func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { + s.mu.Lock() + defer s.mu.Unlock() + + if !s.checkOwnership(ctx) { + return syserror.EPERM + } + + creds := auth.CredentialsFromContext(ctx) + uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) + gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) + if !uid.Ok() || !gid.Ok() { + return syserror.EINVAL + } + + // User may only modify the lower 9 bits of the mode. All the other bits are + // always 0 for the underlying inode. 
+ mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) + s.perms = fs.FilePermsFromMode(mode) + + s.owner.UID = uid + s.owner.GID = gid + + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +func (s *Shm) destroy() { + s.registry.remove(s) + s.p.Memory().DecRef(s.fr) +} + +// MarkDestroyed marks a shm for destruction. The shm is actually destroyed once +// it has no references. See shmctl(IPC_RMID). +func (s *Shm) MarkDestroyed() { + s.mu.Lock() + defer s.mu.Unlock() + // Prevent the segment from being found in the registry. + s.key = linux.IPC_PRIVATE + s.pendingDestruction = true + s.DecRef() +} + +// checkOwnership verifies whether a segment may be accessed by ctx as an +// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkOwnership(ctx context.Context) bool { + creds := auth.CredentialsFromContext(ctx) + if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { + return true + } + + // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux + // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented + // for use to "override IPC ownership checks". + return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) +} + +// checkPermissions verifies whether a segment is accessible by ctx for access +// described by req. See ipc/util.c:ipcperms() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { + creds := auth.CredentialsFromContext(ctx) + + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + if p.SupersetOf(req) { + return true + } + + // Tasks with CAP_IPC_OWNER may bypass permission checks. 
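To make the permission selection concrete, here is a worked, standalone version of the same decision using plain types in place of fs.PermMask and fs.FilePermissions; the CAP_IPC_OWNER and CAP_SYS_ADMIN escape hatches are deliberately left out. It illustrates the rule, it is not the code used by the sentry.

package main

import "fmt"

// permMask is a simplified stand-in for fs.PermMask (read/write only).
type permMask struct{ read, write bool }

func (p permMask) supersetOf(req permMask) bool {
	return (!req.read || p.read) && (!req.write || p.write)
}

// pick mirrors the selection step: exactly one of the owner/group/other
// permission classes applies, based on the caller's credentials.
func pick(mode uint16, isOwner, inGroup bool) permMask {
	switch {
	case isOwner:
		return permMask{mode&0400 != 0, mode&0200 != 0}
	case inGroup:
		return permMask{mode&0040 != 0, mode&0020 != 0}
	default:
		return permMask{mode&0004 != 0, mode&0002 != 0}
	}
}

func main() {
	// Segment mode 0640: owner rw-, group r--, other ---.
	fmt.Println(pick(0640, true, false).supersetOf(permMask{read: true, write: true}))  // owner write: true
	fmt.Println(pick(0640, false, true).supersetOf(permMask{read: true, write: true}))  // group write: false
	fmt.Println(pick(0640, false, false).supersetOf(permMask{read: true}))              // other read: false
}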
+ return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 490f795c2..7763050a5 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -559,6 +560,8 @@ func (t *Task) Value(key interface{}) interface{} { return t case auth.CtxCredentials: return t.creds + case context.CtxThreadGroupID: + return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.FSContext().RootDirectory() case inet.CtxStack: diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 3a74abdfb..0c2427952 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -197,7 +197,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { if opts.NewIPCNamespace { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" - ipcns = NewIPCNamespace() + ipcns = NewIPCNamespace(userns) } tc, err := t.tc.Fork(t, !opts.NewAddressSpace) @@ -449,7 +449,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" - t.ipcns = NewIPCNamespace() + t.ipcns = NewIPCNamespace(t.creds.UserNamespace) } if opts.NewFiles { oldFDMap := t.tr.FDMap diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 39bde2be3..258389bb2 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -107,6 +107,7 @@ go_library( "pma_set.go", "proc_pid_maps.go", "save_restore.go", + "shm.go", "special_mappable.go", "syscalls.go", "vma.go", @@ -123,6 +124,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/platform", diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go new file mode 100644 index 000000000..bab137a5a --- /dev/null +++ b/pkg/sentry/mm/shm.go @@ -0,0 +1,66 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// DetachShm unmaps a sysv shared memory segment. +func (mm *MemoryManager) DetachShm(ctx context.Context, addr usermem.Addr) error { + if addr != addr.RoundDown() { + // "... shmaddr is not aligned on a page boundary." 
- man shmdt(2) + return syserror.EINVAL + } + + var detached *shm.Shm + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // Find and remove the first vma containing an address >= addr that maps a + // segment originally attached at addr. + vseg := mm.vmas.LowerBoundSegment(addr) + for vseg.Ok() { + vma := vseg.ValuePtr() + if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off { + detached = shm + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + break + } else { + vseg = vseg.NextSegment() + } + } + + if detached == nil { + // There is no shared memory segment attached at addr. + return syserror.EINVAL + } + + // Remove all vmas that could have been created by the same attach. + end := addr + usermem.Addr(detached.EffectiveSize()) + for vseg.Ok() && vseg.End() <= end { + vma := vseg.ValuePtr() + if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off { + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + } else { + vseg = vseg.NextSegment() + } + } + + return nil +} diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index bc67ebf30..f9e0a4be3 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -44,6 +44,7 @@ go_library( "sys_rusage.go", "sys_sched.go", "sys_sem.go", + "sys_shm.go", "sys_signal.go", "sys_socket.go", "sys_stat.go", @@ -84,6 +85,7 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/shm", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/memmap", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 44db2d582..237c61007 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -75,9 +75,9 @@ var AMD64 = &kernel.SyscallTable{ 26: Msync, 27: Mincore, 28: Madvise, - // 29: Shmget, TODO - // 30: Shmat, TODO - // 31: Shmctl, TODO + 29: Shmget, + 30: Shmat, + 31: Shmctl, 32: Dup, 33: Dup2, 34: Pause, @@ -113,7 +113,7 @@ var AMD64 = &kernel.SyscallTable{ 64: Semget, 65: Semop, 66: Semctl, - // 67: Shmdt, TODO + 67: Shmdt, // 68: Msgget, TODO // 69: Msgsnd, TODO // 70: Msgrcv, TODO diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go new file mode 100644 index 000000000..48ff1d5f0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -0,0 +1,155 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Shmget implements shmget(2). 
+func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := args[0].Int() + size := uint64(args[1].SizeT()) + flag := args[2].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + pid := int32(t.ThreadGroup().ID()) + r := t.IPCNamespace().ShmRegistry() + segment, err := r.FindOrCreate(t, pid, key, size, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + return uintptr(segment.ID), nil, nil +} + +// findSegment retrives a shm segment by the given id. +func findSegment(t *kernel.Task, id int32) (*shm.Shm, error) { + r := t.IPCNamespace().ShmRegistry() + segment := r.FindByID(id) + if segment == nil { + // No segment with provided id. + return nil, syserror.EINVAL + } + return segment, nil +} + +// Shmat implements shmat(2). +func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + addr := args[1].Pointer() + flag := args[2].Int() + + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + + opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ + Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, + Readonly: flag&linux.SHM_RDONLY == linux.SHM_RDONLY, + Remap: flag&linux.SHM_REMAP == linux.SHM_REMAP, + }) + if err != nil { + return 0, nil, err + } + defer segment.DecRef() + addr, err = t.MemoryManager().MMap(t, opts) + return uintptr(addr), nil, err +} + +// Shmdt implements shmdt(2). +func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + err := t.MemoryManager().DetachShm(t, addr) + return 0, nil, err +} + +// Shmctl implements shmctl(2). +func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + cmd := args[1].Int() + buf := args[2].Pointer() + + r := t.IPCNamespace().ShmRegistry() + + switch cmd { + case linux.SHM_STAT: + // Technically, we should be treating id as "an index into the kernel's + // internal array that maintains information about all shared memory + // segments on the system". Since we don't track segments in an array, + // we'll just pretend the shmid is the index and do the same thing as + // IPC_STAT. Linux also uses the index as the shmid. + fallthrough + case linux.IPC_STAT: + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + + stat, err := segment.IPCStat(t) + if err == nil { + _, err = t.CopyOut(buf, stat) + } + return 0, nil, err + + case linux.IPC_INFO: + params := r.IPCInfo() + _, err := t.CopyOut(buf, params) + return 0, nil, err + + case linux.SHM_INFO: + info := r.ShmInfo() + _, err := t.CopyOut(buf, info) + return 0, nil, err + } + + // Remaining commands refer to a specific segment. + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + + switch cmd { + case linux.IPC_SET: + var ds linux.ShmidDS + _, err = t.CopyIn(buf, &ds) + if err != nil { + return 0, nil, err + } + err = segment.Set(t, &ds) + return 0, nil, err + + case linux.IPC_RMID: + segment.MarkDestroyed() + return 0, nil, nil + + case linux.SHM_LOCK, linux.SHM_UNLOCK: + // We currently do not support memmory locking anywhere. + // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the + // same here. 
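With syscalls 29, 30, 31 and 67 now routed to these handlers, they can be exercised end to end from an ordinary Go program via raw syscall numbers. A small smoke-test sketch follows; the IPC constants are the conventional Linux values and should be checked against the system headers rather than taken from here.

package main

import (
	"fmt"
	"syscall"
)

const (
	ipcPrivate = 0     // IPC_PRIVATE
	ipcCreat   = 01000 // IPC_CREAT
	ipcRmid    = 0     // IPC_RMID
)

func main() {
	// shmget(IPC_PRIVATE, 4096, IPC_CREAT|0600) -> segment id.
	id, _, errno := syscall.Syscall(syscall.SYS_SHMGET, ipcPrivate, 4096, ipcCreat|0600)
	if errno != 0 {
		panic(errno)
	}
	// shmat(id, NULL, 0) lets the kernel choose the attach address.
	addr, _, errno := syscall.Syscall(syscall.SYS_SHMAT, id, 0, 0)
	if errno != 0 {
		panic(errno)
	}
	fmt.Printf("segment %d attached at %#x\n", id, addr)
	// Detach, then mark the segment for destruction.
	syscall.Syscall(syscall.SYS_SHMDT, addr, 0, 0)
	syscall.Syscall(syscall.SYS_SHMCTL, id, ipcRmid, 0)
}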
+ return 0, nil, nil + + default: + return 0, nil, syserror.EINVAL + } +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0ff54d349..566f2eb46 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -146,7 +146,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // not configurable from runtime spec. utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - ipcns := kernel.NewIPCNamespace() + ipcns := kernel.NewIPCNamespace(creds.UserNamespace) if err := enableStrace(conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) -- cgit v1.2.3 From ed2b86a54942dfd245e9f872e6da52d9bde0da6d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 21 May 2018 17:47:13 -0700 Subject: Fix test failure when user can't mount temp dir PiperOrigin-RevId: 197491098 Change-Id: Ifb75bd4e4f41b84256b6d7afc4b157f6ce3839f3 --- runsc/container/container_test.go | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index e4467ccba..e1674d631 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -434,6 +434,17 @@ func TestCapabilities(t *testing.T) { Type: "bind", }) + // Capability below is needed to mount TempDir above in case the user doesn't + // have access to all parents that lead to TempDir. + caps := []string{"CAP_DAC_OVERRIDE"} + spec.Process.Capabilities = &specs.LinuxCapabilities{ + Bounding: caps, + Effective: caps, + Inheritable: caps, + Permitted: caps, + Ambient: caps, + } + rootDir, bundleDir, conf, err := setupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) -- cgit v1.2.3 From e48f7078761b00552ac74068c184ee4fb90fe9aa Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 24 May 2018 14:27:05 -0700 Subject: Configure sandbox as superuser Container user might not have enough priviledge to walk directories and mount filesystems. Instead, create superuser to perform these steps of the configuration. PiperOrigin-RevId: 197953667 Change-Id: I643650ab654e665408e2af1b8e2f2aa12d58d4fb --- runsc/boot/fs.go | 43 +++++++++------- runsc/boot/loader.go | 48 +++++++++-------- runsc/boot/loader_test.go | 2 +- runsc/container/container_test.go | 106 ++++++++++++++++++++++++++------------ 4 files changed, 125 insertions(+), 74 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 86cbe1169..e5b7663d0 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -51,21 +51,30 @@ func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } -// createMountNamespace creates a mount manager containing the root filesystem -// and all mounts. -func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { +// createMountNamespace creates a mount namespace containing the root filesystem +// and all mounts. 'rootCtx' is used to walk directories to find mount points. +func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { fds := &fdDispenser{fds: ioFDs} - - // Create the MountNamespace from the root. 
- rootInode, err := createRootMount(ctx, spec, conf, fds) + rootInode, err := createRootMount(rootCtx, spec, conf, fds) if err != nil { - return nil, fmt.Errorf("failed to create root overlay: %v", err) + return nil, fmt.Errorf("failed to create root mount: %v", err) } - mns, err := fs.NewMountNamespace(ctx, rootInode) + mns, err := fs.NewMountNamespace(userCtx, rootInode) if err != nil { - return nil, fmt.Errorf("failed to construct MountNamespace: %v", err) + return nil, fmt.Errorf("failed to create root mount namespace: %v", err) + } + if err := configureMounts(rootCtx, spec, conf, mns, fds); err != nil { + return nil, fmt.Errorf("failed to configure mounts: %v", err) + } + if !fds.empty() { + return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) } + return mns, nil +} +// configureMounts iterates over Spec.Mounts and mounts them in the specified +// mount namespace. +func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser) error { // Keep track of whether proc, sys, and tmp were mounted. var procMounted, sysMounted, tmpMounted bool @@ -88,7 +97,7 @@ func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, i } if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil { - return nil, err + return err } } @@ -97,7 +106,7 @@ func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, i Type: "devtmpfs", Destination: "/dev", }); err != nil { - return nil, err + return err } // Mount proc and sys even if the user did not ask for it, as the spec @@ -107,7 +116,7 @@ func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, i Type: "proc", Destination: "/proc", }); err != nil { - return nil, err + return err } } if !sysMounted { @@ -115,7 +124,7 @@ func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, i Type: "sysfs", Destination: "/sys", }); err != nil { - return nil, err + return err } } @@ -127,15 +136,11 @@ func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, i Type: "tmpfs", Destination: "/tmp", }); err != nil { - return nil, err + return err } } - if !fds.empty() { - return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) - } - - return mns, nil + return nil } // createRootMount creates the root filesystem. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 566f2eb46..76edbb905 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -137,9 +137,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console extraKGIDs, caps, auth.NewRootUserNamespace()) - if err != nil { - return nil, fmt.Errorf("error creating credentials: %v", err) - } // Create user namespace. // TODO: Not clear what domain name should be here. It is @@ -159,22 +156,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console return nil, fmt.Errorf("error getting executable path: %v", err) } - // Create the process arguments. - procArgs := kernel.CreateProcessArgs{ - Filename: exec, - Argv: spec.Process.Args, - Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, - Credentials: creds, - // Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so - // it must wait until we have a Kernel. 
- Umask: uint(syscall.Umask(0)), - Limits: ls, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - UTSNamespace: utsns, - IPCNamespace: ipcns, - } - // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside @@ -219,14 +200,39 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console return nil, fmt.Errorf("error creating control server: %v", err) } + // Create the process arguments. + procArgs := kernel.CreateProcessArgs{ + Filename: exec, + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: spec.Process.Cwd, + Credentials: creds, + // Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so + // it must wait until we have a Kernel. + Umask: uint(syscall.Umask(0)), + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: utsns, + IPCNamespace: ipcns, + } ctx := procArgs.NewContext(k) + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: uint(syscall.Umask(0022)), + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + // Create the virtual filesystem. - mm, err := createMountNamespace(ctx, spec, conf, ioFDs) + mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) if err != nil { return nil, fmt.Errorf("error creating mounts: %v", err) } - k.SetRootMountNamespace(mm) + k.SetRootMountNamespace(mns) // Create the FD map, which will set stdin, stdout, and stderr. If console // is true, then ioctl calls will be passed through to the host fd. diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index d2e5fe74e..5bc6f1646 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -239,7 +239,7 @@ func TestCreateMountNamespace(t *testing.T) { for _, tc := range testCases { ctx := contexttest.Context(t) - mm, err := createMountNamespace(ctx, &tc.spec, conf, nil) + mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) if err != nil { t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index e1674d631..24e9de3ce 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -20,6 +20,7 @@ import ( "io" "io/ioutil" "os" + "path" "path/filepath" "reflect" "strings" @@ -132,6 +133,34 @@ func waitForProcessList(s *container.Container, expected []*control.Process) err return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(expected)) } +// procListsEqual is used to check whether 2 Process lists are equal for all +// implemented fields. +func procListsEqual(got, want []*control.Process) bool { + if len(got) != len(want) { + return false + } + for i := range got { + pd1 := got[i] + pd2 := want[i] + // Zero out unimplemented and timing dependant fields. 
+ pd1.Time, pd2.Time = "", "" + pd1.STime, pd2.STime = "", "" + pd1.C, pd2.C = 0, 0 + if *pd1 != *pd2 { + return false + } + } + return true +} + +func procListToString(pl []*control.Process) string { + strs := make([]string, 0, len(pl)) + for _, p := range pl { + strs = append(strs, fmt.Sprintf("%+v", p)) + } + return fmt.Sprintf("[%s]", strings.Join(strs, ",")) +} + // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. @@ -434,17 +463,6 @@ func TestCapabilities(t *testing.T) { Type: "bind", }) - // Capability below is needed to mount TempDir above in case the user doesn't - // have access to all parents that lead to TempDir. - caps := []string{"CAP_DAC_OVERRIDE"} - spec.Process.Capabilities = &specs.LinuxCapabilities{ - Bounding: caps, - Effective: caps, - Inheritable: caps, - Permitted: caps, - Ambient: caps, - } - rootDir, bundleDir, conf, err := setupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -621,32 +639,54 @@ func TestSpecUnsupported(t *testing.T) { } } -// procListsEqual is used to check whether 2 Process lists are equal for all -// implemented fields. -func procListsEqual(got, want []*control.Process) bool { - if len(got) != len(want) { - return false +// TestRunNonRoot checks that sandbox can be configured when running as +// non-priviledged user. +func TestRunNonRoot(t *testing.T) { + spec := newSpecWithArgs("/bin/true") + spec.Process.User.UID = 343 + spec.Process.User.GID = 2401 + + // User that container runs as can't list '$TMP/blocked' and would fail to + // mount it. + dir := path.Join(os.TempDir(), "blocked") + if err := os.Mkdir(dir, 0700); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) } - for i := range got { - pd1 := got[i] - pd2 := want[i] - // Zero out unimplemented and timing dependant fields. - pd1.Time, pd2.Time = "", "" - pd1.STime, pd2.STime = "", "" - pd1.C, pd2.C = 0, 0 - if *pd1 != *pd2 { - return false - } + dir = path.Join(dir, "test") + if err := os.Mkdir(dir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) } - return true -} -func procListToString(pl []*control.Process) string { - strs := make([]string, 0, len(pl)) - for _, p := range pl { - strs = append(strs, fmt.Sprintf("%+v", p)) + // We generate files in the host temporary directory. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + }) + + rootDir, bundleDir, conf, err := setupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create, start and wait for the container. 
+ s, err := container.Create(uniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + t.Errorf("error waiting on container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Errorf("container failed, waitStatus: %v", ws) } - return fmt.Sprintf("[%s]", strings.Join(strs, ",")) } // TestMain acts like runsc if it is called with the "boot" argument, otherwise -- cgit v1.2.3 From 812e83d3bbb99d4fa1ece4712a1ac85e84fe6ec3 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 29 May 2018 17:57:26 -0700 Subject: Supress error when deleting non-existing container with --force This addresses the first issue reported in #59. CRI-O expects runsc to return success to delete when --force is used with a non-existing container. PiperOrigin-RevId: 198487418 Change-Id: If7660e8fdab1eb29549d0a7a45ea82e20a1d4f4a --- runsc/cmd/BUILD | 6 +++++- runsc/cmd/delete.go | 25 +++++++++++++++++++------ runsc/cmd/delete_test.go | 41 +++++++++++++++++++++++++++++++++++++++++ runsc/container/container.go | 35 ++++++++++------------------------- 4 files changed, 75 insertions(+), 32 deletions(-) create mode 100644 runsc/cmd/delete_test.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 08aaee996..4b4afa4a0 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -44,13 +44,17 @@ go_library( go_test( name = "cmd_test", size = "small", - srcs = ["exec_test.go"], + srcs = [ + "delete_test.go", + "exec_test.go", + ], embed = [":cmd"], deps = [ "//pkg/abi/linux", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", "//pkg/urpc", + "//runsc/boot", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 769a11c45..46de5f348 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -15,9 +15,13 @@ package cmd import ( + "fmt" + "os" + "context" "flag" "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/container" ) @@ -56,19 +60,28 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} } conf := args[0].(*boot.Config) + if err := d.execute(f.Args(), conf); err != nil { + Fatalf("%v", err) + } + return subcommands.ExitSuccess +} - for i := 0; i < f.NArg(); i++ { - id := f.Arg(i) +func (d *Delete) execute(ids []string, conf *boot.Config) error { + for _, id := range ids { c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container %q: %v", id, err) + if os.IsNotExist(err) && d.force { + log.Warningf("couldn't find container %q: %v", id, err) + return nil + } + return fmt.Errorf("error loading container %q: %v", id, err) } if !d.force && (c.Status == container.Running) { - Fatalf("cannot stop running container without --force flag") + return fmt.Errorf("cannot stop running container without --force flag") } if err := c.Destroy(); err != nil { - Fatalf("error destroying container: %v", err) + return fmt.Errorf("error destroying container: %v", err) } } - return subcommands.ExitSuccess + return nil } diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go new file mode 100644 index 000000000..928e9ee2c --- /dev/null 
+++ b/runsc/cmd/delete_test.go @@ -0,0 +1,41 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "io/ioutil" + "testing" + + "gvisor.googlesource.com/gvisor/runsc/boot" +) + +func TestNotFound(t *testing.T) { + ids := []string{"123"} + dir, err := ioutil.TempDir("", "metadata") + if err != nil { + t.Fatalf("error creating dir: %v", err) + } + conf := &boot.Config{RootDir: dir} + + d := Delete{} + if err := d.execute(ids, conf); err == nil { + t.Error("Deleting non-existend container should have failed") + } + + d = Delete{force: true} + if err := d.execute(ids, conf); err != nil { + t.Errorf("Deleting non-existend container with --force should NOT have failed: %v", err) + } +} diff --git a/runsc/container/container.go b/runsc/container/container.go index ae86e40c9..f20ec2453 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -93,21 +93,19 @@ type Container struct { } // Load loads a container with the given id from a metadata file. +// Returns ErrNotExist if container doesn't exits. func Load(rootDir, id string) (*Container, error) { log.Debugf("Load container %q %q", rootDir, id) if err := validateID(id); err != nil { return nil, err } - cRoot := filepath.Join(rootDir, id) - if !exists(cRoot) { - return nil, fmt.Errorf("container with id %q does not exist", id) - } - metaFile := filepath.Join(cRoot, metadataFilename) - if !exists(metaFile) { - return nil, fmt.Errorf("container with id %q does not have metadata file %q", id, metaFile) - } + metaFile := filepath.Join(rootDir, id, metadataFilename) metaBytes, err := ioutil.ReadFile(metaFile) if err != nil { + if os.IsNotExist(err) { + // Preserve error so that callers can distinguish 'not found' errors. + return nil, err + } return nil, fmt.Errorf("error reading container metadata file %q: %v", metaFile, err) } var c Container @@ -161,8 +159,10 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } containerRoot := filepath.Join(conf.RootDir, id) - if exists(containerRoot) { - return nil, fmt.Errorf("container with id %q already exists: %q ", id, containerRoot) + if _, err := os.Stat(containerRoot); err == nil { + return nil, fmt.Errorf("container with id %q already exists: %q", id, containerRoot) + } else if !os.IsNotExist(err) { + return nil, fmt.Errorf("error looking for existing container in %q: %v", containerRoot, err) } c := &Container{ @@ -328,11 +328,6 @@ func (c *Container) Destroy() error { return err } - // Then destroy all the metadata. - if err := os.RemoveAll(c.Root); err != nil { - log.Warningf("Failed to delete container root directory %q, err: %v", c.Root, err) - } - // "If any poststop hook fails, the runtime MUST log a warning, but the // remaining hooks and lifecycle continue as if the hook had succeeded". 
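The change to Load above is what makes the --force behavior possible: a missing container now surfaces as an os.IsNotExist error instead of a generic one. A short sketch of the caller-side idiom this enables; the helper name is made up for illustration.

package example

import (
	"os"

	"gvisor.googlesource.com/gvisor/runsc/container"
)

// deleteIfPresent spells out the error contract Load now provides: "not found"
// is reported via os.IsNotExist, so a forced delete can treat it as success
// while other failures still propagate.
func deleteIfPresent(rootDir, id string, force bool) error {
	c, err := container.Load(rootDir, id)
	if err != nil {
		if force && os.IsNotExist(err) {
			return nil // nothing to delete
		}
		return err
	}
	return c.Destroy()
}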
if c.Spec.Hooks != nil && (c.Status == Created || c.Status == Running) { @@ -372,13 +367,3 @@ func (c *Container) save() error { } return nil } - -// exists returns true if the given file exists. -func exists(f string) bool { - if _, err := os.Stat(f); err == nil { - return true - } else if !os.IsNotExist(err) { - log.Warningf("error checking for file %q: %v", f, err) - } - return false -} -- cgit v1.2.3 From 65dadc00297d946e86b2e95b0279fb6dc94542dd Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 1 Jun 2018 10:08:40 -0700 Subject: Ignores IPv6 addresses when configuring network Closes #60 PiperOrigin-RevId: 198887885 Change-Id: I9bf990ee3fde9259836e57d67257bef5b85c6008 --- runsc/sandbox/network.go | 52 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index d0ff64067..62dcdd9e9 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -188,14 +188,14 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { continue } - ifaddrs, err := iface.Addrs() + allAddrs, err := iface.Addrs() if err != nil { return fmt.Errorf("error fetching interface addresses for %q: %v", iface.Name, err) } // We build our own loopback devices. if iface.Flags&net.FlagLoopback != 0 { - links, err := loopbackLinks(iface, ifaddrs) + links, err := loopbackLinks(iface, allAddrs) if err != nil { return fmt.Errorf("error getting loopback routes and links for iface %q: %v", iface.Name, err) } @@ -203,6 +203,24 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { continue } + // Keep only IPv4 addresses. + var ip4addrs []*net.IPNet + for _, ifaddr := range allAddrs { + ipNet, ok := ifaddr.(*net.IPNet) + if !ok { + return fmt.Errorf("address is not IPNet: %+v", ifaddr) + } + if ipNet.IP.To4() == nil { + log.Warningf("IPv6 is not supported, skipping: %v", ipNet) + continue + } + ip4addrs = append(ip4addrs, ipNet) + } + if len(ip4addrs) == 0 { + log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name) + continue + } + // Get the link for the interface. ifaceLink, err := netlink.LinkByName(iface.Name) if err != nil { @@ -250,16 +268,12 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { // Collect the addresses for the interface, enable forwarding, // and remove them from the host. - for _, ifaddr := range ifaddrs { - ipNet, ok := ifaddr.(*net.IPNet) - if !ok { - return fmt.Errorf("address is not IPNet: %t %+v", ifaddr, ifaddr) - } - link.Addresses = append(link.Addresses, ipNet.IP) + for _, addr := range ip4addrs { + link.Addresses = append(link.Addresses, addr.IP) // Steal IP address from NIC. 
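The selection rule added in this commit amounts to: keep an interface only if it has at least one IPv4 address, and silently skip anything IPv6. The standalone sketch below applies the same filter to the host's interfaces using only the standard library, which can be handy for checking what the sandbox would pick up; it is not part of the change itself.

package main

import (
	"fmt"
	"net"
)

func main() {
	ifaces, err := net.Interfaces()
	if err != nil {
		panic(err)
	}
	for _, iface := range ifaces {
		addrs, err := iface.Addrs()
		if err != nil {
			panic(err)
		}
		var v4 []*net.IPNet
		for _, a := range addrs {
			ipNet, ok := a.(*net.IPNet)
			if !ok {
				continue
			}
			if ipNet.IP.To4() == nil {
				continue // IPv6 (or other): skipped, as in the sandbox code
			}
			v4 = append(v4, ipNet)
		}
		if len(v4) == 0 {
			fmt.Printf("%s: no IPv4 address, would be skipped\n", iface.Name)
			continue
		}
		fmt.Printf("%s: %v\n", iface.Name, v4)
	}
}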
- if err := removeAddress(ifaceLink, ipNet.String()); err != nil { - return fmt.Errorf("error removing address %v from device %q: %v", iface.Name, ipNet, err) + if err := removeAddress(ifaceLink, addr.String()); err != nil { + return fmt.Errorf("error removing address %v from device %q: %v", iface.Name, addr, err) } } @@ -280,7 +294,7 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, for _, addr := range addrs { ipNet, ok := addr.(*net.IPNet) if !ok { - return nil, fmt.Errorf("address is not IPNet: %t %+v", addr, addr) + return nil, fmt.Errorf("address is not IPNet: %+v", addr) } links = append(links, boot.LoopbackLink{ Name: iface.Name, @@ -314,21 +328,25 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { if r.Gw == nil { return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) } + if r.Gw.To4() == nil { + log.Warningf("IPv6 is not supported, skipping default route: %v", r) + continue + } if def != nil { return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r) } - emptyAddr := net.IPv6zero - if r.Gw.To4() != nil { - emptyAddr = net.IPv4zero - } // Create a catch all route to the gateway. def = &boot.Route{ - Destination: emptyAddr, - Mask: net.IPMask(emptyAddr), + Destination: net.IPv4zero, + Mask: net.IPMask(net.IPv4zero), Gateway: r.Gw, } continue } + if r.Dst.IP.To4() == nil { + log.Warningf("IPv6 is not supported, skipping route: %v", r) + continue + } routes = append(routes, boot.Route{ Destination: r.Dst.IP.Mask(r.Dst.Mask), Mask: r.Dst.Mask, -- cgit v1.2.3 From d1ca50d49e52338feb1d46b69725b9ac21cc3ccc Mon Sep 17 00:00:00 2001 From: Zhengyu He Date: Fri, 1 Jun 2018 13:39:53 -0700 Subject: Add SyscallRules that supports argument filtering PiperOrigin-RevId: 198919043 Change-Id: I7f1f0a3b3430cd0936a4ee4fc6859aab71820bdf --- pkg/bpf/program_builder.go | 64 ++++++-- pkg/seccomp/BUILD | 4 + pkg/seccomp/seccomp.go | 217 +++++++++++++++++----------- pkg/seccomp/seccomp_rules.go | 119 +++++++++++++++ pkg/seccomp/seccomp_test.go | 125 +++++++++++++--- pkg/seccomp/seccomp_test_victim.go | 141 +++++++++--------- runsc/boot/filter/config.go | 249 ++++++++++++++++---------------- runsc/boot/filter/extra_filters.go | 6 +- runsc/boot/filter/extra_filters_msan.go | 10 +- runsc/boot/filter/extra_filters_race.go | 19 ++- runsc/boot/filter/filter.go | 12 +- 11 files changed, 640 insertions(+), 326 deletions(-) create mode 100644 pkg/seccomp/seccomp_rules.go (limited to 'runsc') diff --git a/pkg/bpf/program_builder.go b/pkg/bpf/program_builder.go index 7554d47c1..bad56d7ac 100644 --- a/pkg/bpf/program_builder.go +++ b/pkg/bpf/program_builder.go @@ -21,7 +21,10 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) -const labelTarget = math.MaxUint8 +const ( + labelTarget = math.MaxUint8 + labelDirectTarget = math.MaxUint32 +) // ProgramBuilder assists with building a BPF program with jump // labels that are resolved to their proper offsets. @@ -47,6 +50,14 @@ type label struct { target int } +type jmpType int + +const ( + jDirect jmpType = iota + jTrue + jFalse +) + // source contains information about a single reference to a label. type source struct { // Program line where the label reference is present. @@ -54,7 +65,7 @@ type source struct { // True if label reference is in the 'jump if true' part of the jump. // False if label reference is in the 'jump if false' part of the jump. - jt bool + jt jmpType } // AddStmt adds a new statement to the program. 
@@ -67,23 +78,29 @@ func (b *ProgramBuilder) AddJump(code uint16, k uint32, jt, jf uint8) { b.instructions = append(b.instructions, Jump(code, k, jt, jf)) } +// AddDirectJumpLabel adds a new jump to the program where is labelled. +func (b *ProgramBuilder) AddDirectJumpLabel(labelName string) { + b.addLabelSource(labelName, jDirect) + b.AddJump(Jmp|Ja, labelDirectTarget, 0, 0) +} + // AddJumpTrueLabel adds a new jump to the program where 'jump if true' is a label. func (b *ProgramBuilder) AddJumpTrueLabel(code uint16, k uint32, jtLabel string, jf uint8) { - b.addLabelSource(jtLabel, true) + b.addLabelSource(jtLabel, jTrue) b.AddJump(code, k, labelTarget, jf) } // AddJumpFalseLabel adds a new jump to the program where 'jump if false' is a label. func (b *ProgramBuilder) AddJumpFalseLabel(code uint16, k uint32, jt uint8, jfLabel string) { - b.addLabelSource(jfLabel, false) - b.AddJump(code, k, jt, math.MaxUint8) + b.addLabelSource(jfLabel, jFalse) + b.AddJump(code, k, jt, labelTarget) } // AddJumpLabels adds a new jump to the program where both jump targets are labels. func (b *ProgramBuilder) AddJumpLabels(code uint16, k uint32, jtLabel, jfLabel string) { - b.addLabelSource(jtLabel, true) - b.addLabelSource(jfLabel, false) - b.AddJump(code, k, math.MaxUint8, math.MaxUint8) + b.addLabelSource(jtLabel, jTrue) + b.addLabelSource(jfLabel, jFalse) + b.AddJump(code, k, labelTarget, labelTarget) } // AddLabel sets the given label name at the current location. The next instruction is executed @@ -104,20 +121,22 @@ func (b *ProgramBuilder) AddLabel(name string) error { // Instructions returns an array of BPF instructions representing the program with all labels // resolved. Return error in case label resolution failed due to an invalid program. +// +// N.B. Partial results will be returned in the error case, which is useful for debugging. func (b *ProgramBuilder) Instructions() ([]linux.BPFInstruction, error) { if err := b.resolveLabels(); err != nil { - return nil, err + return b.instructions, err } return b.instructions, nil } -func (b *ProgramBuilder) addLabelSource(labelName string, jt bool) { +func (b *ProgramBuilder) addLabelSource(labelName string, t jmpType) { l, ok := b.labels[labelName] if !ok { l = &label{sources: make([]source, 0), target: -1} b.labels[labelName] = l } - l.sources = append(l.sources, source{line: len(b.instructions), jt: jt}) + l.sources = append(l.sources, source{line: len(b.instructions), jt: t}) } func (b *ProgramBuilder) resolveLabels() error { @@ -136,21 +155,34 @@ func (b *ProgramBuilder) resolveLabels() error { } // Calculates the jump offset from current line. offset := v.target - s.line - 1 - if offset > math.MaxUint8 { - return fmt.Errorf("jump offset to label '%v' is too large: %v", key, offset) - } // Sets offset into jump instruction. 
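The reason for AddDirectJumpLabel is that an unconditional BPF jump (Ja) carries a 32-bit offset, while conditional jumps are limited to 8-bit offsets, so labels more than 255 instructions away can now be reached. A hedged usage sketch of the builder as extended here; the filter is deliberately tiny and skips the architecture check a real filter would need.

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/bpf"
)

func main() {
	b := bpf.NewProgramBuilder()
	// A = seccomp_data.nr
	b.AddStmt(bpf.Ld|bpf.Abs|bpf.W, 0)
	// if (A == SYS_GETPID) goto allow
	b.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, 39, "allow", 0)
	// Unconditional jump, resolved through a 32-bit offset, so it can reach
	// labels far beyond the 255-instruction conditional-jump range.
	b.AddDirectJumpLabel("violation")

	if err := b.AddLabel("allow"); err != nil {
		panic(err)
	}
	b.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW)

	if err := b.AddLabel("violation"); err != nil {
		panic(err)
	}
	b.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_KILL)

	insns, err := b.Instructions()
	if err != nil {
		panic(err)
	}
	fmt.Println(len(insns), "instructions")
}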
- if s.jt { + switch s.jt { + case jDirect: + if offset > labelDirectTarget { + return fmt.Errorf("jump offset to label '%v' is too large: %v, inst: %v, lineno: %v", key, offset, inst, s.line) + } + if inst.K != labelDirectTarget { + return fmt.Errorf("jump target is not a label") + } + inst.K = uint32(offset) + case jTrue: + if offset > labelTarget { + return fmt.Errorf("jump offset to label '%v' is too large: %v, inst: %v, lineno: %v", key, offset, inst, s.line) + } if inst.JumpIfTrue != labelTarget { return fmt.Errorf("jump target is not a label") } inst.JumpIfTrue = uint8(offset) - } else { + case jFalse: + if offset > labelTarget { + return fmt.Errorf("jump offset to label '%v' is too large: %v, inst: %v, lineno: %v", key, offset, inst, s.line) + } if inst.JumpIfFalse != labelTarget { return fmt.Errorf("jump target is not a label") } inst.JumpIfFalse = uint8(offset) } + b.instructions[s.line] = inst } } diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index 1e19b1d25..cadd24505 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -21,14 +21,18 @@ go_library( name = "seccomp", srcs = [ "seccomp.go", + "seccomp_rules.go", "seccomp_unsafe.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/seccomp", visibility = ["//visibility:public"], deps = [ + "//pkg/abi", "//pkg/abi/linux", "//pkg/bpf", "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/strace", ], ) diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 7ee63140c..cd6b0b4bc 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -12,24 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package seccomp provides basic seccomp filters. +// Package seccomp provides basic seccomp filters for x86_64 (little endian). package seccomp import ( "fmt" + "reflect" "sort" + "gvisor.googlesource.com/gvisor/pkg/abi" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/strace" ) const ( // violationLabel is added to the program to take action on a violation. violationLabel = "violation" - // allowLabel is added to the program to allow the syscall to take place. - allowLabel = "allow" + // skipOneInst is the offset to take for skipping one instruction. + skipOneInst = 1 ) // Install generates BPF code based on the set of syscalls provided. It only @@ -38,27 +42,19 @@ const ( // // (*) The current implementation only checks the syscall number. It does NOT // validate any of the arguments. -func Install(syscalls []uintptr, kill bool) error { - // Sort syscalls and remove duplicates to build the BST. 
- sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) - syscalls = filterUnique(syscalls) - - log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(syscalls), kill) - for _, s := range syscalls { - log.Infof("syscall filter: %v", s) - } - - instrs, err := buildProgram(syscalls, kill) - if err != nil { - return err - } +func Install(rules SyscallRules, kill bool) error { + log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill) + instrs, err := buildProgram(rules, kill) if log.IsLogging(log.Debug) { - programStr, err := bpf.DecodeProgram(instrs) - if err != nil { - programStr = fmt.Sprintf("Error: %v\n%s", err, programStr) + programStr, errDecode := bpf.DecodeProgram(instrs) + if errDecode != nil { + programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr) } log.Debugf("Seccomp program dump:\n%s", programStr) } + if err != nil { + return err + } if err := seccomp(instrs); err != nil { return err @@ -68,11 +64,8 @@ func Install(syscalls []uintptr, kill bool) error { return nil } -// buildProgram builds a BPF program that whitelists all given syscalls. -// -// Precondition: syscalls must be sorted and unique. -func buildProgram(syscalls []uintptr, kill bool) ([]linux.BPFInstruction, error) { - const archOffset = 4 // offsetof(seccomp_data, arch) +// buildProgram builds a BPF program that whitelists all given syscall rules. +func buildProgram(rules SyscallRules, kill bool) ([]linux.BPFInstruction, error) { program := bpf.NewProgramBuilder() violationAction := uint32(linux.SECCOMP_RET_KILL) if !kill { @@ -83,10 +76,13 @@ func buildProgram(syscalls []uintptr, kill bool) ([]linux.BPFInstruction, error) // // A = seccomp_data.arch // if (A != AUDIT_ARCH_X86_64) goto violation - program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, archOffset) - program.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, 0, violationLabel) + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch) + // violationLabel is at the bottom of the program. The size of program + // may exceeds 255 lines, which is the limit of a condition jump. + program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, skipOneInst, 0) + program.AddDirectJumpLabel(violationLabel) - if err := buildIndex(syscalls, program); err != nil { + if err := buildIndex(rules, program); err != nil { return nil, err } @@ -96,41 +92,34 @@ func buildProgram(syscalls []uintptr, kill bool) ([]linux.BPFInstruction, error) } program.AddStmt(bpf.Ret|bpf.K, violationAction) - // allow: return SECCOMP_RET_ALLOW - if err := program.AddLabel(allowLabel); err != nil { - return nil, err - } - program.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW) - return program.Instructions() } -// filterUnique filters unique system calls. -// -// Precondition: syscalls must be sorted. -func filterUnique(syscalls []uintptr) []uintptr { - filtered := make([]uintptr, 0, len(syscalls)) - for i := 0; i < len(syscalls); i++ { - if len(filtered) > 0 && syscalls[i] == filtered[len(filtered)-1] { - // This call has already been inserted, skip. - continue - } - filtered = append(filtered, syscalls[i]) +// buildIndex builds a BST to quickly search through all syscalls that are whitelisted. 
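From a caller's point of view, the new Install signature takes a SyscallRules map rather than a flat list of syscall numbers. Below is a minimal sketch of building and installing such rules, reusing the rule shapes documented in seccomp_rules.go in this change; the futex constant names are taken from that documentation and should be treated as illustrative.

package main

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/seccomp"
)

func main() {
	rules := seccomp.SyscallRules{
		// No rules: getpid(2) is allowed with any arguments.
		syscall.SYS_GETPID: {},
		// futex(2) is allowed only for private wait/wake operations.
		syscall.SYS_FUTEX: []seccomp.Rule{
			{
				seccomp.AllowAny{},
				seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
			},
			{
				seccomp.AllowAny{},
				seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
			},
		},
	}
	// The second argument chooses the violation action: true kills the
	// offending task, false selects the non-fatal action.
	if err := seccomp.Install(rules, false); err != nil {
		panic(err)
	}
	// From here on, any syscall outside the whitelist triggers the
	// violation action.
}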
+func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error { + syscalls := []uintptr{} + for sysno, _ := range rules { + syscalls = append(syscalls, sysno) + } + + t, ok := strace.Lookup(abi.Linux, arch.AMD64) + if !ok { + panic("Can't find amd64 Linux syscall table") + } + + sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) + for _, s := range syscalls { + log.Infof("syscall filter: %v (%v): %s", s, t.Name(s), rules[s]) } - return filtered -} -// buildIndex builds a BST to quickly search through all syscalls that are whitelisted. -// -// Precondition: syscalls must be sorted and unique. -func buildIndex(syscalls []uintptr, program *bpf.ProgramBuilder) error { root := createBST(syscalls) + root.root = true // Load syscall number into A and run through BST. // // A = seccomp_data.nr - program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, 0) - return root.buildBSTProgram(program, true) + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR) + return root.traverse(buildBSTProgram, program, rules) } // createBST converts sorted syscall slice into a balanced BST. @@ -147,64 +136,128 @@ func createBST(syscalls []uintptr) *node { return &parent } -// node represents a tree node. -type node struct { - value uintptr - left *node - right *node +func ruleViolationLabel(sysno uintptr, idx int) string { + return fmt.Sprintf("ruleViolation_%v_%v", sysno, idx) } -// label returns the label corresponding to this node. If node is nil (syscall not present), -// violationLabel is returned for convenience. -func (n *node) label() string { - if n == nil { - return violationLabel +func checkArgsLabel(sysno uintptr) string { + return fmt.Sprintf("checkArgs_%v", sysno) +} + +func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error { + for ruleidx, rule := range rules { + labelled := false + for i, arg := range rule { + if arg != nil { + switch a := arg.(type) { + case AllowAny: + case AllowValue: + high, low := uint32(a>>32), uint32(a) + // assert arg_low == low + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(sysno, ruleidx)) + // assert arg_high == high + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(sysno, ruleidx)) + labelled = true + + default: + return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a)) + } + } + } + // Matched, allow the syscall. + p.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW) + // Label the end of the rule if necessary. + if labelled { + if err := p.AddLabel(ruleViolationLabel(sysno, ruleidx)); err != nil { + return err + } + } } - return fmt.Sprintf("index_%v", n.value) + // Not matched? + p.AddDirectJumpLabel(violationLabel) + return nil } // buildBSTProgram converts a binary tree started in 'root' into BPF code. The ouline of the code // is as follows: // // // SYS_PIPE(22), root -// (A == 22) ? goto allow : continue +// (A == 22) ? goto argument check : continue // (A > 22) ? goto index_35 : goto index_9 // // index_9: // SYS_MMAP(9), leaf -// (A == 9) ? goto allow : goto violation +// A == 9) ? goto argument check : violation // // index_35: // SYS_NANOSLEEP(35), single child -// (A == 35) ? goto allow : continue +// (A == 35) ? goto argument check : continue // (A > 35) ? goto index_50 : goto violation // // index_50: // SYS_LISTEN(50), leaf -// (A == 50) ? goto allow : goto violation +// (A == 50) ? 
goto argument check : goto violation // -func (n *node) buildBSTProgram(program *bpf.ProgramBuilder, root bool) error { - if n == nil { - return nil - } - +func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error { // Root node is never referenced by label, skip it. - if !root { + if !n.root { if err := program.AddLabel(n.label()); err != nil { return err } } - // Leaf nodes don't require extra check, they either allow or violate! + sysno := n.value + program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0) if n.left == nil && n.right == nil { - program.AddJumpLabels(bpf.Jmp|bpf.Jeq|bpf.K, uint32(n.value), allowLabel, violationLabel) + // Leaf nodes don't require extra check. + program.AddDirectJumpLabel(violationLabel) + } else { + // Non-leaf node. Check which turn to take otherwise. Using direct jumps + // in case that the offset may exceed the limit of a conditional jump (255) + // Note that 'violationLabel' is returned for nil children. + program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst) + program.AddDirectJumpLabel(n.right.label()) + program.AddDirectJumpLabel(n.left.label()) + } + + if err := program.AddLabel(checkArgsLabel(sysno)); err != nil { + return err + } + // No rules, just allow it and save one jmp. + if len(rules[sysno]) == 0 { + program.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW) return nil } + return addSyscallArgsCheck(program, rules[sysno], sysno) +} - // Non-leaf node. Allows syscall if it matches, check which turn to take otherwise. Note - // that 'violationLabel' is returned for nil children. - program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(n.value), allowLabel, 0) - program.AddJumpLabels(bpf.Jmp|bpf.Jgt|bpf.K, uint32(n.value), n.right.label(), n.left.label()) +// node represents a tree node. +type node struct { + value uintptr + left *node + right *node + root bool +} + +// label returns the label corresponding to this node. If node is nil (syscall not present), +// violationLabel is returned for convenience. +func (n *node) label() string { + if n == nil { + return violationLabel + } + return fmt.Sprintf("index_%v", n.value) +} - if err := n.left.buildBSTProgram(program, false); err != nil { +type traverseFunc func(*bpf.ProgramBuilder, SyscallRules, *node) error + +func (n *node) traverse(fn traverseFunc, p *bpf.ProgramBuilder, rules SyscallRules) error { + if n == nil { + return nil + } + if err := fn(p, rules, n); err != nil { + return err + } + if err := n.left.traverse(fn, p, rules); err != nil { return err } - return n.right.buildBSTProgram(program, false) + return n.right.traverse(fn, p, rules) } diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go new file mode 100644 index 000000000..892ccabb4 --- /dev/null +++ b/pkg/seccomp/seccomp_rules.go @@ -0,0 +1,119 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
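Looking back at addSyscallArgsCheck above: classic BPF operates on 32-bit words, so each 64-bit AllowValue is matched with two comparisons, one against the argument's low word and one against its high word (loaded from seccompDataOffsetArgLow and seccompDataOffsetArgHigh). A standalone sketch of the exact split it performs, using an arbitrary example value:

```go
package main

import "fmt"

func main() {
	// An arbitrary 64-bit syscall argument value to match against.
	v := uint64(0xdeadbeefcafe)

	// Same split as addSyscallArgsCheck: compare the low and high 32-bit
	// halves separately, since the BPF accumulator holds only 32 bits.
	high, low := uint32(v>>32), uint32(v)

	fmt.Printf("high=%#x low=%#x\n", high, low) // high=0xdead low=0xbeefcafe
}
```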
+ +package seccomp + +import "fmt" + +// The offsets are based on the following struct in include/linux/seccomp.h. +// struct seccomp_data { +// int nr; +// __u32 arch; +// __u64 instruction_pointer; +// __u64 args[6]; +// }; +const ( + seccompDataOffsetNR = 0 + seccompDataOffsetArch = 4 + seccompDataOffsetArgs = 16 +) + +func seccompDataOffsetArgLow(i int) uint32 { + return uint32(seccompDataOffsetArgs + i*8) +} + +func seccompDataOffsetArgHigh(i int) uint32 { + return uint32(seccompDataOffsetArgs + i*8 + 4) +} + +// AllowAny is marker to indicate any value will be accepted. +type AllowAny struct{} + +func (a AllowAny) String() (s string) { + return "*" +} + +// AllowValue specifies a value that needs to be strictly matched. +type AllowValue uintptr + +func (a AllowValue) String() (s string) { + return fmt.Sprintf("%#x ", uintptr(a)) +} + +// Rule stores the whitelist of syscall arguments. +// +// For example: +// rule := Rule { +// AllowValue(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0 +// } +type Rule [6]interface{} + +func (r Rule) String() (s string) { + if len(r) == 0 { + return + } + s += "( " + for _, arg := range r { + if arg != nil { + s += fmt.Sprintf("%v ", arg) + } + } + s += ")" + return +} + +// SyscallRules stores a map of OR'ed whitelist rules indexed by the syscall number. +// If the 'Rules' is empty, we treat it as any argument is allowed. +// +// For example: +// rules := SyscallRules{ +// syscall.SYS_FUTEX: []Rule{ +// { +// AllowAny{}, +// AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), +// }, // OR +// { +// AllowAny{}, +// AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), +// }, +// }, +// syscall.SYS_GETPID: []Rule{}, +// } +type SyscallRules map[uintptr][]Rule + +// NewSyscallRules returns a new SyscallRules. +func NewSyscallRules() SyscallRules { + return make(map[uintptr][]Rule) +} + +// AddRule adds the given rule. It will create a new entry for a new syscall, otherwise +// it will append to the existing rules. +func (sr SyscallRules) AddRule(sysno uintptr, r Rule) { + if _, ok := sr[sysno]; ok { + sr[sysno] = append(sr[sysno], r) + } else { + sr[sysno] = []Rule{r} + } +} + +// Merge merges the given SyscallRules. +func (sr SyscallRules) Merge(rules SyscallRules) { + for sysno, rs := range rules { + if _, ok := sr[sysno]; ok { + sr[sysno] = append(sr[sysno], rs...) + } else { + sr[sysno] = rs + } + } +} diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index c700d88d6..d3aca7ee9 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -19,10 +19,10 @@ import ( "fmt" "io" "io/ioutil" + "math" "math/rand" "os" "os/exec" - "sort" "strings" "testing" "time" @@ -77,12 +77,12 @@ func TestBasic(t *testing.T) { for _, test := range []struct { // filters are the set of syscall that are allowed. 
- filters []uintptr + filters SyscallRules kill bool specs []spec }{ { - filters: []uintptr{1}, + filters: SyscallRules{1: {}}, kill: false, specs: []spec{ { @@ -98,8 +98,12 @@ func TestBasic(t *testing.T) { }, }, { - filters: []uintptr{1, 3, 5}, - kill: false, + filters: SyscallRules{ + 1: {}, + 3: {}, + 5: {}, + }, + kill: false, specs: []spec{ { desc: "Multiple syscalls allowed (1)", @@ -144,7 +148,7 @@ func TestBasic(t *testing.T) { }, }, { - filters: []uintptr{1}, + filters: SyscallRules{1: {}}, kill: false, specs: []spec{ { @@ -155,7 +159,7 @@ func TestBasic(t *testing.T) { }, }, { - filters: []uintptr{1}, + filters: SyscallRules{1: {}}, kill: true, specs: []spec{ { @@ -165,8 +169,96 @@ func TestBasic(t *testing.T) { }, }, }, + { + filters: SyscallRules{ + 1: []Rule{ + { + AllowAny{}, + AllowValue(0xf), + }, + }, + }, + kill: false, + specs: []spec{ + { + desc: "Syscall argument allowed", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0xf, 0xf}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "Syscall argument disallowed", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0xf, 0xe}}, + want: linux.SECCOMP_RET_TRAP, + }, + }, + }, + { + filters: SyscallRules{ + 1: []Rule{ + { + AllowValue(0xf), + }, + { + AllowValue(0xe), + }, + }, + }, + kill: false, + specs: []spec{ + { + desc: "Syscall argument allowed, two rules", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0xf}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "Syscall argument allowed, two rules", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0xe}}, + want: linux.SECCOMP_RET_ALLOW, + }, + }, + }, + { + filters: SyscallRules{ + 1: []Rule{ + { + AllowValue(0), + AllowValue(math.MaxUint64 - 1), + AllowValue(math.MaxUint32), + }, + }, + }, + kill: false, + specs: []spec{ + { + desc: "64bit syscall argument allowed", + data: seccompData{ + nr: 1, + arch: linux.AUDIT_ARCH_X86_64, + args: [6]uint64{0, math.MaxUint64 - 1, math.MaxUint32}, + }, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "64bit syscall argument disallowed", + data: seccompData{ + nr: 1, + arch: linux.AUDIT_ARCH_X86_64, + args: [6]uint64{0, math.MaxUint64, math.MaxUint32}, + }, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "64bit syscall argument disallowed", + data: seccompData{ + nr: 1, + arch: linux.AUDIT_ARCH_X86_64, + args: [6]uint64{0, math.MaxUint64, math.MaxUint32 - 1}, + }, + want: linux.SECCOMP_RET_TRAP, + }, + }, + }, } { - sort.Slice(test.filters, func(i, j int) bool { return test.filters[i] < test.filters[j] }) instrs, err := buildProgram(test.filters, test.kill) if err != nil { t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err) @@ -193,19 +285,16 @@ func TestBasic(t *testing.T) { func TestRandom(t *testing.T) { rand.Seed(time.Now().UnixNano()) size := rand.Intn(50) + 1 - syscalls := make([]uintptr, 0, size) - syscallMap := make(map[uintptr]struct{}) - for len(syscalls) < size { + syscallRules := make(map[uintptr][]Rule) + for len(syscallRules) < size { n := uintptr(rand.Intn(200)) - if _, ok := syscallMap[n]; !ok { - syscalls = append(syscalls, n) - syscallMap[n] = struct{}{} + if _, ok := syscallRules[n]; !ok { + syscallRules[n] = []Rule{} } } - sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) - fmt.Printf("Testing filters: %v", syscalls) - instrs, err := buildProgram(syscalls, false) + fmt.Printf("Testing filters: %v", syscallRules) + instrs, err := buildProgram(syscallRules, false) if 
err != nil { t.Fatalf("buildProgram() got error: %v", err) } @@ -221,7 +310,7 @@ func TestRandom(t *testing.T) { continue } want := uint32(linux.SECCOMP_RET_TRAP) - if _, ok := syscallMap[uintptr(i)]; ok { + if _, ok := syscallRules[uintptr(i)]; ok { want = linux.SECCOMP_RET_ALLOW } if got != want { diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index fe3f96901..4f2ae4dac 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -29,76 +29,81 @@ func main() { dieFlag := flag.Bool("die", false, "trips over the filter if true") flag.Parse() - syscalls := []uintptr{ - syscall.SYS_ACCEPT, - syscall.SYS_ARCH_PRCTL, - syscall.SYS_BIND, - syscall.SYS_BRK, - syscall.SYS_CLOCK_GETTIME, - syscall.SYS_CLONE, - syscall.SYS_CLOSE, - syscall.SYS_DUP, - syscall.SYS_DUP2, - syscall.SYS_EPOLL_CREATE1, - syscall.SYS_EPOLL_CTL, - syscall.SYS_EPOLL_WAIT, - syscall.SYS_EXIT, - syscall.SYS_EXIT_GROUP, - syscall.SYS_FALLOCATE, - syscall.SYS_FCHMOD, - syscall.SYS_FCNTL, - syscall.SYS_FSTAT, - syscall.SYS_FSYNC, - syscall.SYS_FTRUNCATE, - syscall.SYS_FUTEX, - syscall.SYS_GETDENTS64, - syscall.SYS_GETPEERNAME, - syscall.SYS_GETPID, - syscall.SYS_GETSOCKNAME, - syscall.SYS_GETSOCKOPT, - syscall.SYS_GETTID, - syscall.SYS_GETTIMEOFDAY, - syscall.SYS_LISTEN, - syscall.SYS_LSEEK, - syscall.SYS_MADVISE, - syscall.SYS_MINCORE, - syscall.SYS_MMAP, - syscall.SYS_MPROTECT, - syscall.SYS_MUNLOCK, - syscall.SYS_MUNMAP, - syscall.SYS_NANOSLEEP, - syscall.SYS_NEWFSTATAT, - syscall.SYS_OPEN, - syscall.SYS_POLL, - syscall.SYS_PREAD64, - syscall.SYS_PSELECT6, - syscall.SYS_PWRITE64, - syscall.SYS_READ, - syscall.SYS_READLINKAT, - syscall.SYS_READV, - syscall.SYS_RECVMSG, - syscall.SYS_RENAMEAT, - syscall.SYS_RESTART_SYSCALL, - syscall.SYS_RT_SIGACTION, - syscall.SYS_RT_SIGPROCMASK, - syscall.SYS_RT_SIGRETURN, - syscall.SYS_SCHED_YIELD, - syscall.SYS_SENDMSG, - syscall.SYS_SETITIMER, - syscall.SYS_SET_ROBUST_LIST, - syscall.SYS_SETSOCKOPT, - syscall.SYS_SHUTDOWN, - syscall.SYS_SIGALTSTACK, - syscall.SYS_SOCKET, - syscall.SYS_SYNC_FILE_RANGE, - syscall.SYS_TGKILL, - syscall.SYS_UTIMENSAT, - syscall.SYS_WRITE, - syscall.SYS_WRITEV, + syscalls := seccomp.SyscallRules{ + syscall.SYS_ACCEPT: {}, + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_BIND: {}, + syscall.SYS_BRK: {}, + syscall.SYS_CLOCK_GETTIME: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_DUP2: {}, + syscall.SYS_EPOLL_CREATE1: {}, + syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_EPOLL_WAIT: {}, + syscall.SYS_EPOLL_PWAIT: {}, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FALLOCATE: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCNTL: {}, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_GETDENTS64: {}, + syscall.SYS_GETPEERNAME: {}, + syscall.SYS_GETPID: {}, + syscall.SYS_GETSOCKNAME: {}, + syscall.SYS_GETSOCKOPT: {}, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_LISTEN: {}, + syscall.SYS_LSEEK: {}, + syscall.SYS_MADVISE: {}, + syscall.SYS_MINCORE: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNLOCK: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_NEWFSTATAT: {}, + syscall.SYS_OPEN: {}, + syscall.SYS_POLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PSELECT6: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, + syscall.SYS_READLINKAT: {}, + syscall.SYS_READV: {}, + syscall.SYS_RECVMSG: {}, + 
syscall.SYS_RENAMEAT: {}, + syscall.SYS_RESTART_SYSCALL: {}, + syscall.SYS_RT_SIGACTION: {}, + syscall.SYS_RT_SIGPROCMASK: {}, + syscall.SYS_RT_SIGRETURN: {}, + syscall.SYS_SCHED_YIELD: {}, + syscall.SYS_SENDMSG: {}, + syscall.SYS_SETITIMER: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + syscall.SYS_SETSOCKOPT: {}, + syscall.SYS_SHUTDOWN: {}, + syscall.SYS_SIGALTSTACK: {}, + syscall.SYS_SOCKET: {}, + syscall.SYS_SYNC_FILE_RANGE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_UTIMENSAT: {}, + syscall.SYS_WRITE: {}, + syscall.SYS_WRITEV: {}, } die := *dieFlag if !die { - syscalls = append(syscalls, syscall.SYS_OPENAT) + syscalls[syscall.SYS_OPENAT] = []seccomp.Rule{ + { + seccomp.AllowValue(10), + }, + } } if err := seccomp.Install(syscalls, false); err != nil { @@ -107,6 +112,6 @@ func main() { } fmt.Printf("Filters installed\n") - syscall.RawSyscall(syscall.SYS_OPENAT, 0, 0, 0) + syscall.RawSyscall(syscall.SYS_OPENAT, 10, 0, 0) fmt.Printf("Syscall was allowed!!!\n") } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 130e987df..86c256c5b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -18,77 +18,78 @@ import ( "syscall" "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/seccomp" ) // allowedSyscalls is the set of syscalls executed by the Sentry // to the host OS. -var allowedSyscalls = []uintptr{ - syscall.SYS_ACCEPT, - syscall.SYS_ARCH_PRCTL, - syscall.SYS_CLOCK_GETTIME, - syscall.SYS_CLONE, - syscall.SYS_CLOSE, - syscall.SYS_DUP, - syscall.SYS_DUP2, - syscall.SYS_EPOLL_CREATE1, - syscall.SYS_EPOLL_CTL, - syscall.SYS_EPOLL_PWAIT, - syscall.SYS_EPOLL_WAIT, - syscall.SYS_EVENTFD2, - syscall.SYS_EXIT, - syscall.SYS_EXIT_GROUP, - syscall.SYS_FALLOCATE, - syscall.SYS_FCHMOD, - syscall.SYS_FCNTL, - syscall.SYS_FSTAT, - syscall.SYS_FSYNC, - syscall.SYS_FTRUNCATE, - syscall.SYS_FUTEX, - syscall.SYS_GETDENTS64, - syscall.SYS_GETPID, - unix.SYS_GETRANDOM, - syscall.SYS_GETSOCKOPT, - syscall.SYS_GETTID, - syscall.SYS_GETTIMEOFDAY, - syscall.SYS_LISTEN, - syscall.SYS_LSEEK, - syscall.SYS_MADVISE, - syscall.SYS_MINCORE, - syscall.SYS_MMAP, - syscall.SYS_MPROTECT, - syscall.SYS_MUNMAP, - syscall.SYS_NEWFSTATAT, - syscall.SYS_POLL, - syscall.SYS_PREAD64, - syscall.SYS_PSELECT6, - syscall.SYS_PWRITE64, - syscall.SYS_READ, - syscall.SYS_READLINKAT, - syscall.SYS_READV, - syscall.SYS_RECVMSG, - syscall.SYS_RENAMEAT, - syscall.SYS_RESTART_SYSCALL, - syscall.SYS_RT_SIGACTION, - syscall.SYS_RT_SIGPROCMASK, - syscall.SYS_RT_SIGRETURN, - syscall.SYS_SCHED_YIELD, - syscall.SYS_SENDMSG, - syscall.SYS_SETITIMER, - syscall.SYS_SHUTDOWN, - syscall.SYS_SIGALTSTACK, - syscall.SYS_SYNC_FILE_RANGE, - syscall.SYS_TGKILL, - syscall.SYS_UTIMENSAT, - syscall.SYS_WRITE, - syscall.SYS_WRITEV, +var allowedSyscalls = seccomp.SyscallRules{ + syscall.SYS_ACCEPT: {}, + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_CLOCK_GETTIME: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_DUP2: {}, + syscall.SYS_EPOLL_CREATE1: {}, + syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_EPOLL_PWAIT: {}, + syscall.SYS_EPOLL_WAIT: {}, + syscall.SYS_EVENTFD2: {}, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FALLOCATE: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCNTL: {}, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_GETDENTS64: {}, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, + syscall.SYS_GETSOCKOPT: {}, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, 
+ syscall.SYS_LISTEN: {}, + syscall.SYS_LSEEK: {}, + syscall.SYS_MADVISE: {}, + syscall.SYS_MINCORE: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NEWFSTATAT: {}, + syscall.SYS_POLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PSELECT6: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, + syscall.SYS_READLINKAT: {}, + syscall.SYS_READV: {}, + syscall.SYS_RECVMSG: {}, + syscall.SYS_RENAMEAT: {}, + syscall.SYS_RESTART_SYSCALL: {}, + syscall.SYS_RT_SIGACTION: {}, + syscall.SYS_RT_SIGPROCMASK: {}, + syscall.SYS_RT_SIGRETURN: {}, + syscall.SYS_SCHED_YIELD: {}, + syscall.SYS_SENDMSG: {}, + syscall.SYS_SETITIMER: {}, + syscall.SYS_SHUTDOWN: {}, + syscall.SYS_SIGALTSTACK: {}, + syscall.SYS_SYNC_FILE_RANGE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_UTIMENSAT: {}, + syscall.SYS_WRITE: {}, + syscall.SYS_WRITEV: {}, } // TODO: Ioctl is needed in order to support tty consoles. // Once filters support argument-checking, we should only allow ioctl // with tty-related arguments. -func consoleFilters() []uintptr { - return []uintptr{ - syscall.SYS_IOCTL, +func consoleFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_IOCTL: {}, } } @@ -97,79 +98,79 @@ func consoleFilters() []uintptr { // file operations that would otherwise be disabled by seccomp when a Gofer is // used. When whitelistFS is not used, openning new FD in the Sentry is // disallowed. -func whitelistFSFilters() []uintptr { - return []uintptr{ - syscall.SYS_ACCESS, - syscall.SYS_FCHMOD, - syscall.SYS_FSTAT, - syscall.SYS_FSYNC, - syscall.SYS_FTRUNCATE, - syscall.SYS_GETCWD, - syscall.SYS_GETDENTS, - syscall.SYS_GETDENTS64, - syscall.SYS_LSEEK, - syscall.SYS_LSTAT, - syscall.SYS_MKDIR, - syscall.SYS_MKDIRAT, - syscall.SYS_NEWFSTATAT, - syscall.SYS_OPEN, - syscall.SYS_OPENAT, - syscall.SYS_PREAD64, - syscall.SYS_PWRITE64, - syscall.SYS_READ, - syscall.SYS_READLINK, - syscall.SYS_READLINKAT, - syscall.SYS_RENAMEAT, - syscall.SYS_STAT, - syscall.SYS_SYMLINK, - syscall.SYS_SYMLINKAT, - syscall.SYS_SYNC_FILE_RANGE, - syscall.SYS_UNLINK, - syscall.SYS_UNLINKAT, - syscall.SYS_UTIMENSAT, - syscall.SYS_WRITE, +func whitelistFSFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ACCESS: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_GETCWD: {}, + syscall.SYS_GETDENTS: {}, + syscall.SYS_GETDENTS64: {}, + syscall.SYS_LSEEK: {}, + syscall.SYS_LSTAT: {}, + syscall.SYS_MKDIR: {}, + syscall.SYS_MKDIRAT: {}, + syscall.SYS_NEWFSTATAT: {}, + syscall.SYS_OPEN: {}, + syscall.SYS_OPENAT: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, + syscall.SYS_READLINK: {}, + syscall.SYS_READLINKAT: {}, + syscall.SYS_RENAMEAT: {}, + syscall.SYS_STAT: {}, + syscall.SYS_SYMLINK: {}, + syscall.SYS_SYMLINKAT: {}, + syscall.SYS_SYNC_FILE_RANGE: {}, + syscall.SYS_UNLINK: {}, + syscall.SYS_UNLINKAT: {}, + syscall.SYS_UTIMENSAT: {}, + syscall.SYS_WRITE: {}, } } // hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. 
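With argument matching now available, the ioctl TODO above could eventually be tightened along these lines. This is only a sketch, not part of the patch: it reuses the syscall, unix, and seccomp imports already present in this file, and it assumes TCGETS and TIOCGWINSZ are the requests the console actually needs; the real list would have to be verified.

```go
// Hypothetical replacement for consoleFilters once the needed tty ioctls are known.
func consoleFiltersRestricted() seccomp.SyscallRules {
	return seccomp.SyscallRules{
		syscall.SYS_IOCTL: []seccomp.Rule{
			// ioctl(fd, TCGETS, ...) for terminal attributes.
			{seccomp.AllowAny{}, seccomp.AllowValue(unix.TCGETS)},
			// ioctl(fd, TIOCGWINSZ, ...) for the window size.
			{seccomp.AllowAny{}, seccomp.AllowValue(unix.TIOCGWINSZ)},
		},
	}
}
```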
-func hostInetFilters() []uintptr { - return []uintptr{ - syscall.SYS_ACCEPT4, - syscall.SYS_BIND, - syscall.SYS_CONNECT, - syscall.SYS_GETPEERNAME, - syscall.SYS_GETSOCKNAME, - syscall.SYS_GETSOCKOPT, - syscall.SYS_IOCTL, - syscall.SYS_LISTEN, - syscall.SYS_READV, - syscall.SYS_RECVFROM, - syscall.SYS_RECVMSG, - syscall.SYS_SENDMSG, - syscall.SYS_SENDTO, - syscall.SYS_SETSOCKOPT, - syscall.SYS_SHUTDOWN, - syscall.SYS_SOCKET, - syscall.SYS_WRITEV, +func hostInetFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ACCEPT4: {}, + syscall.SYS_BIND: {}, + syscall.SYS_CONNECT: {}, + syscall.SYS_GETPEERNAME: {}, + syscall.SYS_GETSOCKNAME: {}, + syscall.SYS_GETSOCKOPT: {}, + syscall.SYS_IOCTL: {}, + syscall.SYS_LISTEN: {}, + syscall.SYS_READV: {}, + syscall.SYS_RECVFROM: {}, + syscall.SYS_RECVMSG: {}, + syscall.SYS_SENDMSG: {}, + syscall.SYS_SENDTO: {}, + syscall.SYS_SETSOCKOPT: {}, + syscall.SYS_SHUTDOWN: {}, + syscall.SYS_SOCKET: {}, + syscall.SYS_WRITEV: {}, } } // ptraceFilters returns syscalls made exclusively by the ptrace platform. -func ptraceFilters() []uintptr { - return []uintptr{ - syscall.SYS_PTRACE, - syscall.SYS_WAIT4, - unix.SYS_GETCPU, - unix.SYS_SCHED_SETAFFINITY, +func ptraceFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_PTRACE: {}, + syscall.SYS_WAIT4: {}, + unix.SYS_GETCPU: {}, + unix.SYS_SCHED_SETAFFINITY: {}, } } // kvmFilters returns syscalls made exclusively by the KVM platform. -func kvmFilters() []uintptr { - return []uintptr{ - syscall.SYS_IOCTL, - syscall.SYS_RT_SIGSUSPEND, - syscall.SYS_RT_SIGTIMEDWAIT, - 0xffffffffffffffff, // KVM uses syscall -1 to transition to host. +func kvmFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_IOCTL: {}, + syscall.SYS_RT_SIGSUSPEND: {}, + syscall.SYS_RT_SIGTIMEDWAIT: {}, + 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. } } diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go index e10d9bf4c..82cf00dfb 100644 --- a/runsc/boot/filter/extra_filters.go +++ b/runsc/boot/filter/extra_filters.go @@ -16,9 +16,13 @@ package filter +import ( + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + // instrumentationFilters returns additional filters for syscalls used by // Go intrumentation tools, e.g. -race, -msan. // Returns empty when disabled. -func instrumentationFilters() []uintptr { +func instrumentationFilters() seccomp.SyscallRules { return nil } diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index a862340f6..76f3f6865 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -18,13 +18,15 @@ package filter import ( "syscall" + + "gvisor.googlesource.com/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by MSAN. 
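These per-subsystem sets are combined in the filter.Install changes below with SyscallRules.Merge instead of slice appends. Merge preserves the OR semantics of rules: when both sides list the same syscall, their rule lists are concatenated. A small illustration (not from the patch; the request values are arbitrary):

```go
// Two rule sets that each allow ioctl(2) with a single, arbitrary request value.
a := seccomp.SyscallRules{
	syscall.SYS_IOCTL: []seccomp.Rule{{seccomp.AllowAny{}, seccomp.AllowValue(0x5401)}},
}
b := seccomp.SyscallRules{
	syscall.SYS_IOCTL: []seccomp.Rule{{seccomp.AllowAny{}, seccomp.AllowValue(0x5413)}},
}

a.Merge(b)
// a[syscall.SYS_IOCTL] now holds both rules, so either request value is accepted.
```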
-func instrumentationFilters() []uintptr { +func instrumentationFilters() seccomp.SyscallRules { Report("MSAN is enabled: syscall filters less restrictive!") - return []uintptr{ - syscall.SYS_SCHED_GETAFFINITY, - syscall.SYS_SET_ROBUST_LIST, + return seccomp.SyscallRules{ + syscall.SYS_SCHED_GETAFFINITY: {}, + syscall.SYS_SET_ROBUST_LIST: {}, } } diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index b0c74a58a..c810773df 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -18,16 +18,21 @@ package filter import ( "syscall" + + "gvisor.googlesource.com/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by TSAN. -func instrumentationFilters() []uintptr { +func instrumentationFilters() seccomp.SyscallRules { Report("TSAN is enabled: syscall filters less restrictive!") - return []uintptr{ - syscall.SYS_BRK, - syscall.SYS_MUNLOCK, - syscall.SYS_NANOSLEEP, - syscall.SYS_OPEN, - syscall.SYS_SET_ROBUST_LIST, + return seccomp.SyscallRules{ + syscall.SYS_BRK: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_MUNLOCK: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_OPEN: {}, + syscall.SYS_SET_ROBUST_LIST: {}, } } diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index 3ba56a318..6ea9c464e 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -33,26 +33,26 @@ func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error // Set of additional filters used by -race and -msan. Returns empty // when not enabled. - s = append(s, instrumentationFilters()...) + s.Merge(instrumentationFilters()) if whitelistFS { Report("direct file access allows unrestricted file access!") - s = append(s, whitelistFSFilters()...) + s.Merge(whitelistFSFilters()) } if console { Report("console is enabled: syscall filters less restrictive!") - s = append(s, consoleFilters()...) + s.Merge(consoleFilters()) } if hostNetwork { Report("host networking enabled: syscall filters less restrictive!") - s = append(s, hostInetFilters()...) + s.Merge(hostInetFilters()) } switch p := p.(type) { case *ptrace.PTrace: - s = append(s, ptraceFilters()...) + s.Merge(ptraceFilters()) case *kvm.KVM: - s = append(s, kvmFilters()...) + s.Merge(kvmFilters()) default: return fmt.Errorf("unknown platform type %T", p) } -- cgit v1.2.3 From a0e2126be49e5eda45dcaead497129c08e08a1e5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 4 Jun 2018 11:25:40 -0700 Subject: Refactor container_test in preparation for sandbox_test Common code to setup and run sandbox is moved to testutil. Also, don't link "boot" and "gofer" commands with test binary. Instead, use runsc binary from the build. This not only make the test setup simpler, but also resolves a dependency issue with sandbox_tests not depending on container package. 
PiperOrigin-RevId: 199164478 Change-Id: I27226286ca3f914d4d381358270dd7d70ee8372f --- runsc/BUILD | 3 + runsc/container/BUILD | 9 +- runsc/container/container_test.go | 172 ++++++++------------------------------ runsc/specutils/specutils.go | 8 +- runsc/test/testutil/BUILD | 17 ++++ runsc/test/testutil/testutil.go | 133 +++++++++++++++++++++++++++++ 6 files changed, 198 insertions(+), 144 deletions(-) create mode 100644 runsc/test/testutil/BUILD create mode 100644 runsc/test/testutil/testutil.go (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index 8f8e2ee35..2f0bbaf2b 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -8,6 +8,9 @@ go_binary( "main.go", ], pure = "on", + visibility = [ + "//runsc:__subpackages__", + ], x_defs = {"main.gitRevision": "{GIT_REVISION}"}, deps = [ "//pkg/log", diff --git a/runsc/container/BUILD b/runsc/container/BUILD index c558b4b0a..fe477abf2 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -27,18 +27,17 @@ go_test( name = "container_test", size = "small", srcs = ["container_test.go"], - pure = "on", - rundir = ".", + data = [ + "//runsc", + ], deps = [ "//pkg/abi/linux", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", "//pkg/unet", - "//runsc/boot", - "//runsc/cmd", "//runsc/container", - "@com_github_google_subcommands//:go_default_library", + "//runsc/test/testutil", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 24e9de3ce..0844cb9df 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -15,7 +15,6 @@ package container_test import ( - "encoding/json" "fmt" "io" "io/ioutil" @@ -29,9 +28,6 @@ import ( "testing" "time" - "context" - "flag" - "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -39,80 +35,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/cmd" "gvisor.googlesource.com/gvisor/runsc/container" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) func init() { log.SetLevel(log.Debug) -} - -// writeSpec writes the spec to disk in the given directory. -func writeSpec(dir string, spec *specs.Spec) error { - b, err := json.Marshal(spec) - if err != nil { - return err - } - return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) -} - -// newSpecWithArgs creates a simple spec with the given args suitable for use -// in tests. -func newSpecWithArgs(args ...string) *specs.Spec { - spec := &specs.Spec{ - // The host filesystem root is the container root. - Root: &specs.Root{ - Path: "/", - Readonly: true, - }, - Process: &specs.Process{ - Args: args, - Env: []string{ - "PATH=" + os.Getenv("PATH"), - }, - }, - } - return spec -} - -// setupContainer creates a bundle and root dir for the container, generates a -// test config, and writes the spec to config.json in the bundle dir. 
-func setupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { - rootDir, err = ioutil.TempDir("", "containers") - if err != nil { - return "", "", nil, fmt.Errorf("error creating root dir: %v", err) - } - - bundleDir, err = ioutil.TempDir("", "bundle") - if err != nil { - return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err) - } - - if err = writeSpec(bundleDir, spec); err != nil { - return "", "", nil, fmt.Errorf("error writing spec: %v", err) + if err := testutil.ConfigureExePath(); err != nil { + panic(err.Error()) } - - conf = &boot.Config{ - RootDir: rootDir, - Network: boot.NetworkNone, - // Don't add flags when calling subprocesses, since the test - // runner does not know about all the flags. We control the - // Config in the subprocess anyways, so it does not matter. - TestModeNoFlags: true, - } - - return rootDir, bundleDir, conf, nil -} - -// uniqueContainerID generates a unique container id for each test. -// -// The container id is used to create an abstract unix domain socket, which must -// be unique. While the container forbids creating two containers with the same -// name, sometimes between test runs the socket does not get cleaned up quickly -// enough, causing container creation to fail. -func uniqueContainerID() string { - return fmt.Sprintf("test-container-%d", time.Now().UnixNano()) } // waitForProcessList waits for the given process list to show up in the container. @@ -167,9 +98,9 @@ func procListToString(pl []*control.Process) string { func TestLifecycle(t *testing.T) { // The container will just sleep for a long time. We will kill it before // it finishes sleeping. - spec := newSpecWithArgs("sleep", "100") + spec := testutil.NewSpecWithArgs("sleep", "100") - rootDir, bundleDir, conf, err := setupContainer(spec) + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -187,7 +118,7 @@ func TestLifecycle(t *testing.T) { }, } // Create the container. - id := uniqueContainerID() + id := testutil.UniqueContainerID() if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { t.Fatalf("error creating container: %v", err) } @@ -298,13 +229,13 @@ func TestExePath(t *testing.T) { {path: "bin/thisfiledoesntexit", success: false}, {path: "/bin/thisfiledoesntexit", success: false}, } { - spec := newSpecWithArgs(test.path) - rootDir, bundleDir, conf, err := setupContainer(spec) + spec := testutil.NewSpecWithArgs(test.path) + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { t.Fatalf("exec: %s, error setting up container: %v", test.path, err) } - ws, err := container.Run(uniqueContainerID(), spec, conf, bundleDir, "", "") + ws, err := container.Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") os.RemoveAll(rootDir) os.RemoveAll(bundleDir) @@ -327,16 +258,16 @@ func TestExePath(t *testing.T) { // Test the we can retrieve the application exit status from the container. func TestAppExitStatus(t *testing.T) { // First container will succeed. 
- succSpec := newSpecWithArgs("true") + succSpec := testutil.NewSpecWithArgs("true") - rootDir, bundleDir, conf, err := setupContainer(succSpec) + rootDir, bundleDir, conf, err := testutil.SetupContainer(succSpec) if err != nil { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - ws, err := container.Run(uniqueContainerID(), succSpec, conf, bundleDir, "", "") + ws, err := container.Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error running container: %v", err) } @@ -346,16 +277,16 @@ func TestAppExitStatus(t *testing.T) { // Second container exits with non-zero status. wantStatus := 123 - errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) + errSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) - rootDir2, bundleDir2, conf, err := setupContainer(errSpec) + rootDir2, bundleDir2, conf, err := testutil.SetupContainer(errSpec) if err != nil { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(rootDir2) defer os.RemoveAll(bundleDir2) - ws, err = container.Run(uniqueContainerID(), succSpec, conf, bundleDir2, "", "") + ws, err = container.Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir2, "", "") if err != nil { t.Fatalf("error running container: %v", err) } @@ -367,9 +298,9 @@ func TestAppExitStatus(t *testing.T) { // TestExec verifies that a container can exec a new program. func TestExec(t *testing.T) { const uid = 343 - spec := newSpecWithArgs("sleep", "100") + spec := testutil.NewSpecWithArgs("sleep", "100") - rootDir, bundleDir, conf, err := setupContainer(spec) + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -377,7 +308,7 @@ func TestExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(uniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -454,7 +385,7 @@ func TestExec(t *testing.T) { func TestCapabilities(t *testing.T) { const uid = 343 const gid = 2401 - spec := newSpecWithArgs("sleep", "100") + spec := testutil.NewSpecWithArgs("sleep", "100") // We generate files in the host temporary directory. spec.Mounts = append(spec.Mounts, specs.Mount{ @@ -463,7 +394,7 @@ func TestCapabilities(t *testing.T) { Type: "bind", }) - rootDir, bundleDir, conf, err := setupContainer(spec) + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -471,7 +402,7 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(uniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -540,8 +471,8 @@ func TestCapabilities(t *testing.T) { // Test that an tty FD is sent over the console socket if one is provided. 
func TestConsoleSocket(t *testing.T) { - spec := newSpecWithArgs("true") - rootDir, bundleDir, conf, err := setupContainer(spec) + spec := testutil.NewSpecWithArgs("true") + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -569,7 +500,7 @@ func TestConsoleSocket(t *testing.T) { defer os.Remove(socketPath) // Create the container and pass the socket name. - id := uniqueContainerID() + id := testutil.UniqueContainerID() s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") if err != nil { t.Fatalf("error creating container: %v", err) @@ -618,21 +549,21 @@ func TestConsoleSocket(t *testing.T) { } func TestSpecUnsupported(t *testing.T) { - spec := newSpecWithArgs("/bin/true") + spec := testutil.NewSpecWithArgs("/bin/true") spec.Process.SelinuxLabel = "somelabel" // These are normally set by docker and will just cause warnings to be logged. spec.Process.ApparmorProfile = "someprofile" spec.Linux = &specs.Linux{Seccomp: &specs.LinuxSeccomp{}} - rootDir, bundleDir, conf, err := setupContainer(spec) + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - id := uniqueContainerID() + id := testutil.UniqueContainerID() _, err = container.Create(id, spec, conf, bundleDir, "", "") if err == nil || !strings.Contains(err.Error(), "is not supported") { t.Errorf("container.Create() wrong error, got: %v, want: *is not supported, spec.Process: %+v", err, spec.Process) @@ -642,14 +573,17 @@ func TestSpecUnsupported(t *testing.T) { // TestRunNonRoot checks that sandbox can be configured when running as // non-priviledged user. func TestRunNonRoot(t *testing.T) { - spec := newSpecWithArgs("/bin/true") + spec := testutil.NewSpecWithArgs("/bin/true") spec.Process.User.UID = 343 spec.Process.User.GID = 2401 // User that container runs as can't list '$TMP/blocked' and would fail to // mount it. - dir := path.Join(os.TempDir(), "blocked") - if err := os.Mkdir(dir, 0700); err != nil { + dir, err := ioutil.TempDir("", "blocked") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + if err := os.Chmod(dir, 0700); err != nil { t.Fatalf("os.MkDir(%q) failed: %v", dir, err) } dir = path.Join(dir, "test") @@ -664,7 +598,7 @@ func TestRunNonRoot(t *testing.T) { Type: "bind", }) - rootDir, bundleDir, conf, err := setupContainer(spec) + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -672,7 +606,7 @@ func TestRunNonRoot(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := container.Create(uniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -688,39 +622,3 @@ func TestRunNonRoot(t *testing.T) { t.Errorf("container failed, waitStatus: %v", ws) } } - -// TestMain acts like runsc if it is called with the "boot" argument, otherwise -// it just runs the tests. This is required because creating a container will -// call "/proc/self/exe boot". Normally /proc/self/exe is the runsc binary, -// but for tests we have to fake it. -func TestMain(m *testing.M) { - // exit writes coverage data before exiting. 
- exit := func(status int) { - os.Exit(status) - } - - if !flag.Parsed() { - flag.Parse() - } - - // If we are passed one of the commands then run it. - subcommands.Register(new(cmd.Boot), "boot") - subcommands.Register(new(cmd.Gofer), "gofer") - switch flag.Arg(0) { - case "boot", "gofer": - conf := &boot.Config{ - RootDir: "unused-root-dir", - Network: boot.NetworkNone, - } - var ws syscall.WaitStatus - subcmdCode := subcommands.Execute(context.Background(), conf, &ws) - if subcmdCode != subcommands.ExitSuccess { - panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode)) - } - // Container exited. Shut down this process. - exit(ws.ExitStatus()) - default: - // Otherwise run the tests. - exit(m.Run()) - } -} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 5f455dec4..3161360b4 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -32,6 +32,10 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ) +// ExePath must point to runsc binary, which is normally the same binary. It's +// changed in tests that aren't linked in the same binary. +var ExePath = "/proc/self/exe" + // LogSpec logs the spec in a human-friendly way. func LogSpec(spec *specs.Spec) { log.Debugf("Spec: %+v", spec) @@ -197,9 +201,9 @@ func Is9PMount(m specs.Mount) bool { // BinPath returns the real path to self, resolving symbolink links. This is done // to make the process name appears as 'runsc', instead of 'exe'. func BinPath() (string, error) { - binPath, err := filepath.EvalSymlinks("/proc/self/exe") + binPath, err := filepath.EvalSymlinks(ExePath) if err != nil { - return "", fmt.Errorf(`error resolving "/proc/self/exe" symlink: %v`, err) + return "", fmt.Errorf(`error resolving %q symlink: %v`, ExePath, err) } return binPath, nil } diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD new file mode 100644 index 000000000..2c2555d98 --- /dev/null +++ b/runsc/test/testutil/BUILD @@ -0,0 +1,17 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "testutil", + srcs = ["testutil.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//runsc/boot", + "//runsc/specutils", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go new file mode 100644 index 000000000..87db0a170 --- /dev/null +++ b/runsc/test/testutil/testutil.go @@ -0,0 +1,133 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil contains utility functions for runsc tests. 
+package testutil + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// ConfigureExePath configures the executable for runsc in the test environment. +func ConfigureExePath() error { + + // runsc is in a directory like: 'runsc/linux_amd64_pure_stripped/runsc'. + // Since I don't want to construct 'linux_amd64_pure_stripped' based on the + // build type, do a quick search for: 'runsc/*/runsc' + exePath := "" + lv1 := "./runsc" + lv1fis, err := ioutil.ReadDir(lv1) + if err != nil { + return err + } + for _, fi := range lv1fis { + if !fi.IsDir() { + continue + } + lv2fis, err := ioutil.ReadDir(filepath.Join(lv1, fi.Name())) + if err != nil { + return err + } + for _, candidate := range lv2fis { + if !candidate.IsDir() && candidate.Name() == "runsc" { + exePath, err = filepath.Abs(filepath.Join(lv1, fi.Name(), candidate.Name())) + if err != nil { + return err + } + break + } + } + } + if exePath == "" { + return fmt.Errorf("path to runsc not found") + } + specutils.ExePath = exePath + return nil +} + +// NewSpecWithArgs creates a simple spec with the given args suitable for use +// in tests. +func NewSpecWithArgs(args ...string) *specs.Spec { + spec := &specs.Spec{ + // The host filesystem root is the container root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: args, + Env: []string{ + "PATH=" + os.Getenv("PATH"), + }, + }, + } + return spec +} + +// SetupContainer creates a bundle and root dir for the container, generates a +// test config, and writes the spec to config.json in the bundle dir. +func SetupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { + rootDir, err = ioutil.TempDir("", "containers") + if err != nil { + return "", "", nil, fmt.Errorf("error creating root dir: %v", err) + } + + bundleDir, err = ioutil.TempDir("", "bundle") + if err != nil { + return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err) + } + + if err = writeSpec(bundleDir, spec); err != nil { + return "", "", nil, fmt.Errorf("error writing spec: %v", err) + } + + conf = &boot.Config{ + RootDir: rootDir, + Network: boot.NetworkNone, + // Don't add flags when calling subprocesses, since the test + // runner does not know about all the flags. We control the + // Config in the subprocess anyways, so it does not matter. + TestModeNoFlags: true, + } + + return rootDir, bundleDir, conf, nil +} + +// writeSpec writes the spec to disk in the given directory. +func writeSpec(dir string, spec *specs.Spec) error { + b, err := json.Marshal(spec) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) +} + +// UniqueContainerID generates a unique container id for each test. +// +// The container id is used to create an abstract unix domain socket, which must +// be unique. While the container forbids creating two containers with the same +// name, sometimes between test runs the socket does not get cleaned up quickly +// enough, causing container creation to fail. 
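With these helpers in place, a typical container test in this tree takes roughly the following shape. This is a condensed sketch of the pattern used by the tests above; the TestSomething name and workload are placeholders.

```go
package container_test

import (
	"os"
	"testing"

	"gvisor.googlesource.com/gvisor/runsc/container"
	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)

func TestSomething(t *testing.T) {
	// testutil.ConfigureExePath must already have run (the tests above call it
	// from init) so the spawned sandbox uses the built runsc binary.
	spec := testutil.NewSpecWithArgs("/bin/true")

	rootDir, bundleDir, conf, err := testutil.SetupContainer(spec)
	if err != nil {
		t.Fatalf("error setting up container: %v", err)
	}
	defer os.RemoveAll(rootDir)
	defer os.RemoveAll(bundleDir)

	// Create, start and wait for the container.
	c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "")
	if err != nil {
		t.Fatalf("error creating container: %v", err)
	}
	defer c.Destroy()
	if err := c.Start(conf); err != nil {
		t.Fatalf("error starting container: %v", err)
	}

	ws, err := c.Wait()
	if err != nil {
		t.Fatalf("error waiting on container: %v", err)
	}
	if !ws.Exited() || ws.ExitStatus() != 0 {
		t.Errorf("container failed, waitStatus: %v", ws)
	}
}
```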
+func UniqueContainerID() string { + return fmt.Sprintf("test-container-%d", time.Now().UnixNano()) +} -- cgit v1.2.3 From 55a37ceef1e33cc72236db6e95f159963ddf40bd Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 4 Jun 2018 11:51:27 -0700 Subject: Fix leaky FD 9P socket was being created without CLOEXEC and was being inherited by the children. This would prevent the gofer from detecting that the sandbox had exited, because the socket would not be closed. PiperOrigin-RevId: 199168959 Change-Id: I3ee1a07cbe7331b0aeb1cf2b697e728ce24f85a7 --- runsc/sandbox/BUILD | 16 +++++++++- runsc/sandbox/sandbox.go | 2 +- runsc/sandbox/sandbox_test.go | 74 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 runsc/sandbox/sandbox_test.go (limited to 'runsc') diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index e89b19552..a961c3cc7 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "sandbox", @@ -28,3 +28,17 @@ go_library( "@org_golang_x_sys//unix:go_default_library", ], ) + +go_test( + name = "sandbox_test", + size = "small", + srcs = ["sandbox_test.go"], + data = [ + "//runsc", + ], + embed = [":sandbox"], + deps = [ + "//pkg/log", + "//runsc/test/testutil", + ], +) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index a9486cfdc..91c44c996 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -195,7 +195,7 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle goferEnds := make([]*os.File, 0, mountCount) for i := 0; i < mountCount; i++ { // Create socket that connects the sandbox and gofer. - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) if err != nil { return nil, err } diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go new file mode 100644 index 000000000..e25290d5e --- /dev/null +++ b/runsc/sandbox/sandbox_test.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "os" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +func init() { + log.SetLevel(log.Debug) + if err := testutil.ConfigureExePath(); err != nil { + panic(err.Error()) + } +} + +func TestGoferExits(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create, start and wait for the container. 
+ s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start("123", spec, conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + sandboxProc, err := os.FindProcess(s.Pid) + if err != nil { + t.Fatalf("error finding sandbox process: %v", err) + } + gofer, err := os.FindProcess(s.GoferPid) + if err != nil { + t.Fatalf("error finding sandbox process: %v", err) + } + + // Kill sandbox and expect gofer to exit on its own. + if err := sandboxProc.Kill(); err != nil { + t.Fatalf("error killing sandbox process: %v", err) + } + if _, err := sandboxProc.Wait(); err != nil { + t.Fatalf("error waiting for sandbox process: %v", err) + } + + if _, err := gofer.Wait(); err != nil { + t.Fatalf("error waiting for gofer process: %v", err) + } + if s.IsRunning() { + t.Errorf("Sandbox shouldn't be running, sandbox: %+v", s) + } +} -- cgit v1.2.3 From 78ccd1298e1386d9c5e0eb10d328ecb16b28ea02 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 4 Jun 2018 12:13:33 -0700 Subject: Return 'running' if gofer is still alive Containerd will start deleting container and rootfs after container is stopped. However, if gofer is still running, rootfs cleanup will fail because of device busy. This CL makes sure that gofer is not running when container state is stopped. Change from: lantaol@google.com PiperOrigin-RevId: 199172668 Change-Id: I9d874eec3ecf74fd9c8edd7f62d9f998edef66fe --- runsc/container/container_test.go | 2 ++ runsc/sandbox/sandbox.go | 24 +++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 0844cb9df..3af8d620c 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -186,6 +186,8 @@ func TestLifecycle(t *testing.T) { // ourselves. p, _ := os.FindProcess(s.Sandbox.Pid) p.Wait() + g, _ := os.FindProcess(s.Sandbox.GoferPid) + g.Wait() // Load the container from disk and check the status. s, err = container.Load(rootDir, id) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 91c44c996..bfaead1f2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -440,13 +440,27 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { return nil } -// IsRunning returns true iff the sandbox process is running. +// IsRunning returns true if the sandbox or gofer process is running. func (s *Sandbox) IsRunning() bool { - // Send a signal 0 to the sandbox process. - if err := killProcess(s.Pid, 0); err != nil { - return false + if s.Pid != 0 { + // Send a signal 0 to the sandbox process. + if err := killProcess(s.Pid, 0); err == nil { + return true + } + } + if s.GoferPid != 0 { + // Send a signal 0 to the gofer process. + if err := killProcess(s.GoferPid, 0); err == nil { + log.Warningf("Found orphan gofer process, pid: %d", s.GoferPid) + // Attempt to kill gofer if it's orphan. + killProcess(s.GoferPid, unix.SIGKILL) + + // Don't wait for gofer to die. Return 'running' and hope gofer is dead + // next time around. + return true + } } - return true + return false } // killProcess sends a signal to the host process (i.e. 
a sandbox or gofer -- cgit v1.2.3 From 6c585b8eb69362db9af5ed150763096874832b86 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 4 Jun 2018 12:30:47 -0700 Subject: Create destination mount dir if it doesn't exist PiperOrigin-RevId: 199175296 Change-Id: I694ad1cfa65572c92f77f22421fdcac818f44630 --- runsc/boot/fs.go | 12 ++++++- runsc/container/container_test.go | 72 +++++++++++++++++++++++++++++---------- 2 files changed, 65 insertions(+), 19 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e5b7663d0..82bbea4d7 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -288,11 +288,21 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. if useOverlay { log.Debugf("Adding overlay on top of mount %q", m.Destination) - if inode, err = addOverlay(ctx, conf, inode, m.Type, mf); err != nil { + inode, err = addOverlay(ctx, conf, inode, m.Type, mf) + if err != nil { return err } } + // Create destination in case it doesn't exist. This is required, in addition + // to 'addSubmountOverlay', in case there are symlinks to create directories + // in the right location, e.g. + // mount: /var/run/secrets, may be created in '/run/secrets' if + // '/var/run' => '/var'. + if err := mkdirAll(ctx, mns, m.Destination); err != nil { + return err + } + root := mns.Root() defer root.DecRef() dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 3af8d620c..da59c0331 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -92,6 +92,35 @@ func procListToString(pl []*control.Process) string { return fmt.Sprintf("[%s]", strings.Join(strs, ",")) } +// run starts the sandbox and waits for it to exit, checking that the +// application succeeded. +func run(spec *specs.Spec) error { + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + return fmt.Errorf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create, start and wait for the container. + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + return fmt.Errorf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + return fmt.Errorf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + return fmt.Errorf("error waiting on container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + return fmt.Errorf("container failed, waitStatus: %v", ws) + } + return nil +} + // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. @@ -600,27 +629,34 @@ func TestRunNonRoot(t *testing.T) { Type: "bind", }) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) + if err := run(spec); err != nil { + t.Fatalf("error running sadbox: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) +} - // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) +// TestMountNewDir check that runsc will create destination directory if it +// doesn't exit. 
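The fs.go comment above describes the case the new mkdirAll call guards against: a destination such as /var/run/secrets where /var/run is a symlink into /run, so the directory must be created at the resolved location before the bind mount is applied. A spec-construction excerpt in the style of the surrounding tests that would exercise it (host paths are illustrative only):

```go
// hostSecretsDir is a hypothetical host directory to bind into the sandbox.
hostSecretsDir := "/tmp/host-secrets"

spec := testutil.NewSpecWithArgs("/bin/ls", "/var/run/secrets")
spec.Mounts = append(spec.Mounts, specs.Mount{
	// If /var/run resolves to /run inside the sandbox, the destination
	// directory is created at the resolved location before mounting.
	Destination: "/var/run/secrets",
	Source:      hostSecretsDir,
	Type:        "bind",
})
```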
+func TestMountNewDir(t *testing.T) { + srcDir := path.Join(os.TempDir(), "src", "newdir", "anotherdir") + if err := os.MkdirAll(srcDir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - ws, err := s.Wait() - if err != nil { - t.Errorf("error waiting on container: %v", err) + + // Attempt to remove dir to ensure it doesn't exist. + mountDir := path.Join(os.TempDir(), "newdir") + if err := os.RemoveAll(mountDir); err != nil { + t.Fatalf("os.RemoveAll(%q) failed: %v", mountDir, err) } - if !ws.Exited() || ws.ExitStatus() != 0 { - t.Errorf("container failed, waitStatus: %v", ws) + mountDir = path.Join(mountDir, "anotherdir") + + spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: mountDir, + Source: srcDir, + Type: "bind", + }) + + if err := run(spec); err != nil { + t.Fatalf("error running sadbox: %v", err) } } -- cgit v1.2.3 From 19a0e83b50fbcfd89927baedbb1f1fd14dc448ca Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 4 Jun 2018 18:04:05 -0700 Subject: Make fsgofer attach more strict Refuse to mount paths with "." and ".." in the path to prevent a compromised Sentry to mount "../../secrets". Only allow Attach to be called once per mount point. PiperOrigin-RevId: 199225929 Change-Id: I2a3eb7ea0b23f22eb8dde2e383e32563ec003bd5 --- runsc/fsgofer/fsgofer.go | 25 ++++++++++++++++++++----- runsc/fsgofer/fsgofer_test.go | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index cd6224de3..f685738c3 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -26,7 +26,6 @@ import ( "math" "os" "path" - "path/filepath" "strings" "sync" "syscall" @@ -83,6 +82,9 @@ type Config struct { type attachPoint struct { prefix string conf Config + + mu sync.Mutex + attached bool } // NewAttachPoint creates a new attacher that gives local file @@ -93,19 +95,22 @@ func NewAttachPoint(prefix string, c Config) p9.Attacher { // Attach implements p9.Attacher. func (a *attachPoint) Attach(appPath string) (p9.File, error) { + // Only proceed if 'appPath' is valid. 
if !path.IsAbs(appPath) { return nil, fmt.Errorf("invalid path %q", appPath) } + if path.Clean(appPath) != appPath { + return nil, fmt.Errorf("invalid path %q", appPath) + } - root := filepath.Join(a.prefix, appPath) + root := path.Join(a.prefix, appPath) fi, err := os.Stat(root) if err != nil { return nil, err } - - mode := syscall.O_RDWR + mode := os.O_RDWR if a.conf.ROMount || fi.IsDir() { - mode = syscall.O_RDONLY + mode = os.O_RDONLY } f, err := os.OpenFile(root, mode|openFlags, 0) @@ -114,8 +119,18 @@ func (a *attachPoint) Attach(appPath string) (p9.File, error) { } stat, err := stat(int(f.Fd())) if err != nil { + f.Close() return nil, fmt.Errorf("failed to stat file %q, err: %v", root, err) } + + a.mu.Lock() + defer a.mu.Unlock() + if a.attached { + f.Close() + return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) + } + a.attached = true + return newLocalFile(a.conf, f, root, stat) } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 249f67bf9..8d038eaf6 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -19,6 +19,7 @@ import ( "io/ioutil" "os" "path" + "strings" "syscall" "testing" @@ -622,3 +623,45 @@ func TestAttachFile(t *testing.T) { t.Fatalf("ReadAt() wrong data, got: %s, expected: %s", string(rBuf), "foobar") } } + +func TestAttachError(t *testing.T) { + conf := Config{ROMount: false} + root, err := ioutil.TempDir("", "root-") + if err != nil { + t.Fatalf("ioutil.TempDir() failed, err: %v", err) + } + defer os.RemoveAll(root) + a := NewAttachPoint(root, conf) + + c := path.Join(root, "test") + if err := os.Mkdir(c, 0700); err != nil { + t.Fatalf("os.Create(%q) failed, err: %v", c, err) + } + + for _, p := range []string{"test", "/test/../", "/test/./", "/test//"} { + _, err := a.Attach(p) + if err == nil { + t.Fatalf("Attach(%q) should have failed", p) + } + if want := "invalid path"; !strings.Contains(err.Error(), want) { + t.Fatalf("Attach(%q) wrong error, got: %v, wanted: %v", p, err, want) + } + } +} + +func TestDoubleAttachError(t *testing.T) { + conf := Config{ROMount: false} + root, err := ioutil.TempDir("", "root-") + if err != nil { + t.Fatalf("ioutil.TempDir() failed, err: %v", err) + } + defer os.RemoveAll(root) + a := NewAttachPoint(root, conf) + + if _, err := a.Attach("/"); err != nil { + t.Fatalf("Attach(%q) failed: %v", "/", err) + } + if _, err := a.Attach("/"); err == nil { + t.Fatalf("Attach(%q) should have failed", "test") + } +} -- cgit v1.2.3 From 722275c3d1a7b420915e6e6a3d623ae941c494cf Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 6 Jun 2018 11:43:01 -0700 Subject: Added a function to the controller to checkpoint a container. Functionality for checkpoint is not complete, more to come. 
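For reference, a rough sketch of the call a client could eventually make over urpc (a fragment only: 'conn' stands for an already-established connection to the sandbox's control server, and the image path and metadata are illustrative; the SaveOpts fields match the new pkg/sentry/control/state.go below):

    // Open the destination for the state file; it travels to the sentry as a
    // urpc file payload and becomes SaveOpts.FilePayload inside the sandbox.
    f, err := os.OpenFile("/tmp/ckpt.img", os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
    if err != nil {
            return err
    }
    defer f.Close()
    opts := control.SaveOpts{
            Metadata:    map[string]string{"container": "example"},
            FilePayload: urpc.FilePayload{Files: []*os.File{f}},
    }
    // "containerManager.Checkpoint" is the endpoint registered by the new
    // controller code in this change.
    if err := conn.Call("containerManager.Checkpoint", &opts, nil); err != nil {
            return fmt.Errorf("checkpoint failed: %v", err)
    }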
PiperOrigin-RevId: 199500803 Change-Id: Iafb0fcde68c584270000fea898e6657a592466f7 --- pkg/sentry/control/BUILD | 4 +++ pkg/sentry/control/state.go | 73 +++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/controller.go | 19 +++++++++++- runsc/boot/loader.go | 6 ++-- 4 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 pkg/sentry/control/state.go (limited to 'runsc') diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 4d1d0d019..6169891f7 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "control.go", "proc.go", + "state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/control", visibility = [ @@ -14,6 +15,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/fs/host", "//pkg/sentry/kernel", @@ -21,7 +23,9 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", + "//pkg/sentry/state", "//pkg/sentry/usage", + "//pkg/sentry/watchdog", "//pkg/urpc", ], ) diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go new file mode 100644 index 000000000..cee4db636 --- /dev/null +++ b/pkg/sentry/control/state.go @@ -0,0 +1,73 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "errors" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/state" + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// ErrInvalidFiles is returned when the urpc call to Save does not include an +// appropriate file payload (e.g. there is no output file!). +var ErrInvalidFiles = errors.New("exactly one file must be provided") + +// State includes state-related functions. +type State struct { + Kernel *kernel.Kernel + Watchdog *watchdog.Watchdog +} + +// SaveOpts contains options for the Save RPC call. +type SaveOpts struct { + // Key is used for state integrity check. + Key []byte `json:"key"` + + // Metadata is the set of metadata to prepend to the state file. + Metadata map[string]string `json:"metadata"` + + // FilePayload contains the destination for the state. + urpc.FilePayload +} + +// Save saves the running system. +func (s *State) Save(o *SaveOpts, _ *struct{}) error { + // Create an output stream. + if len(o.FilePayload.Files) != 1 { + return ErrInvalidFiles + } + defer o.FilePayload.Files[0].Close() + + // Save to the first provided stream. 
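+	// The callback below runs once the save completes: on success the sentry
+	// simply exits; on failure the error is recorded via SetExitError before
+	// the kernel is killed.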
+ saveOpts := state.SaveOpts{ + Destination: o.FilePayload.Files[0], + Key: o.Key, + Metadata: o.Metadata, + Callback: func(err error) { + if err == nil { + log.Infof("Save succeeded: exiting...") + } else { + log.Warningf("Save failed: exiting...") + s.Kernel.SetExitError(err) + } + s.Kernel.Kill(kernel.ExitStatus{}) + }, + } + return saveOpts.Save(s.Kernel, s.Watchdog) +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 8fc0a9076..095b0a9b9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,9 +22,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" ) const ( + // ContainerCheckpoint checkpoints a container. + ContainerCheckpoint = "containerManager.Checkpoint" + // ContainerEvent is the URPC endpoint for getting stats about the // container used by "runsc events". ContainerEvent = "containerManager.Event" @@ -69,7 +73,7 @@ type controller struct { } // newController creates a new controller and starts it listening. -func newController(fd int, k *kernel.Kernel) (*controller, error) { +func newController(fd int, k *kernel.Kernel, w *watchdog.Watchdog) (*controller, error) { srv, err := server.CreateFromFD(fd) if err != nil { return nil, err @@ -79,6 +83,7 @@ func newController(fd int, k *kernel.Kernel) (*controller, error) { startChan: make(chan struct{}), startResultChan: make(chan error), k: k, + watchdog: w, } srv.Register(manager) @@ -113,6 +118,9 @@ type containerManager struct { // k is the emulated linux kernel on which the sandboxed // containers run. k *kernel.Kernel + + // watchdog is the kernel watchdog. + watchdog *watchdog.Watchdog } // StartRoot will start the root container process. @@ -136,6 +144,15 @@ func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) err return nil } +// Checkpoint pauses a sandbox and saves its state. +func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { + state := control.State{ + Kernel: cm.k, + Watchdog: cm.watchdog, + } + return state.Save(o, nil) +} + // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { // TODO: Use the cid and wait on the init process in that diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 76edbb905..41d1ee50d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -186,6 +186,9 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console atomic.StoreUint32(&sniffer.LogPackets, 0) } + // Create a watchdog. + watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning) + // Create the control server using the provided FD. // // This must be done *after* we have initialized the kernel since the @@ -195,7 +198,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // misconfigured process will cause an error, and we want the control // server up before that so that we don't time out trying to connect to // it. - ctrl, err := newController(controllerFD, k) + ctrl, err := newController(controllerFD, k, watchdog) if err != nil { return nil, fmt.Errorf("error creating control server: %v", err) } @@ -254,7 +257,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // the emulated kernel. 
stopSignalForwarding := sighandling.StartForwarding(k) - watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning) return &Loader{ k: k, ctrl: ctrl, -- cgit v1.2.3 From 0c34b460f21d6f756b6402688203cfc5e533caa1 Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 6 Jun 2018 12:31:01 -0700 Subject: Add runsc checkpoint command. Checkpoint command is plumbed through container and sandbox. Restore has also been added but it is only a stub. None of this works yet. More changes to come. PiperOrigin-RevId: 199510105 Change-Id: Ibd08d57f4737847eb25ca20b114518e487320185 --- runsc/cmd/BUILD | 2 ++ runsc/cmd/checkpoint.go | 70 ++++++++++++++++++++++++++++++++++++++++++++ runsc/cmd/restore.go | 51 ++++++++++++++++++++++++++++++++ runsc/container/container.go | 10 +++++++ runsc/sandbox/sandbox.go | 15 ++++++++++ 5 files changed, 148 insertions(+) create mode 100644 runsc/cmd/checkpoint.go create mode 100644 runsc/cmd/restore.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 4b4afa4a0..a8c84a6a3 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -6,6 +6,7 @@ go_library( name = "cmd", srcs = [ "boot.go", + "checkpoint.go", "cmd.go", "create.go", "delete.go", @@ -16,6 +17,7 @@ go_library( "list.go", "path.go", "ps.go", + "restore.go", "run.go", "start.go", "state.go", diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go new file mode 100644 index 000000000..9b045da1c --- /dev/null +++ b/runsc/cmd/checkpoint.go @@ -0,0 +1,70 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" +) + +// Checkpoint implements subcommands.Command for the "checkpoint" command. +type Checkpoint struct { +} + +// Name implements subcommands.Command.Name. +func (*Checkpoint) Name() string { + return "checkpoint" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Checkpoint) Synopsis() string { + return "checkpoint current state of container" +} + +// Usage implements subcommands.Command.Usage. +func (*Checkpoint) Usage() string { + return `checkpoint [flags] - save current state of container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Checkpoint) SetFlags(f *flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. 
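+// It expects exactly one argument, the container id, which is resolved
+// against conf.RootDir before the checkpoint is issued.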
+func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + cont, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading container: %v", err) + } + + if err := cont.Checkpoint(); err != nil { + Fatalf("checkpoint failed: %v", err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go new file mode 100644 index 000000000..a535197a4 --- /dev/null +++ b/runsc/cmd/restore.go @@ -0,0 +1,51 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" +) + +// Restore implements subcommands.Command for the "restore" command. +type Restore struct { +} + +// Name implements subcommands.Command.Name. +func (*Restore) Name() string { + return "restore" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Restore) Synopsis() string { + return "restore a saved state of container" +} + +// Usage implements subcommands.Command.Usage. +func (*Restore) Usage() string { + return `restore [flags] - restore last saved state of container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Restore) SetFlags(f *flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. +func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + Fatalf("restore not implemented") + return subcommands.ExitFailure +} diff --git a/runsc/container/container.go b/runsc/container/container.go index f20ec2453..eee148f5a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -299,6 +299,16 @@ func (c *Container) Signal(sig syscall.Signal) error { return c.Sandbox.Signal(c.ID, sig) } +// Checkpoint sends the checkpoint call to the container. +func (c *Container) Checkpoint() error { + log.Debugf("Checkpoint container %q", c.ID) + if c.Status == Stopped { + log.Warningf("container %q not running, not checkpointing", c.ID) + return nil + } + return c.Sandbox.Checkpoint(c.ID) +} + // State returns the metadata of the container. func (c *Container) State() specs.State { return specs.State{ diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index bfaead1f2..2a434cfb7 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -440,6 +440,21 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { return nil } +// Checkpoint sends the checkpoint call for a container in the sandbox. 
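+// It dials the sandbox's control server and issues the ContainerCheckpoint
+// urpc call; the connection is closed when the call returns.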
+func (s *Sandbox) Checkpoint(cid string) error { + log.Debugf("Checkpoint sandbox %q", s.ID) + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ContainerCheckpoint, nil, nil); err != nil { + return fmt.Errorf("err checkpointing container %q: %v", cid, err) + } + return nil +} + // IsRunning returns true if the sandbox or gofer process is running. func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { -- cgit v1.2.3 From 206e90d057211f2ac53174907b2ff04801f9a481 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 6 Jun 2018 16:12:58 -0700 Subject: runsc: Support abbreviated container IDs. Just a UI/usability addition. It's a lot easier to type "60" than "60185c721d7e10c00489f1fa210ee0d35c594873d6376b457fb1815e4fdbfc2c". PiperOrigin-RevId: 199547932 Change-Id: I19011b5061a88aba48a9ad7f8cf954a6782de854 --- runsc/cmd/delete_test.go | 4 +-- runsc/container/container.go | 43 ++++++++++++++++++++++++++-- runsc/container/container_test.go | 59 ++++++++++++++++++++++++++++++++++++++- runsc/test/testutil/testutil.go | 25 +++++++++++++---- 4 files changed, 121 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go index 928e9ee2c..f6d164394 100644 --- a/runsc/cmd/delete_test.go +++ b/runsc/cmd/delete_test.go @@ -31,11 +31,11 @@ func TestNotFound(t *testing.T) { d := Delete{} if err := d.execute(ids, conf); err == nil { - t.Error("Deleting non-existend container should have failed") + t.Error("Deleting non-existent container should have failed") } d = Delete{force: true} if err := d.execute(ids, conf); err != nil { - t.Errorf("Deleting non-existend container with --force should NOT have failed: %v", err) + t.Errorf("Deleting non-existent container with --force should NOT have failed: %v", err) } } diff --git a/runsc/container/container.go b/runsc/container/container.go index eee148f5a..66a2f27a1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -23,6 +23,7 @@ import ( "path/filepath" "regexp" "strconv" + "strings" "syscall" "time" @@ -92,14 +93,22 @@ type Container struct { Sandbox *sandbox.Sandbox `json:"sandbox"` } -// Load loads a container with the given id from a metadata file. +// Load loads a container with the given id from a metadata file. id may be an +// abbreviation of the full container id, in which case Load loads the +// container to which id unambiguously refers to. // Returns ErrNotExist if container doesn't exits. func Load(rootDir, id string) (*Container, error) { log.Debugf("Load container %q %q", rootDir, id) if err := validateID(id); err != nil { return nil, err } - metaFile := filepath.Join(rootDir, id, metadataFilename) + + cRoot, err := findContainerRoot(rootDir, id) + if err != nil { + return nil, err + } + + metaFile := filepath.Join(cRoot, metadataFilename) metaBytes, err := ioutil.ReadFile(metaFile) if err != nil { if os.IsNotExist(err) { @@ -133,6 +142,36 @@ func Load(rootDir, id string) (*Container, error) { return &c, nil } +func findContainerRoot(rootDir, partialID string) (string, error) { + // Check whether the id fully specifies an existing container. + cRoot := filepath.Join(rootDir, partialID) + if _, err := os.Stat(cRoot); err == nil { + return cRoot, nil + } + + // Now see whether id could be an abbreviation of exactly 1 of the + // container ids. If id is ambigious (it could match more than 1 + // container), it is an error. 
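+	// For example, with containers "foo-<uid>", "bar-<uid>" and "baz-<uid>",
+	// "f" and "bar" resolve uniquely, while "b" and "ba" are ambiguous.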
+ cRoot = "" + ids, err := List(rootDir) + if err != nil { + return "", err + } + for _, id := range ids { + if strings.HasPrefix(id, partialID) { + if cRoot != "" { + return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, cRoot, id) + } + cRoot = id + } + } + if cRoot == "" { + return "", os.ErrNotExist + } + log.Debugf("abbreviated id %q resolves to full id %q", partialID, cRoot) + return filepath.Join(rootDir, cRoot), nil +} + // List returns all container ids in the given root directory. func List(rootDir string) ([]string, error) { log.Debugf("List containers %q", rootDir) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index da59c0331..43cd177ce 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -634,7 +634,7 @@ func TestRunNonRoot(t *testing.T) { } } -// TestMountNewDir check that runsc will create destination directory if it +// TestMountNewDir checks that runsc will create destination directory if it // doesn't exit. func TestMountNewDir(t *testing.T) { srcDir := path.Join(os.TempDir(), "src", "newdir", "anotherdir") @@ -660,3 +660,60 @@ func TestMountNewDir(t *testing.T) { t.Fatalf("error running sadbox: %v", err) } } + +// TestAbbreviatedIDs checks that runsc supports using abbreviated container +// IDs in place of full IDs. +func TestAbbreviatedIDs(t *testing.T) { + cids := []string{ + "foo-" + testutil.UniqueContainerID(), + "bar-" + testutil.UniqueContainerID(), + "baz-" + testutil.UniqueContainerID(), + } + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + for _, cid := range cids { + spec := testutil.NewSpecWithArgs("sleep", "100") + bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := container.Create(cid, spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + } + + // These should all be unambigious. + unambiguous := map[string]string{ + "f": cids[0], + cids[0]: cids[0], + "bar": cids[1], + cids[1]: cids[1], + "baz": cids[2], + cids[2]: cids[2], + } + for shortid, longid := range unambiguous { + if _, err := container.Load(rootDir, shortid); err != nil { + t.Errorf("%q should resolve to %q: %v", shortid, longid, err) + } + } + + // These should be ambiguous. + ambiguous := []string{ + "b", + "ba", + } + for _, shortid := range ambiguous { + if s, err := container.Load(rootDir, shortid); err == nil { + t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID) + } + } +} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 87db0a170..1c8fd3ba2 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -84,21 +84,36 @@ func NewSpecWithArgs(args ...string) *specs.Spec { return spec } +// SetupRootDir creates a root directory for containers. +func SetupRootDir() (string, error) { + rootDir, err := ioutil.TempDir("", "containers") + if err != nil { + return "", fmt.Errorf("error creating root dir: %v", err) + } + return rootDir, nil +} + // SetupContainer creates a bundle and root dir for the container, generates a // test config, and writes the spec to config.json in the bundle dir. 
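// Callers own the returned rootDir and bundleDir and are expected to remove
// them (e.g. with os.RemoveAll) once the test finishes.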
func SetupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { - rootDir, err = ioutil.TempDir("", "containers") + rootDir, err = SetupRootDir() if err != nil { - return "", "", nil, fmt.Errorf("error creating root dir: %v", err) + return "", "", nil, err } + bundleDir, conf, err = SetupContainerInRoot(rootDir, spec) + return rootDir, bundleDir, conf, err +} +// SetupContainerInRoot creates a bundle for the container, generates a test +// config, and writes the spec to config.json in the bundle dir. +func SetupContainerInRoot(rootDir string, spec *specs.Spec) (bundleDir string, conf *boot.Config, err error) { bundleDir, err = ioutil.TempDir("", "bundle") if err != nil { - return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err) + return "", nil, fmt.Errorf("error creating bundle dir: %v", err) } if err = writeSpec(bundleDir, spec); err != nil { - return "", "", nil, fmt.Errorf("error writing spec: %v", err) + return "", nil, fmt.Errorf("error writing spec: %v", err) } conf = &boot.Config{ @@ -110,7 +125,7 @@ func SetupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Con TestModeNoFlags: true, } - return rootDir, bundleDir, conf, nil + return bundleDir, conf, nil } // writeSpec writes the spec to disk in the given directory. -- cgit v1.2.3 From 5c51bc51e43a0f1d1f06ae490b0d352d1b483766 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 8 Jun 2018 09:58:29 -0700 Subject: Drop capabilities not needed by Gofer PiperOrigin-RevId: 199808391 Change-Id: Ib37a4fb6193dc85c1f93bc16769d6aa41854b9d4 --- runsc/boot/BUILD | 2 - runsc/boot/capability.go | 121 ---------------------------------------- runsc/cmd/BUILD | 2 + runsc/cmd/boot.go | 53 +++++++----------- runsc/cmd/capability.go | 142 +++++++++++++++++++++++++++++++++++++++++++++++ runsc/cmd/cmd.go | 26 +++++++++ runsc/cmd/gofer.go | 30 ++++++++++ runsc/sandbox/sandbox.go | 12 ++-- 8 files changed, 226 insertions(+), 162 deletions(-) delete mode 100644 runsc/boot/capability.go create mode 100644 runsc/cmd/capability.go (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 1746df988..73893d699 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "boot", srcs = [ - "capability.go", "config.go", "controller.go", "events.go", @@ -72,7 +71,6 @@ go_library( "//runsc/boot/filter", "//runsc/specutils", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", - "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/boot/capability.go b/runsc/boot/capability.go deleted file mode 100644 index efa28fb97..000000000 --- a/runsc/boot/capability.go +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package boot - -import ( - "fmt" - "os" - - specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/syndtr/gocapability/capability" -) - -// ApplyCaps applies the capabilities in the spec to the current thread. -// -// Note that it must be called with current thread locked. -func ApplyCaps(conf *Config, caps *specs.LinuxCapabilities) error { - setter, err := capability.NewPid2(os.Getpid()) - if err != nil { - return err - } - - bounding, err := capsFromNames(caps.Bounding) - if err != nil { - return err - } - effective, err := capsFromNames(caps.Effective) - if err != nil { - return err - } - permitted, err := capsFromNames(caps.Permitted) - if err != nil { - return err - } - inheritable, err := capsFromNames(caps.Inheritable) - if err != nil { - return err - } - ambient, err := capsFromNames(caps.Ambient) - if err != nil { - return err - } - - // Ptrace platform requires extra capabilities. - if conf.Platform == PlatformPtrace { - bounding = append(bounding, capability.CAP_SYS_PTRACE) - effective = append(effective, capability.CAP_SYS_PTRACE) - permitted = append(permitted, capability.CAP_SYS_PTRACE) - } - - setter.Set(capability.BOUNDS, bounding...) - setter.Set(capability.PERMITTED, permitted...) - setter.Set(capability.INHERITABLE, inheritable...) - setter.Set(capability.EFFECTIVE, effective...) - setter.Set(capability.AMBIENT, ambient...) - return setter.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS) -} - -func capsFromNames(names []string) ([]capability.Cap, error) { - var caps []capability.Cap - for _, name := range names { - cap, ok := capFromName[name] - if !ok { - return nil, fmt.Errorf("invalid capability %q", name) - } - caps = append(caps, cap) - } - return caps, nil -} - -var capFromName = map[string]capability.Cap{ - "CAP_CHOWN": capability.CAP_CHOWN, - "CAP_DAC_OVERRIDE": capability.CAP_DAC_OVERRIDE, - "CAP_DAC_READ_SEARCH": capability.CAP_DAC_READ_SEARCH, - "CAP_FOWNER": capability.CAP_FOWNER, - "CAP_FSETID": capability.CAP_FSETID, - "CAP_KILL": capability.CAP_KILL, - "CAP_SETGID": capability.CAP_SETGID, - "CAP_SETUID": capability.CAP_SETUID, - "CAP_SETPCAP": capability.CAP_SETPCAP, - "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE, - "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE, - "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST, - "CAP_NET_ADMIN": capability.CAP_NET_ADMIN, - "CAP_NET_RAW": capability.CAP_NET_RAW, - "CAP_IPC_LOCK": capability.CAP_IPC_LOCK, - "CAP_IPC_OWNER": capability.CAP_IPC_OWNER, - "CAP_SYS_MODULE": capability.CAP_SYS_MODULE, - "CAP_SYS_RAWIO": capability.CAP_SYS_RAWIO, - "CAP_SYS_CHROOT": capability.CAP_SYS_CHROOT, - "CAP_SYS_PTRACE": capability.CAP_SYS_PTRACE, - "CAP_SYS_PACCT": capability.CAP_SYS_PACCT, - "CAP_SYS_ADMIN": capability.CAP_SYS_ADMIN, - "CAP_SYS_BOOT": capability.CAP_SYS_BOOT, - "CAP_SYS_NICE": capability.CAP_SYS_NICE, - "CAP_SYS_RESOURCE": capability.CAP_SYS_RESOURCE, - "CAP_SYS_TIME": capability.CAP_SYS_TIME, - "CAP_SYS_TTY_CONFIG": capability.CAP_SYS_TTY_CONFIG, - "CAP_MKNOD": capability.CAP_MKNOD, - "CAP_LEASE": capability.CAP_LEASE, - "CAP_AUDIT_WRITE": capability.CAP_AUDIT_WRITE, - "CAP_AUDIT_CONTROL": capability.CAP_AUDIT_CONTROL, - "CAP_SETFCAP": capability.CAP_SETFCAP, - "CAP_MAC_OVERRIDE": capability.CAP_MAC_OVERRIDE, - "CAP_MAC_ADMIN": capability.CAP_MAC_ADMIN, - "CAP_SYSLOG": capability.CAP_SYSLOG, - "CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM, - "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND, - "CAP_AUDIT_READ": capability.CAP_AUDIT_READ, -} diff --git a/runsc/cmd/BUILD 
b/runsc/cmd/BUILD index a8c84a6a3..63d8036bd 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -6,6 +6,7 @@ go_library( name = "cmd", srcs = [ "boot.go", + "capability.go", "checkpoint.go", "cmd.go", "create.go", @@ -39,6 +40,7 @@ go_library( "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 3bdc2ced0..34dd8b3c0 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -16,7 +16,6 @@ package cmd import ( "os" - "runtime" "runtime/debug" "strings" "syscall" @@ -24,7 +23,6 @@ import ( "context" "flag" "github.com/google/subcommands" - specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -106,8 +104,26 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) waitStatus := args[1].(*syscall.WaitStatus) if b.applyCaps { - setCapsAndCallSelf(conf, spec) - Fatalf("setCapsAndCallSelf must never return") + caps := spec.Process.Capabilities + if conf.Platform == boot.PlatformPtrace { + // Ptrace platform requires extra capabilities. + const c = "CAP_SYS_PTRACE" + caps.Bounding = append(caps.Bounding, c) + caps.Effective = append(caps.Effective, c) + caps.Permitted = append(caps.Permitted, c) + } + + // Remove --apply-caps arg to call myself. + var args []string + for _, arg := range os.Args { + if !strings.Contains(arg, "apply-caps") { + args = append(args, arg) + } + } + if err := setCapsAndCallSelf(spec, args, caps); err != nil { + Fatalf("%v", err) + } + panic("setCapsAndCallSelf must never return success") } // Create the loader. @@ -130,32 +146,3 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) *waitStatus = syscall.WaitStatus(ws.Status()) return subcommands.ExitSuccess } - -// setCapsAndCallSelf sets capabilities to the current thread and then execve's -// itself again with the same arguments except '--apply-caps' to restart the -// whole process with the desired capabilities. -func setCapsAndCallSelf(conf *boot.Config, spec *specs.Spec) { - // Keep thread locked while capabilities are changed. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - if err := boot.ApplyCaps(conf, spec.Process.Capabilities); err != nil { - Fatalf("ApplyCaps, err: %v", err) - } - binPath, err := specutils.BinPath() - if err != nil { - Fatalf("%v", err) - } - - // Remove --apply-caps arg to call myself. - var args []string - for _, arg := range os.Args { - if !strings.Contains(arg, "apply-caps") { - args = append(args, arg) - } - } - - log.Infof("Execve 'boot' again, bye!") - log.Infof("%s %v", binPath, args) - syscall.Exec(binPath, args, []string{}) -} diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go new file mode 100644 index 000000000..0209feb1b --- /dev/null +++ b/runsc/cmd/capability.go @@ -0,0 +1,142 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "fmt" + "os" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// applyCaps applies the capabilities in the spec to the current thread. +// +// Note that it must be called with current thread locked. +func applyCaps(caps *specs.LinuxCapabilities) error { + setter, err := capability.NewPid2(os.Getpid()) + if err != nil { + return err + } + if err := setter.Load(); err != nil { + return err + } + + bounding, err := trimCaps(caps.Bounding, setter) + if err != nil { + return err + } + setter.Set(capability.BOUNDS, bounding...) + + effective, err := trimCaps(caps.Effective, setter) + if err != nil { + return err + } + setter.Set(capability.EFFECTIVE, effective...) + + permitted, err := trimCaps(caps.Permitted, setter) + if err != nil { + return err + } + setter.Set(capability.PERMITTED, permitted...) + + inheritable, err := trimCaps(caps.Inheritable, setter) + if err != nil { + return err + } + setter.Set(capability.INHERITABLE, inheritable...) + + ambient, err := trimCaps(caps.Ambient, setter) + if err != nil { + return err + } + setter.Set(capability.AMBIENT, ambient...) + + return setter.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS) +} + +func trimCaps(names []string, setter capability.Capabilities) ([]capability.Cap, error) { + wantedCaps, err := capsFromNames(names) + if err != nil { + return nil, err + } + + // Trim down capabilities that aren't possible to acquire. + var caps []capability.Cap + for _, c := range wantedCaps { + // Capability rules are more complicated than this, but this catches most + // problems with tests running with non-priviledged user. 
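+		// Only capabilities already present in the permitted set are kept; the
+		// rest are logged and dropped instead of failing the whole command.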
+ if setter.Get(capability.PERMITTED, c) { + caps = append(caps, c) + } else { + log.Warningf("Capability %q is not permitted, dropping it.", c) + } + } + return caps, nil +} + +func capsFromNames(names []string) ([]capability.Cap, error) { + var caps []capability.Cap + for _, name := range names { + cap, ok := capFromName[name] + if !ok { + return nil, fmt.Errorf("invalid capability %q", name) + } + caps = append(caps, cap) + } + return caps, nil +} + +var capFromName = map[string]capability.Cap{ + "CAP_CHOWN": capability.CAP_CHOWN, + "CAP_DAC_OVERRIDE": capability.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": capability.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": capability.CAP_FOWNER, + "CAP_FSETID": capability.CAP_FSETID, + "CAP_KILL": capability.CAP_KILL, + "CAP_SETGID": capability.CAP_SETGID, + "CAP_SETUID": capability.CAP_SETUID, + "CAP_SETPCAP": capability.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE, + "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST, + "CAP_NET_ADMIN": capability.CAP_NET_ADMIN, + "CAP_NET_RAW": capability.CAP_NET_RAW, + "CAP_IPC_LOCK": capability.CAP_IPC_LOCK, + "CAP_IPC_OWNER": capability.CAP_IPC_OWNER, + "CAP_SYS_MODULE": capability.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": capability.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": capability.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": capability.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": capability.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": capability.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": capability.CAP_SYS_BOOT, + "CAP_SYS_NICE": capability.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": capability.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": capability.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": capability.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": capability.CAP_MKNOD, + "CAP_LEASE": capability.CAP_LEASE, + "CAP_AUDIT_WRITE": capability.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": capability.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": capability.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": capability.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": capability.CAP_MAC_ADMIN, + "CAP_SYSLOG": capability.CAP_SYSLOG, + "CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND, + "CAP_AUDIT_READ": capability.CAP_AUDIT_READ, +} diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index 9f7fd6e25..940c8cd14 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -18,9 +18,13 @@ package cmd import ( "fmt" "os" + "runtime" "strconv" + "syscall" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) // Fatalf logs to stderr and exits with a failure status code. @@ -64,3 +68,25 @@ func (i *intFlags) Set(s string) error { *i = append(*i, fd) return nil } + +// setCapsAndCallSelf sets capabilities to the current thread and then execve's +// itself again with the arguments specified in 'args' to restart the process +// with the desired capabilities. +func setCapsAndCallSelf(spec *specs.Spec, args []string, caps *specs.LinuxCapabilities) error { + // Keep thread locked while capabilities are changed. 
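+	// Capability sets are per-thread on Linux, so the execve below must run
+	// on the same OS thread whose capabilities were just changed.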
+ runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if err := applyCaps(caps); err != nil { + return fmt.Errorf("applyCaps() failed: %v", err) + } + binPath, err := specutils.BinPath() + if err != nil { + return err + } + + log.Infof("Capabilities applied: %+v", caps) + log.Infof("Execve %q again, bye!", binPath) + syscall.Exec(binPath, args, []string{}) + panic("unreachable") +} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 844e16dbf..39803bde5 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -15,11 +15,13 @@ package cmd import ( + "os" "sync" "context" "flag" "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/unet" @@ -32,6 +34,7 @@ import ( type Gofer struct { bundleDir string ioFDs intFlags + applyCaps bool } // Name implements subcommands.Command. @@ -53,6 +56,7 @@ func (*Gofer) Usage() string { func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") + f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") } // Execute implements subcommands.Command. @@ -66,6 +70,32 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error reading spec: %v", err) } + + if g.applyCaps { + // Minimal set of capabilities needed by the Gofer to operate on files. + caps := []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_DAC_READ_SEARCH", + "CAP_FOWNER", + "CAP_FSETID", + } + lc := &specs.LinuxCapabilities{ + Bounding: caps, + Effective: caps, + Permitted: caps, + } + + // Disable caps when calling myself again. + // Note: minimal argument handling for the default case to keep it simple. + args := os.Args + args = append(args, "--apply-caps=false") + if err := setCapsAndCallSelf(spec, args, lc); err != nil { + Fatalf("Unable to apply caps: %v", err) + } + panic("unreachable") + } + specutils.LogSpec(spec) // Start with root mount, then add any other addition mount as needed. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2a434cfb7..48388aa7f 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -295,23 +295,23 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // process. IPC and UTS namespaces from the host are not used as they // are virtualized inside the sandbox. Be paranoid and run inside an empty // namespace for these. - log.Infof("Sandbox will be started in empty IPC and UTS namespaces") + log.Infof("Sandbox will be started in new IPC and UTS namespaces") nss := []specs.LinuxNamespace{ {Type: specs.IPCNamespace}, {Type: specs.UTSNamespace}, } if conf.Platform == boot.PlatformPtrace { - // TODO: Also set an empty PID namespace so that we limit + // TODO: Also set a new PID namespace so that we limit // access to other host processes. 
log.Infof("Sandbox will be started in the current PID namespace") } else { - log.Infof("Sandbox will be started in empty PID namespace") + log.Infof("Sandbox will be started in a new PID namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) } if conf.FileAccess == boot.FileAccessProxy { - log.Infof("Sandbox will be started in empty mount namespace") + log.Infof("Sandbox will be started in new mount namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) } else { log.Infof("Sandbox will be started in the current mount namespace") @@ -324,7 +324,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) } else { - log.Infof("Sandbox will be started in empty network namespace") + log.Infof("Sandbox will be started in new network namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) } @@ -347,7 +347,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args = append(cmd.Args, "--apply-caps=true") } else { - log.Infof("Sandbox will be started in empty user namespace") + log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) } -- cgit v1.2.3 From 48335318a23f4f536c395e602c0cd338c4c4e890 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 12 Jun 2018 10:24:56 -0700 Subject: Enable debug logging in tests Unit tests call runsc directly now, so all command line arguments are valid. On the other hand, enabling debug in the test binary doesn't affect runsc. It needs to be set in the config. PiperOrigin-RevId: 200237706 Change-Id: I0b5922db17f887f58192dbc2f8dd2fd058b76ec7 --- runsc/boot/config.go | 8 -------- runsc/test/testutil/testutil.go | 12 ++++++------ 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index d5dd400d1..c13ac150d 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -176,18 +176,10 @@ type Config struct { // DisableSeccomp indicates whether seccomp syscall filters should be // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool - - // TestModeNoFlags indicates that the ToFlags method should return - // empty. This should only be used in tests, since the test runner does - // not know about all the flags. - TestModeNoFlags bool } // ToFlags returns a slice of flags that correspond to the given Config. func (c *Config) ToFlags() []string { - if c.TestModeNoFlags { - return nil - } return []string{ "--root=" + c.RootDir, "--debug=" + strconv.FormatBool(c.Debug), diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 1c8fd3ba2..9be4407e0 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -117,12 +117,12 @@ func SetupContainerInRoot(rootDir string, spec *specs.Spec) (bundleDir string, c } conf = &boot.Config{ - RootDir: rootDir, - Network: boot.NetworkNone, - // Don't add flags when calling subprocesses, since the test - // runner does not know about all the flags. We control the - // Config in the subprocess anyways, so it does not matter. 
- TestModeNoFlags: true, + Debug: true, + LogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + RootDir: rootDir, + Strace: true, } return bundleDir, conf, nil -- cgit v1.2.3 From 2dc9cd7bf73d971a37fa22b52a70961f27f6c970 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 12 Jun 2018 11:02:35 -0700 Subject: runsc: enable terminals in the sandbox. runsc now mounts the devpts filesystem, so you get a real terminal using ssh+sshd. PiperOrigin-RevId: 200244830 Change-Id: If577c805ad0138fda13103210fa47178d8ac6605 --- runsc/boot/BUILD | 1 + runsc/boot/fs.go | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 73893d699..1a81acde5 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -35,6 +35,7 @@ go_library( "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/sys", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/fs/tty", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 82bbea4d7..28c3e8cd0 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -27,6 +27,7 @@ import ( _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -109,6 +110,14 @@ func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *f return err } + // Always mount /dev/pts. + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "devpts", + Destination: "/dev/pts", + }); err != nil { + return err + } + // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. if !procMounted { @@ -214,7 +223,7 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. var fsName string var useOverlay bool switch m.Type { - case "proc", "sysfs", "devtmpfs": + case "devpts", "devtmpfs", "proc", "sysfs": fsName = m.Type case "none": fsName = "sysfs" -- cgit v1.2.3 From 711a9869e54743b05fc3478be5adce31d45cefe5 Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Tue, 12 Jun 2018 13:24:22 -0700 Subject: Runsc checkpoint works. This is the first iteration of checkpoint that actually saves to a file. Tests for checkpoint are included. Ran into an issue when private unix sockets are enabled. An error message was added for this case and the mutex state was set. PiperOrigin-RevId: 200269470 Change-Id: I28d29a9f92c44bf73dc4a4b12ae0509ee4070e93 --- pkg/sentry/fs/gofer/session.go | 5 ++-- pkg/sentry/fs/gofer/session_state.go | 9 ++++++ runsc/boot/loader.go | 4 ++- runsc/cmd/checkpoint.go | 17 +++++++++++- runsc/container/container.go | 5 ++-- runsc/container/container_test.go | 53 +++++++++++++++++++++++++++++++++++- runsc/sandbox/sandbox.go | 11 ++++++-- 7 files changed, 95 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 1076e3e55..baf00d8e7 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -28,8 +28,9 @@ import ( ) type endpointMap struct { - mu sync.RWMutex - m map[device.MultiDeviceKey]unix.BoundEndpoint + mu sync.RWMutex `state:"nosave"` + // TODO: Make map with private unix sockets savable. + m map[device.MultiDeviceKey]unix.BoundEndpoint } // add adds the endpoint to the map. 
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 4d993a219..0154810c8 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -22,6 +22,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) +// beforeSave is invoked by stateify. +// +// TODO: Make map with private unix sockets savable. +func (e *endpointMap) beforeSave() { + if len(e.m) != 0 { + panic("EndpointMap with existing private unix sockets cannot be saved") + } +} + // afterLoad is invoked by stateify. func (s *session) afterLoad() { // The restore environment contains the 9p connection of this mount. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 41d1ee50d..4a6528307 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -100,7 +100,9 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // Create VDSO. - vdso, err := loader.PrepareVDSO(p) + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("error creating vdso: %v", err) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 9b045da1c..927027c2b 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -15,6 +15,8 @@ package cmd import ( + "os" + "context" "flag" "github.com/google/subcommands" @@ -24,6 +26,7 @@ import ( // Checkpoint implements subcommands.Command for the "checkpoint" command. type Checkpoint struct { + imagePath string } // Name implements subcommands.Command.Name. @@ -44,6 +47,7 @@ func (*Checkpoint) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (c *Checkpoint) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.imagePath, "image-path", "", "path to saved container image") } // Execute implements subcommands.Command.Execute. @@ -62,7 +66,18 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("error loading container: %v", err) } - if err := cont.Checkpoint(); err != nil { + if c.imagePath == "" { + Fatalf("image-path flag must be provided") + } + + // Create the image file and open for writing. + file, err := os.OpenFile(c.imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + Fatalf("os.OpenFile(%q) failed: %v", c.imagePath, err) + } + defer file.Close() + + if err := cont.Checkpoint(file); err != nil { Fatalf("checkpoint failed: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 66a2f27a1..d323388fb 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -339,13 +339,14 @@ func (c *Container) Signal(sig syscall.Signal) error { } // Checkpoint sends the checkpoint call to the container. -func (c *Container) Checkpoint() error { +// The statefile will be written to f, the file at the specified image-path. +func (c *Container) Checkpoint(f *os.File) error { log.Debugf("Checkpoint container %q", c.ID) if c.Status == Stopped { log.Warningf("container %q not running, not checkpointing", c.ID) return nil } - return c.Sandbox.Checkpoint(c.ID) + return c.Sandbox.Checkpoint(c.ID, f) } // State returns the metadata of the container. 
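Putting the pieces together, the flow behind "runsc checkpoint --image-path=..." now looks roughly like this (a fragment mirroring cmd/checkpoint.go above; id, conf and imagePath come from the command line):

    // Resolve the (possibly abbreviated) container id and load its metadata.
    cont, err := container.Load(conf.RootDir, id)
    if err != nil {
            return fmt.Errorf("error loading container: %v", err)
    }
    // The image file must not already exist; the state is streamed into it.
    file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
    if err != nil {
            return fmt.Errorf("os.OpenFile(%q) failed: %v", imagePath, err)
    }
    defer file.Close()
    if err := cont.Checkpoint(file); err != nil {
            return fmt.Errorf("checkpoint failed: %v", err)
    }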
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 43cd177ce..b6d19bf33 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -408,6 +408,57 @@ func TestExec(t *testing.T) { } } +// TestCheckpoint verifies that calling checkpoint with an image-path flag succeeds. +// Since there is no current default image path, confirming that calling +// checkpoint without an image path fails. +// Checks that there is a file with the name and location given by image path. +func TestCheckpoint(t *testing.T) { + // Container will succeed. + spec := testutil.NewSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Set the image path, which is where the checkpoint image will be saved. + imagePath := filepath.Join(os.TempDir(), "test-image-file") + + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() + + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + defer os.RemoveAll(imagePath) + + // Check to see if file exists and contains data. + fileInfo, err := os.Stat(imagePath) + if err != nil { + t.Fatalf("error checkpointing container: %v", err) + } + if size := fileInfo.Size(); size == 0 { + t.Fatalf("failed checkpoint, file still appears empty: %v", err) + } +} + // TestCapabilities verifies that: // - Running exec as non-root UID and GID will result in an error (because the // executable file can't be read). @@ -602,7 +653,7 @@ func TestSpecUnsupported(t *testing.T) { } // TestRunNonRoot checks that sandbox can be configured when running as -// non-priviledged user. +// non-privileged user. func TestRunNonRoot(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/true") spec.Process.User.UID = 343 diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 48388aa7f..c1efab7f5 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -441,7 +441,8 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { } // Checkpoint sends the checkpoint call for a container in the sandbox. -func (s *Sandbox) Checkpoint(cid string) error { +// The statefile will be written to f. 
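+// The file descriptor is passed to the sentry as a urpc file payload on the
+// ContainerCheckpoint call.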
+func (s *Sandbox) Checkpoint(cid string, f *os.File) error { log.Debugf("Checkpoint sandbox %q", s.ID) conn, err := s.connect() if err != nil { @@ -449,7 +450,13 @@ func (s *Sandbox) Checkpoint(cid string) error { } defer conn.Close() - if err := conn.Call(boot.ContainerCheckpoint, nil, nil); err != nil { + opt := control.SaveOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + + if err := conn.Call(boot.ContainerCheckpoint, &opt, nil); err != nil { return fmt.Errorf("err checkpointing container %q: %v", cid, err) } return nil -- cgit v1.2.3 From 2506b9b11f4e20bf4895f6eb59334ea5cb7d20e8 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Tue, 12 Jun 2018 13:54:02 -0700 Subject: runsc: do not include sub target if it is not started with '/'. PiperOrigin-RevId: 200274828 Change-Id: I956703217df08d8650a881479b7ade8f9f119912 --- runsc/boot/fs.go | 3 +++ runsc/boot/loader_test.go | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 28c3e8cd0..7243153f2 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -445,6 +445,9 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string // children of the given root. The returned paths are relative to the root. func subtargets(root string, mnts []specs.Mount) []string { r := filepath.Clean(root) + if len(r) > 0 && r[len(r)-1] != '/' { + r += "/" + } var targets []string for _, mnt := range mnts { t := filepath.Clean(mnt.Destination) diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 5bc6f1646..3ce7855f6 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -15,6 +15,7 @@ package boot import ( + "io/ioutil" "os" "sync" "testing" @@ -150,6 +151,12 @@ func TestCreateMountNamespace(t *testing.T) { DisableSeccomp: true, } + testFile, err := ioutil.TempFile(os.TempDir(), "create-mount-namespace-") + if err != nil { + t.Fatalf("ioutil.TempFile() failed, err: %v", err) + } + defer os.RemoveAll(testFile.Name()) + testCases := []struct { name string // Spec that will be used to create the mount manager. Note @@ -202,7 +209,7 @@ func TestCreateMountNamespace(t *testing.T) { expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, { - // Mounts are nested inside eachother. + // Mounts are nested inside each other. name: "nested mounts", spec: specs.Spec{ Root: &specs.Root{ @@ -218,6 +225,16 @@ func TestCreateMountNamespace(t *testing.T) { Destination: "/foo", Type: "tmpfs", }, + { + Destination: "/foo/qux", + Source: testFile.Name(), + Type: "bind", + }, + { + // File mounts with the same prefix. + Destination: "/foo/qux-quz", + Type: "tmpfs", + }, { Destination: "/foo/bar", Type: "tmpfs", @@ -233,7 +250,8 @@ func TestCreateMountNamespace(t *testing.T) { }, }, }, - expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", + "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, } -- cgit v1.2.3 From 717f2501c9c4cec4e4fb6c76d49779d899f024ae Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 13 Jun 2018 10:19:03 -0700 Subject: Fix failure to mount volume that sandbox process has no access Boot loader tries to stat mount to determine whether it's a file or not. This may file if the sandbox process doesn't have access to the file. 
Instead, add overlay on top of file, which is better anyway since we don't want to propagate changes to the host. PiperOrigin-RevId: 200411261 Change-Id: I14222410e8bc00ed037b779a1883d503843ffebb --- pkg/sentry/fs/overlay.go | 22 ++++++++++++++++++++++ runsc/boot/fs.go | 16 +++++++++------- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 40eed3feb..90d21642e 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -103,6 +103,28 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount return newOverlayInode(ctx, overlay, msrc), nil } +// NewOverlayRootFile produces the root of an overlay that points to a file. +// +// Preconditions: +// +// - lower must be non-nil. +// - lower should not expose character devices, pipes, or sockets, because +// copying up these types of files is not supported. Neither it can be a dir. +// - lower must not require that file objects be revalidated. +// - lower must not have dynamic file/directory content. +func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) { + if IsRegular(lower.StableAttr) { + return nil, fmt.Errorf("lower Inode is not a regular file") + } + msrc := newOverlayMountSource(upperMS, lower.MountSource, flags) + overlay, err := newOverlayEntry(ctx, nil, lower, true) + if err != nil { + msrc.DecRef() + return nil, err + } + return newOverlayInode(ctx, overlay, msrc), nil +} + // newOverlayInode creates a new Inode for an overlay. func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *Inode { var inode *Inode diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 7243153f2..3113f1857 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "os" "path/filepath" "strings" @@ -209,6 +208,13 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags.ReadOnly = false tmpFS := mustFindFilesystem("tmpfs") + if !fs.IsDir(lower.StableAttr) { + // Create overlay on top of mount file, e.g. /etc/hostname. + msrc := fs.NewCachingMountSource(tmpFS, lowerFlags) + return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags) + } + + // Create overlay on top of mount dir. upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") if err != nil { return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err) @@ -248,13 +254,9 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. default: return fmt.Errorf("invalid file access type: %v", conf.FileAccess) } + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly - fi, err := os.Stat(m.Source) - if err != nil { - return err - } - // Add overlay to all writable mounts, except when mapping an individual file. - useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly && fi.Mode().IsDir() default: // TODO: Support all the mount types and make this a // fatal error. Most applications will "just work" without -- cgit v1.2.3 From d71f5ef6885b9c241018308944e4b2e4b4857029 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 14 Jun 2018 10:10:09 -0700 Subject: Add nanosleep filter for Go 1.11 support golang.org/cl/108538 replaces pselect6 with nanosleep in runtime.usleep. Update the filters accordingly. 
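The mechanism behind this filter update is Go build constraints: two mutually exclusive files are compiled depending on the toolchain version, and each contributes the sleep syscall that that runtime actually issues. A condensed sketch of the pattern follows; the package and identifier names are illustrative only (they are not the runsc sources), and the syscall constants assume linux/amd64.

// rules.go
package filterdemo

import "syscall"

// allowed collects the syscalls permitted for this build; the
// version-specific files below add to it from their init functions.
var allowed = map[int]bool{
	syscall.SYS_CLOCK_GETTIME: true,
}

// rules_go111.go (selected only when building with Go 1.11 or newer)

// +build go1.11

package filterdemo

import "syscall"

// Go 1.11's runtime.usleep issues nanosleep(2).
func init() { allowed[syscall.SYS_NANOSLEEP] = true }

// rules_go110.go (selected only with older toolchains)

// +build !go1.11

package filterdemo

import "syscall"

// Older runtimes issue pselect6(2) from runtime.usleep.
func init() { allowed[syscall.SYS_PSELECT6] = true }

Because exactly one of the two tagged files is ever compiled in, the shared rule set always matches the syscall the running binary will make, without any runtime version checks.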
PiperOrigin-RevId: 200574612 Change-Id: Ifb2296fcb3781518fc047aabbbffedb9ae488cd7 --- runsc/boot/filter/BUILD | 2 ++ runsc/boot/filter/config.go | 1 - runsc/boot/filter/config_go110.go | 30 ++++++++++++++++++++++++++++++ runsc/boot/filter/config_go111.go | 27 +++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 runsc/boot/filter/config_go110.go create mode 100644 runsc/boot/filter/config_go111.go (limited to 'runsc') diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index fd1b18717..c9837c236 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -6,6 +6,8 @@ go_library( name = "filter", srcs = [ "config.go", + "config_go110.go", + "config_go111.go", "extra_filters.go", "extra_filters_msan.go", "extra_filters_race.go", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 86c256c5b..4e286c5da 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -61,7 +61,6 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_NEWFSTATAT: {}, syscall.SYS_POLL: {}, syscall.SYS_PREAD64: {}, - syscall.SYS_PSELECT6: {}, syscall.SYS_PWRITE64: {}, syscall.SYS_READ: {}, syscall.SYS_READLINKAT: {}, diff --git a/runsc/boot/filter/config_go110.go b/runsc/boot/filter/config_go110.go new file mode 100644 index 000000000..f4feb4ce4 --- /dev/null +++ b/runsc/boot/filter/config_go110.go @@ -0,0 +1,30 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !go1.11 + +package filter + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + +// TODO: Remove this file and merge config_go111.go back into +// config.go once we no longer build with Go 1.10. + +func init() { + allowedSyscalls[syscall.SYS_PSELECT6] = []seccomp.Rule{} +} diff --git a/runsc/boot/filter/config_go111.go b/runsc/boot/filter/config_go111.go new file mode 100644 index 000000000..f5eb2c3c8 --- /dev/null +++ b/runsc/boot/filter/config_go111.go @@ -0,0 +1,27 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.11 + +package filter + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_NANOSLEEP] = []seccomp.Rule{} +} -- cgit v1.2.3 From bd1e83ff60166703153443f379987d45edb0ad8f Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Thu, 14 Jun 2018 15:44:08 -0700 Subject: Fix typo. 
PiperOrigin-RevId: 200631795 Change-Id: I297fe3e30fb06b04fccd8358c933e45019dcc1fa --- runsc/container/container.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index d323388fb..40c31ca7f 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -54,12 +54,12 @@ func validateID(id string) error { // // Container metadata can be saved and loaded to disk. Within a root directory, // we maintain subdirectories for each container named with the container id. -// The container metadata is is stored as json within the container directory -// in a file named "meta.json". This metadata format is defined by us, and is +// The container metadata is stored as a json within the container directory +// in a file named "meta.json". This metadata format is defined by us and is // not part of the OCI spec. // -// Containers must write their metadata file after any change to their internal -// state. The entire container directory is deleted when the container is +// Containers must write their metadata files after any change to their internal +// states. The entire container directory is deleted when the container is // destroyed. type Container struct { // ID is the container ID. @@ -96,7 +96,7 @@ type Container struct { // Load loads a container with the given id from a metadata file. id may be an // abbreviation of the full container id, in which case Load loads the // container to which id unambiguously refers to. -// Returns ErrNotExist if container doesn't exits. +// Returns ErrNotExist if container doesn't exist. func Load(rootDir, id string) (*Container, error) { log.Debugf("Load container %q %q", rootDir, id) if err := validateID(id); err != nil { @@ -214,7 +214,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo Owner: os.Getenv("USER"), } - // TODO: If the metadata annotations indicates that this + // TODO: If the metadata annotations indicate that this // container should be started in another sandbox, we must do so. The // metadata will indicate the ID of the sandbox, which is the same as // the ID of the init container in the sandbox. We can look up that @@ -258,7 +258,7 @@ func (c *Container) Start(conf *boot.Config) error { } // "If any prestart hook fails, the runtime MUST generate an error, - // stop and destroy the container". + // stop and destroy the container" -OCI spec. if c.Spec.Hooks != nil { if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { c.Destroy() @@ -273,7 +273,7 @@ func (c *Container) Start(conf *boot.Config) error { // "If any poststart hook fails, the runtime MUST log a warning, but // the remaining hooks and lifecycle continue as if the hook had - // succeeded". + // succeeded" -OCI spec. if c.Spec.Hooks != nil { executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) } @@ -379,7 +379,7 @@ func (c *Container) Destroy() error { } // "If any poststop hook fails, the runtime MUST log a warning, but the - // remaining hooks and lifecycle continue as if the hook had succeeded". + // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. 
if c.Spec.Hooks != nil && (c.Status == Created || c.Status == Running) { executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } -- cgit v1.2.3 From ef5dd4df9b65fb98d952b83baa736c14b2627fe7 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 15 Jun 2018 09:17:40 -0700 Subject: Set kernel.applicationCores to the number of processor on the host The right number to use is the number of processors assigned to the cgroup. But until we make the sandbox join the respective cgroup, just use the number of processors on the host. Closes #65, closes #66 PiperOrigin-RevId: 200725483 Change-Id: I34a566b1a872e26c66f56fa6e3100f42aaf802b1 --- runsc/boot/loader.go | 10 ++++++---- runsc/boot/loader_test.go | 5 ++++- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 4a6528307..89300a953 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -18,6 +18,7 @@ package boot import ( "fmt" "math/rand" + "runtime" "sync/atomic" "syscall" gtime "time" @@ -171,10 +172,11 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console Timekeeper: tk, RootUserNamespace: creds.UserNamespace, NetworkStack: networkStack, - ApplicationCores: 8, - Vdso: vdso, - RootUTSNamespace: utsns, - RootIPCNamespace: ipcns, + // TODO: use number of logical processors from cgroups. + ApplicationCores: uint(runtime.NumCPU()), + Vdso: vdso, + RootUTSNamespace: utsns, + RootIPCNamespace: ipcns, }); err != nil { return nil, fmt.Errorf("error initializing kernel: %v", err) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 3ce7855f6..ca78c2cd6 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -15,7 +15,9 @@ package boot import ( + "fmt" "io/ioutil" + "math/rand" "os" "sync" "testing" @@ -29,6 +31,7 @@ import ( func init() { log.SetLevel(log.Debug) + rand.Seed(time.Now().UnixNano()) } // testSpec returns a simple spec that can be used in tests. @@ -46,7 +49,7 @@ func testSpec() *specs.Spec { } func createLoader() (*Loader, error) { - fd, err := server.CreateSocket(ControlSocketAddr("123")) + fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) if err != nil { return nil, err } -- cgit v1.2.3 From 52110bfc33f294805d89b82385911ab06b9330ba Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 15 Jun 2018 11:05:10 -0700 Subject: runsc/cmd: fix kill signal parsing Signal is arg 1, not 2. Killing with SIGABRT is useful to get Go traces. Signed-off-by: Dmitry Vyukov Change-Id: I0b78e34a9de3fb3385108e26fdb4ff6e9347aeff PiperOrigin-RevId: 200742743 --- runsc/cmd/kill.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 97a505fac..0979b002b 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -72,7 +72,7 @@ func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su // The OCI command-line spec says that the signal should be specified // via a flag, but runc (and things that call runc) pass it as an // argument. - signal := f.Arg(2) + signal := f.Arg(1) if signal == "" { signal = "TERM" } -- cgit v1.2.3 From 2081c5e7f73eadb2ec84640d4b03f4eb1881950e Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Fri, 15 Jun 2018 13:57:29 -0700 Subject: runsc: support /dev bind mount which does not conflict with default /dev mount. 
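The change below boils down to a path comparison: each mount's destination is cleaned, and it is rejected only if it is /dev itself or sits under one of the device paths the sentry already provides; anything else under /dev, including names that merely share a prefix with a built-in device, is honored. A standalone sketch of that check, with an illustrative reserved list rather than the real one:

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// reserved stands in for the device paths the sandbox provides on its own.
var reserved = []string{"/dev/fd", "/dev/null", "/dev/zero", "/dev/pts", "/dev/ptmx"}

// allowedDevMount reports whether a user-supplied destination can be mounted
// without clobbering the built-in /dev tree.
func allowedDevMount(dest string) bool {
	d := filepath.Clean(dest)
	if d == "/dev" {
		// The runtime always mounts /dev itself, as the OCI spec recommends.
		return false
	}
	for _, r := range reserved {
		if d == r || strings.HasPrefix(d, r+"/") {
			return false
		}
	}
	return true
}

func main() {
	for _, d := range []string{"/dev", "/dev/fd", "/dev/fd-foo", "/dev/pts/0", "/dev/foo"} {
		fmt.Printf("%-12s allowed=%t\n", d, allowedDevMount(d))
	}
}

Note how "/dev/fd-foo" is allowed while "/dev/fd" and "/dev/pts/0" are not: the prefix test only matches true path children, which is the same distinction exercised by the loader test added in this commit.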
PiperOrigin-RevId: 200768923 Change-Id: I4b8da10bcac296e8171fe6754abec5aabfec5e65 --- runsc/boot/fs.go | 59 +++++++++++++++++++------------------------- runsc/boot/loader_test.go | 45 +++++++++++++++++++++++++++++++++ runsc/specutils/specutils.go | 38 +++++++++++++++++++++++++++- 3 files changed, 108 insertions(+), 34 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 3113f1857..7786e4d4a 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -35,6 +35,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) type fdDispenser struct { @@ -78,16 +79,29 @@ func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *f // Keep track of whether proc, sys, and tmp were mounted. var procMounted, sysMounted, tmpMounted bool + // Always mount /dev. + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "devtmpfs", + Destination: "/dev", + }); err != nil { + return err + } + + // Always mount /dev/pts. + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "devpts", + Destination: "/dev/pts", + }); err != nil { + return err + } + // Mount all submounts from the spec. for _, m := range spec.Mounts { - // OCI spec uses many different mounts for the things inside of '/dev'. We - // have a single mount at '/dev' that is always mounted, regardless of - // whether it was asked for, as the spec says we SHOULD. - if strings.HasPrefix(m.Destination, "/dev") { + if !specutils.IsSupportedDevMount(m) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } - switch m.Destination { + switch filepath.Clean(m.Destination) { case "/proc": procMounted = true case "/sys": @@ -101,22 +115,6 @@ func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *f } } - // Always mount /dev. - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ - Type: "devtmpfs", - Destination: "/dev", - }); err != nil { - return err - } - - // Always mount /dev/pts. - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ - Type: "devpts", - Destination: "/dev/pts", - }); err != nil { - return err - } - // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. if !procMounted { @@ -282,18 +280,13 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. // If there are submounts, we need to overlay the mount on top of a // ramfs with stub directories for submount paths. - // - // We do not do this for /dev, since there will usually be submounts in - // the spec, but our devfs implementation contains all the necessary - // directories and files (well, most of them anyways). 
- if m.Destination != "/dev" { - submounts := subtargets(m.Destination, spec.Mounts) - if len(submounts) > 0 { - log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) - if err != nil { - return fmt.Errorf("error adding submount overlay: %v", err) - } + mounts := specutils.SupportedMounts(spec.Mounts) + submounts := subtargets(m.Destination, mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("error adding submount overlay: %v", err) } } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index ca78c2cd6..a7f59f775 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -256,6 +256,51 @@ func TestCreateMountNamespace(t *testing.T) { expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, + { + name: "mount inside /dev", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/dev", + Type: "tmpfs", + }, + { + // Mounted by runsc by default. + Destination: "/dev/fd", + Type: "tmpfs", + }, + { + // Mount with the same prefix. + Destination: "/dev/fd-foo", + Source: testFile.Name(), + Type: "bind", + }, + { + // Unsupported fs type. + Destination: "/dev/mqueue", + Type: "mqueue", + }, + { + Destination: "/dev/foo", + Type: "tmpfs", + }, + { + Destination: "/dev/bar", + Source: testFile.Name(), + Type: "bind", + }, + }, + }, + expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, + }, } for _, tc := range testCases { diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 3161360b4..0bb462eb5 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -195,7 +195,43 @@ func capsFromNames(names []string) (auth.CapabilitySet, error) { // Is9PMount returns true if the given mount can be mounted as an external gofer. func Is9PMount(m specs.Mount) bool { - return m.Type == "bind" && m.Source != "" && !strings.HasPrefix(m.Destination, "/dev") + return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m) +} + +// IsSupportedDevMount returns true if the mount is a supported /dev mount. +// Only mount that does not conflict with runsc default /dev mount is +// supported. +func IsSupportedDevMount(m specs.Mount) bool { + // These are devices exist inside sentry. See pkg/sentry/fs/dev/dev.go + var existingDevices = []string{ + "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr", + "/dev/null", "/dev/zero", "/dev/full", "/dev/random", + "/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx", + } + dst := filepath.Clean(m.Destination) + if dst == "/dev" { + // OCI spec uses many different mounts for the things inside of '/dev'. We + // have a single mount at '/dev' that is always mounted, regardless of + // whether it was asked for, as the spec says we SHOULD. + return false + } + for _, dev := range existingDevices { + if dst == dev || strings.HasPrefix(dst, dev+"/") { + return false + } + } + return true +} + +// SupportedMounts filters out unsupported mounts. 
+func SupportedMounts(mounts []specs.Mount) []specs.Mount { + var newMounts []specs.Mount + for _, m := range mounts { + if IsSupportedDevMount(m) { + newMounts = append(newMounts, m) + } + } + return newMounts } // BinPath returns the real path to self, resolving symbolink links. This is done -- cgit v1.2.3 From 437890dc4b6987a64ac98766c752ce64091757dc Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 15 Jun 2018 14:07:00 -0700 Subject: runsc: Make gofer logs show up in test output. PiperOrigin-RevId: 200770591 Change-Id: Ifc096d88615b63135210d93c2b4cee2eaecf1eee --- runsc/sandbox/sandbox.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index c1efab7f5..1c0d23161 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -209,6 +209,8 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle } cmd := exec.Command(binPath, args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr cmd.ExtraFiles = goferEnds // Setup any uid/gid mappings, and create or join the configured user -- cgit v1.2.3 From 0786707cd94b8feffaeb083077eccaf10873e682 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 15 Jun 2018 16:08:20 -0700 Subject: Added code for a pause command for a container process. Like runc, the pause command will pause the processes of the given container. It will set that container's status to "paused." A resume command will be be added to unpause and continue running the process. PiperOrigin-RevId: 200789624 Change-Id: I72a5d7813d90ecfc4d01cc252d6018855016b1ea --- runsc/boot/controller.go | 9 ++++++ runsc/cmd/BUILD | 1 + runsc/cmd/pause.go | 67 +++++++++++++++++++++++++++++++++++++++ runsc/container/container.go | 19 ++++++++++- runsc/container/container_test.go | 34 ++++++++++++++++++++ runsc/container/status.go | 18 +++++++---- runsc/sandbox/sandbox.go | 15 +++++++++ 7 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 runsc/cmd/pause.go (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 095b0a9b9..564f2d271 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -37,6 +37,9 @@ const ( // container.. ContainerExecute = "containerManager.Execute" + // ContainerPause pauses the container. + ContainerPause = "containerManager.Pause" + // ContainerProcesses is the URPC endpoint for getting the list of // processes running in a container. ContainerProcesses = "containerManager.Processes" @@ -153,6 +156,12 @@ func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { return state.Save(o, nil) } +// Pause suspends the process in a container. +func (cm *containerManager) Pause(_, _ *struct{}) error { + cm.k.Pause() + return nil +} + // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { // TODO: Use the cid and wait on the init process in that diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 63d8036bd..8fbce294f 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -17,6 +17,7 @@ go_library( "kill.go", "list.go", "path.go", + "pause.go", "ps.go", "restore.go", "run.go", diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go new file mode 100644 index 000000000..ac393b48e --- /dev/null +++ b/runsc/cmd/pause.go @@ -0,0 +1,67 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" +) + +// Pause implements subcommands.Command for the "pause" command. +type Pause struct{} + +// Name implements subcommands.Command.Name. +func (*Pause) Name() string { + return "pause" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Pause) Synopsis() string { + return "pause suspends all processes in a container" +} + +// Usage implements subcommands.Command.Usage. +func (*Pause) Usage() string { + return `pause - pause process in instance of container.` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*Pause) SetFlags(f *flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. +func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + cont, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading container: %v", err) + } + + if err := cont.Pause(); err != nil { + Fatalf("pause failed: %v", err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/container/container.go b/runsc/container/container.go index 40c31ca7f..dc7fccdee 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -316,7 +316,7 @@ func (c *Container) Event() (*boot.Event, error) { // Pid returns the Pid of the sandbox the container is running in, or -1 if the // container is not running. func (c *Container) Pid() int { - if c.Status != Running && c.Status != Created { + if c.Status != Running && c.Status != Created && c.Status != Paused { return -1 } return c.Sandbox.Pid @@ -349,6 +349,23 @@ func (c *Container) Checkpoint(f *os.File) error { return c.Sandbox.Checkpoint(c.ID, f) } +// Pause suspends the container and its kernel. +// The call only succeeds if the container's status is created or running. +func (c *Container) Pause() error { + log.Debugf("Pausing container %q", c.ID) + switch c.Status { + case Created, Running: + if err := c.Sandbox.Pause(c.ID); err != nil { + return fmt.Errorf("error pausing container: %v", err) + } + c.Status = Paused + return c.save() + default: + log.Warningf("container %q not created or running, not pausing", c.ID) + return nil + } +} + // State returns the metadata of the container. func (c *Container) State() specs.State { return specs.State{ diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index b6d19bf33..5659abab3 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -459,6 +459,40 @@ func TestCheckpoint(t *testing.T) { } } +// TestPause tests that calling pause successfully pauses the container. +// It checks that no errors are returned and that the state of the container +// is in fact 'Paused.' 
+func TestPause(t *testing.T) { + spec := testutil.NewSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Pause the running container. + if err := cont.Pause(); err != nil { + t.Errorf("error pausing container: %v", err) + } + + // Confirm the status of the container is paused. + if got, want := cont.Status, container.Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } +} + // TestCapabilities verifies that: // - Running exec as non-root UID and GID will result in an error (because the // executable file can't be read). diff --git a/runsc/container/status.go b/runsc/container/status.go index 8da1b4e89..bf177e78a 100644 --- a/runsc/container/status.go +++ b/runsc/container/status.go @@ -19,13 +19,17 @@ package container type Status int const ( - // Creating indicates "the container is being created". - Creating Status = iota - // Created indicates "the runtime has finished the create operation and // the container process has neither exited nor executed the // user-specified program". - Created + Created Status = iota + + // Creating indicates "the container is being created". + Creating + + // Paused indicates that the process within the container has been + // suspended. + Paused // Running indicates "the container process has executed the // user-specified program but has not exited". @@ -39,10 +43,12 @@ const ( // CLI spec and should not be changed. func (s Status) String() string { switch s { - case Creating: - return "creating" case Created: return "created" + case Creating: + return "creating" + case Paused: + return "paused" case Running: return "running" case Stopped: diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 1c0d23161..f9129a179 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -464,6 +464,21 @@ func (s *Sandbox) Checkpoint(cid string, f *os.File) error { return nil } +// Pause sends the pause call for a container in the sandbox. +func (s *Sandbox) Pause(cid string) error { + log.Debugf("Pause sandbox %q", s.ID) + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ContainerPause, nil, nil); err != nil { + return fmt.Errorf("err pausing container %q: %v", cid, err) + } + return nil +} + // IsRunning returns true if the sandbox or gofer process is running. 
func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { -- cgit v1.2.3 From 775982ed4b54230165c3d6e0ab7f4cceaa4860ba Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 18 Jun 2018 09:59:14 -0700 Subject: Automated rollback of changelist 200770591 PiperOrigin-RevId: 201012131 Change-Id: I5cd69e795555129319eb41135ecf26db9a0b1fcb --- runsc/sandbox/sandbox.go | 2 -- 1 file changed, 2 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f9129a179..b008eba1e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -209,8 +209,6 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle } cmd := exec.Command(binPath, args...) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr cmd.ExtraFiles = goferEnds // Setup any uid/gid mappings, and create or join the configured user -- cgit v1.2.3 From 821aaf531d62dc4f66078528901e536524951e3b Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 18 Jun 2018 10:33:06 -0700 Subject: runsc: support "rw" mount option. PiperOrigin-RevId: 201018483 Change-Id: I52fe3d01c83c8a2f0e9275d9d88c37e46fa224a2 --- runsc/boot/fs.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 7786e4d4a..7ebf22de8 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -392,12 +392,14 @@ func mountFlags(opts []string) fs.MountSourceFlags { mf := fs.MountSourceFlags{} for _, o := range opts { switch o { + case "rw": + mf.ReadOnly = false case "ro": mf.ReadOnly = true case "noatime": mf.NoAtime = true default: - log.Warningf("ignorning unknown mount option %q", o) + log.Warningf("ignoring unknown mount option %q", o) } } return mf -- cgit v1.2.3 From f3727528e57ab720fac55553471b31877163cc12 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 18 Jun 2018 13:36:55 -0700 Subject: runsc: support symlink to the exec path. PiperOrigin-RevId: 201049912 Change-Id: Idd937492217a4c2ca3d59c602e41576a3b203dd9 --- runsc/specutils/specutils.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 0bb462eb5..8dae3efb1 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -106,7 +106,9 @@ func GetExecutablePath(exec, root string, env []string) (string, error) { // for. for _, p := range path { abs := filepath.Join(root, p, exec) - if _, err := os.Stat(abs); err == nil { + // Do not follow symlink link because the target is in the container + // root filesystem. + if _, err := os.Lstat(abs); err == nil { // We found it! Return the path relative to the root. return filepath.Join("/", p, exec), nil } -- cgit v1.2.3 From 873ec0c414973e829c1570f21d0d2e2a0df681f4 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Mon, 18 Jun 2018 15:19:36 -0700 Subject: Modified boot.go to allow for restores. A file descriptor was added as a flag to boot so a state file can restore a container that was checkpointed. 
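The integer flag and its -1 sentinel work because the boot process inherits the already-open state file and only needs the descriptor's number to rebuild an *os.File around it. A minimal, self-contained sketch of that hand-off pattern (flag name and file name here are illustrative, not the runsc command line):

package main

import (
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"os"
)

func main() {
	// The parent passes the inherited descriptor's number; -1 means
	// "no state file, boot fresh" rather than "restore".
	restoreFD := flag.Int("restore-fd", -1, "FD of an inherited state file, or -1")
	flag.Parse()

	if *restoreFD == -1 {
		fmt.Println("no restore FD supplied; starting fresh")
		return
	}

	// Wrap the raw descriptor; the name is only used in error messages.
	stateFile := os.NewFile(uintptr(*restoreFD), "restore_file")
	defer stateFile.Close()

	// A real loader would hand stateFile to the state loader; this sketch
	// just reports how much data the checkpoint contains.
	n, err := io.Copy(ioutil.Discard, stateFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, "reading state file:", err)
		os.Exit(1)
	}
	fmt.Printf("state file contains %d bytes\n", n)
}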
PiperOrigin-RevId: 201068699 Change-Id: I18e96069488ffa3add468861397f3877725544aa --- runsc/boot/BUILD | 1 + runsc/boot/loader.go | 46 +++++++++++++++++++++++++++++++--------------- runsc/boot/loader_test.go | 2 +- runsc/cmd/boot.go | 6 +++++- 4 files changed, 38 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 1a81acde5..8b3b09a22 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -51,6 +51,7 @@ go_library( "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netlink/route", "//pkg/sentry/socket/unix", + "//pkg/sentry/state", "//pkg/sentry/strace", "//pkg/sentry/syscalls/linux", "//pkg/sentry/time", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 89300a953..526e8f8bb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -18,6 +18,7 @@ package boot import ( "fmt" "math/rand" + "os" "runtime" "sync/atomic" "syscall" @@ -35,6 +36,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" + "gvisor.googlesource.com/gvisor/pkg/sentry/state" slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" @@ -90,7 +92,7 @@ func init() { } // New initializes a new kernel loader configured by spec. -func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) { +func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []int, console bool) (*Loader, error) { // Create kernel and platform. p, err := createPlatform(conf) if err != nil { @@ -165,20 +167,34 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // Run(). networkStack := newEmptyNetworkStack(conf, k) - // Initiate the Kernel object, which is required by the Context passed - // to createVFS in order to mount (among other things) procfs. - if err = k.Init(kernel.InitKernelArgs{ - FeatureSet: cpuid.HostFeatureSet(), - Timekeeper: tk, - RootUserNamespace: creds.UserNamespace, - NetworkStack: networkStack, - // TODO: use number of logical processors from cgroups. - ApplicationCores: uint(runtime.NumCPU()), - Vdso: vdso, - RootUTSNamespace: utsns, - RootIPCNamespace: ipcns, - }); err != nil { - return nil, fmt.Errorf("error initializing kernel: %v", err) + // Check if we need to restore the kernel + if restoreFD != -1 { + restoreFile := os.NewFile(uintptr(restoreFD), "restore_file") + defer restoreFile.Close() + + // Load the state. + loadOpts := state.LoadOpts{ + Source: restoreFile, + } + if err := loadOpts.Load(k, p, networkStack); err != nil { + return nil, err + } + } else { + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + NetworkStack: networkStack, + // TODO: use number of logical processors from cgroups. + ApplicationCores: uint(runtime.NumCPU()), + Vdso: vdso, + RootUTSNamespace: utsns, + RootIPCNamespace: ipcns, + }); err != nil { + return nil, fmt.Errorf("error initializing kernel: %v", err) + } } // Turn on packet logging if enabled. 
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index a7f59f775..dab7ad0c5 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -59,7 +59,7 @@ func createLoader() (*Loader, error) { FileAccess: FileAccessDirect, DisableSeccomp: true, } - return New(testSpec(), conf, fd, nil, false) + return New(testSpec(), conf, fd, -1, nil, false) } // TestRun runs a simple application in a sandbox and checks that it succeeds. diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 34dd8b3c0..86f597c09 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -48,6 +48,9 @@ type Boot struct { // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool + + // restoreFD is the file descriptor to the state file to be restored. + restoreFD int } // Name implements subcommands.Command.Name. @@ -72,6 +75,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") + f.IntVar(&b.restoreFD, "restore-fd", -1, "FD of the state file to be restored") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -127,7 +131,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) + l, err := boot.New(spec, conf, b.controllerFD, b.restoreFD, b.ioFDs.GetArray(), b.console) if err != nil { Fatalf("error creating loader: %v", err) } -- cgit v1.2.3 From a6dbef045ff684e92f472280eb6f7f688b9bc87a Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Tue, 19 Jun 2018 15:22:23 -0700 Subject: Added a resume command to unpause a paused container. Resume checks the status of the container and unpauses the kernel if its status is paused. Otherwise nothing happens. Tests were added to ensure that the process is in the correct state after various commands. PiperOrigin-RevId: 201251234 Change-Id: Ifd11b336c33b654fea6238738f864fcf2bf81e19 --- pkg/sentry/control/proc.go | 2 + runsc/boot/controller.go | 11 ++- runsc/cmd/BUILD | 1 + runsc/cmd/resume.go | 68 +++++++++++++++++ runsc/container/container.go | 21 +++++- runsc/container/container_test.go | 152 ++++++++++++++++++++++++++++++++++++-- runsc/main.go | 2 + runsc/sandbox/sandbox.go | 15 ++++ 8 files changed, 262 insertions(+), 10 deletions(-) create mode 100644 runsc/cmd/resume.go (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index d77b30c90..d94ae560f 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -18,6 +18,7 @@ import ( "bytes" "encoding/json" "fmt" + "sort" "syscall" "text/tabwriter" "time" @@ -245,6 +246,7 @@ func Processes(k *kernel.Kernel, out *[]*Process) error { Cmd: tg.Leader().Name(), }) } + sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) return nil } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 564f2d271..ae727f144 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -44,6 +44,9 @@ const ( // processes running in a container. ContainerProcesses = "containerManager.Processes" + // ContainerResume unpauses the paused container. 
+ ContainerResume = "containerManager.Resume" + // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" @@ -156,12 +159,18 @@ func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { return state.Save(o, nil) } -// Pause suspends the process in a container. +// Pause suspends a container. func (cm *containerManager) Pause(_, _ *struct{}) error { cm.k.Pause() return nil } +// Resume unpauses a container. +func (cm *containerManager) Resume(_, _ *struct{}) error { + cm.k.Unpause() + return nil +} + // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { // TODO: Use the cid and wait on the init process in that diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 8fbce294f..fffb6f359 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -20,6 +20,7 @@ go_library( "pause.go", "ps.go", "restore.go", + "resume.go", "run.go", "start.go", "state.go", diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go new file mode 100644 index 000000000..a12adf1a3 --- /dev/null +++ b/runsc/cmd/resume.go @@ -0,0 +1,68 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" +) + +// Resume implements subcommands.Command for the "resume" command. +type Resume struct{} + +// Name implements subcommands.Command.Name. +func (*Resume) Name() string { + return "resume" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Resume) Synopsis() string { + return "Resume unpauses a paused container" +} + +// Usage implements subcommands.Command.Usage. +func (*Resume) Usage() string { + return `resume - resume a paused container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Resume) SetFlags(f *flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. +func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + cont, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading container: %v", err) + } + + if err := cont.Resume(); err != nil { + Fatalf("resume failed: %v", err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/container/container.go b/runsc/container/container.go index dc7fccdee..571784e07 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -361,8 +361,23 @@ func (c *Container) Pause() error { c.Status = Paused return c.save() default: - log.Warningf("container %q not created or running, not pausing", c.ID) - return nil + return fmt.Errorf("container %q not created or running, not pausing", c.ID) + } +} + +// Resume unpauses the container and its kernel. 
+// The call only succeeds if the container's status is paused. +func (c *Container) Resume() error { + log.Debugf("Resuming container %q", c.ID) + switch c.Status { + case Paused: + if err := c.Sandbox.Resume(c.ID); err != nil { + return fmt.Errorf("error resuming container: %v", err) + } + c.Status = Running + return c.save() + default: + return fmt.Errorf("container %q not paused, not resuming", c.ID) } } @@ -380,7 +395,7 @@ func (c *Container) State() specs.State { // Processes retrieves the list of processes and associated metadata inside a // container. func (c *Container) Processes() ([]*control.Process, error) { - if c.Status != Running { + if c.Status != Running && c.Status != Paused { return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", c.ID, c.Status) } return c.Sandbox.Processes(c.ID) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 5659abab3..7818990a7 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -84,6 +84,19 @@ func procListsEqual(got, want []*control.Process) bool { return true } +// getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the +// test for equality. This is because we already confirmed that exec occurred. +func getAndCheckProcLists(cont *container.Container, want []*control.Process) error { + got, err := cont.Processes() + if err != nil { + return fmt.Errorf("error getting process data from container: %v", err) + } + if procListsEqual(got, want) { + return nil + } + return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) +} + func procListToString(pl []*control.Process) string { strs := make([]string, 0, len(pl)) for _, p := range pl { @@ -459,11 +472,14 @@ func TestCheckpoint(t *testing.T) { } } -// TestPause tests that calling pause successfully pauses the container. -// It checks that no errors are returned and that the state of the container -// is in fact 'Paused.' -func TestPause(t *testing.T) { - spec := testutil.NewSpecWithArgs("sleep", "100") +// TestPauseResume tests that we can successfully pause and resume a container. +// It checks starts running sleep and executes another sleep. It pauses and checks +// that both processes are still running: sleep will be paused and still exist. +// It will then unpause and confirm that both processes are running. Then it will +// wait until one sleep completes and check to make sure the other is running. +func TestPauseResume(t *testing.T) { + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "20") rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { @@ -482,15 +498,139 @@ func TestPause(t *testing.T) { t.Fatalf("error starting container: %v", err) } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + } + + // First, start running exec (whick blocks). + go cont.Execute(&execArgs) + + // Verify that "sleep 5" is running. + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatal(err) + } + // Pause the running container. 
if err := cont.Pause(); err != nil { t.Errorf("error pausing container: %v", err) } + if got, want := cont.Status, container.Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + time.Sleep(10 * time.Second) + + // Verify that the two processes still exist. Sleep 5 is paused so + // it should still be in the process list after 10 seconds. + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } - // Confirm the status of the container is paused. + // Resume the running container. + if err := cont.Resume(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } + + expectedPL2 := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Verify there is only one process left since we waited 10 at most seconds for + // sleep 5 to end. + if err := waitForProcessList(cont, expectedPL2); err != nil { + t.Fatal(err) + } +} + +// TestPauseResumeStatus makes sure that the statuses are set correctly +// with calls to pause and resume and that pausing and resuming only +// occurs given the correct state. +func TestPauseResumeStatus(t *testing.T) { + spec := testutil.NewSpecWithArgs("sleep", "20") + + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Pause the running container. + if err := cont.Pause(); err != nil { + t.Errorf("error pausing container: %v", err) + } if got, want := cont.Status, container.Paused; got != want { t.Errorf("container status got %v, want %v", got, want) } + + // Try to Pause again. Should cause error. + if err := cont.Pause(); err == nil { + t.Errorf("error pausing container that was already paused: %v", err) + } + if got, want := cont.Status, container.Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Resume the running container. + if err := cont.Resume(); err != nil { + t.Errorf("error resuming container: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Try to resume again. Should cause error. 
+ if err := cont.Resume(); err == nil { + t.Errorf("error resuming container already running: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } } // TestCapabilities verifies that: diff --git a/runsc/main.go b/runsc/main.go index 42c8ee315..4d69f5803 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -76,7 +76,9 @@ func main() { subcommands.Register(new(cmd.Gofer), "") subcommands.Register(new(cmd.Kill), "") subcommands.Register(new(cmd.List), "") + subcommands.Register(new(cmd.Pause), "") subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Resume), "") subcommands.Register(new(cmd.Run), "") subcommands.Register(new(cmd.Start), "") subcommands.Register(new(cmd.State), "") diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index b008eba1e..0181dc9d4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -477,6 +477,21 @@ func (s *Sandbox) Pause(cid string) error { return nil } +// Resume sends the resume call for a container in the sandbox. +func (s *Sandbox) Resume(cid string) error { + log.Debugf("Resume sandbox %q", s.ID) + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ContainerResume, nil, nil); err != nil { + return fmt.Errorf("err resuming container %q: %v", cid, err) + } + return nil +} + // IsRunning returns true if the sandbox or gofer process is running. func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { -- cgit v1.2.3 From 33f29c730f46aacb56cb7710c31d19dbe0d5ff3f Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 19 Jun 2018 17:03:55 -0700 Subject: runsc: Fix flakey container_test. Verified that this is no longer flakey over 10K repetitions. PiperOrigin-RevId: 201267499 Change-Id: I793c916fe725412aec25953f764cb4f52c9fbed3 --- runsc/container/container_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 7818990a7..7f87ea5ab 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -164,6 +164,7 @@ func TestLifecycle(t *testing.T) { if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { t.Fatalf("error creating container: %v", err) } + // Load the container from disk and check the status. s, err := container.Load(rootDir, id) if err != nil { @@ -206,14 +207,17 @@ func TestLifecycle(t *testing.T) { go func() { ws, err := s.Wait() if err != nil { - t.Errorf("error waiting on container: %v", err) + t.Fatalf("error waiting on container: %v", err) } if got, want := ws.Signal(), syscall.SIGTERM; got != want { - t.Errorf("got signal %v, want %v", got, want) + t.Fatalf("got signal %v, want %v", got, want) } wg.Done() }() + // Wait a bit to ensure that we've started waiting on the container + // before we signal. + time.Sleep(5 * time.Second) // Send the container a SIGTERM which will cause it to stop. if err := s.Signal(syscall.SIGTERM); err != nil { t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) -- cgit v1.2.3 From 3ebd0e35f43d9ca282886aabce52fbb7fc7e1fc5 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 19 Jun 2018 17:16:39 -0700 Subject: runsc: Whitelist lstat, as it is now used in specutils. When running multi-container, child containers are added after the filters have been installed. Thus, lstat must be in the set of allowed syscalls. 
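The new lstat dependency comes from the earlier executable-lookup change: os.Lstat inspects the link itself, so it succeeds even when the link's target only resolves inside the container's root filesystem, whereas os.Stat would follow the link on the host and fail. A small sketch of that difference (the temporary paths are illustrative):

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
)

func main() {
	dir, err := ioutil.TempDir("", "lstat-demo-")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	// Create a symlink whose target does not exist on the host, mimicking a
	// link that is only resolvable inside the container's root filesystem.
	link := filepath.Join(dir, "busybox")
	if err := os.Symlink("/bin/busybox-in-container", link); err != nil {
		panic(err)
	}

	if _, err := os.Stat(link); err != nil {
		fmt.Println("Stat follows the link and fails:", err)
	}
	if fi, err := os.Lstat(link); err == nil {
		fmt.Println("Lstat sees the link itself:", fi.Mode())
	}
}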
PiperOrigin-RevId: 201269550 Change-Id: I03f2e6675a53d462ed12a0f651c10049b76d4c52 --- runsc/boot/filter/config.go | 61 ++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 4e286c5da..fdc3e02c6 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -24,35 +24,38 @@ import ( // allowedSyscalls is the set of syscalls executed by the Sentry // to the host OS. var allowedSyscalls = seccomp.SyscallRules{ - syscall.SYS_ACCEPT: {}, - syscall.SYS_ARCH_PRCTL: {}, - syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: {}, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, - syscall.SYS_DUP2: {}, - syscall.SYS_EPOLL_CREATE1: {}, - syscall.SYS_EPOLL_CTL: {}, - syscall.SYS_EPOLL_PWAIT: {}, - syscall.SYS_EPOLL_WAIT: {}, - syscall.SYS_EVENTFD2: {}, - syscall.SYS_EXIT: {}, - syscall.SYS_EXIT_GROUP: {}, - syscall.SYS_FALLOCATE: {}, - syscall.SYS_FCHMOD: {}, - syscall.SYS_FCNTL: {}, - syscall.SYS_FSTAT: {}, - syscall.SYS_FSYNC: {}, - syscall.SYS_FTRUNCATE: {}, - syscall.SYS_FUTEX: {}, - syscall.SYS_GETDENTS64: {}, - syscall.SYS_GETPID: {}, - unix.SYS_GETRANDOM: {}, - syscall.SYS_GETSOCKOPT: {}, - syscall.SYS_GETTID: {}, - syscall.SYS_GETTIMEOFDAY: {}, - syscall.SYS_LISTEN: {}, - syscall.SYS_LSEEK: {}, + syscall.SYS_ACCEPT: {}, + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_CLOCK_GETTIME: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_DUP2: {}, + syscall.SYS_EPOLL_CREATE1: {}, + syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_EPOLL_PWAIT: {}, + syscall.SYS_EPOLL_WAIT: {}, + syscall.SYS_EVENTFD2: {}, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FALLOCATE: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCNTL: {}, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_GETDENTS64: {}, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, + syscall.SYS_GETSOCKOPT: {}, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_LISTEN: {}, + syscall.SYS_LSEEK: {}, + // TODO: Remove SYS_LSTAT when executable lookup moves + // into the gofer. + syscall.SYS_LSTAT: {}, syscall.SYS_MADVISE: {}, syscall.SYS_MINCORE: {}, syscall.SYS_MMAP: {}, -- cgit v1.2.3 From 5397963b5d4d57bd3d3668df880b5314ca2fc3d8 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 19 Jun 2018 21:42:21 -0700 Subject: runsc: Enable container creation within existing sandboxes. Containers are created as processes in the sandbox. Of the many things that don't work yet, the biggest issue is that the fsgofer is launched with its root as the sandbox's root directory. Thus, when a container is started and wants to read anything (including the init binary of the container), the gofer tries to serve from sandbox's root (which basically just has pause), not the container's. 
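On the sandbox side this follows the same shape as the Pause, Resume, and Checkpoint wrappers above: connect to the control socket, then issue a containerManager.Start call carrying the new container's spec and config. The actual sandbox.go change from this commit is not reproduced in this excerpt, so the wrapper below is only a sketch modeled on that pattern; its name and signature are assumptions, while StartArgs and boot.ContainerStart are the definitions shown in the controller diff that follows.

// Hypothetical client-side wrapper, modeled on the Pause and Checkpoint
// methods shown earlier in this series; treat the method name and signature
// as assumptions rather than the committed code.
func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config) error {
	log.Debugf("Start non-root container in sandbox %q", s.ID)
	conn, err := s.connect()
	if err != nil {
		return err
	}
	defer conn.Close()

	args := boot.StartArgs{
		Spec: spec,
		Conf: conf,
	}
	if err := conn.Call(boot.ContainerStart, &args, nil); err != nil {
		return fmt.Errorf("err starting container in sandbox %q: %v", s.ID, err)
	}
	return nil
}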
PiperOrigin-RevId: 201294560 Change-Id: I6423aa8830538959c56ae908ce067e4199d627b1 --- runsc/boot/controller.go | 45 +++++++++ runsc/boot/loader.go | 193 ++++++++++++++++++++++++++------------ runsc/cmd/events.go | 2 +- runsc/cmd/exec.go | 2 +- runsc/cmd/ps.go | 2 +- runsc/container/BUILD | 1 + runsc/container/container.go | 66 +++++++++---- runsc/container/container_test.go | 73 +++++++++++++- runsc/sandbox/sandbox.go | 42 +++++++-- runsc/sandbox/sandbox_test.go | 2 +- runsc/specutils/specutils.go | 55 ++++++++++- 11 files changed, 386 insertions(+), 97 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ae727f144..1a598199d 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -15,9 +15,12 @@ package boot import ( + "errors" "fmt" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -50,6 +53,10 @@ const ( // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" + // ContainerStart is the URPC endpoint for running a non-root container + // within a sandbox. + ContainerStart = "containerManager.Start" + // ContainerWait is used to wait on the init process of the container // and return its ExitStatus. ContainerWait = "containerManager.Wait" @@ -127,10 +134,14 @@ type containerManager struct { // watchdog is the kernel watchdog. watchdog *watchdog.Watchdog + + // l is the loader that creates containers and sandboxes. + l *Loader } // StartRoot will start the root container process. func (cm *containerManager) StartRoot(_, _ *struct{}) error { + log.Debugf("containerManager.StartRoot") // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} return <-cm.startResultChan @@ -138,11 +149,42 @@ func (cm *containerManager) StartRoot(_, _ *struct{}) error { // Processes retrieves information about processes running in the sandbox. func (cm *containerManager) Processes(_, out *[]*control.Process) error { + log.Debugf("containerManager.Processes") return control.Processes(cm.k, out) } +// StartArgs contains arguments to the Start method. +type StartArgs struct { + // Spec is the spec of the container to start. + Spec *specs.Spec + + // TODO: Separate sandbox and container configs. + // Config is the runsc-specific configuration for the sandbox. + Conf *Config +} + +// Start runs a created container within a sandbox. +func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { + log.Debugf("containerManager.Start") + + // Validate arguments. + if args == nil { + return errors.New("start missing arguments") + } + if args.Spec == nil { + return errors.New("start arguments missing spec") + } + if args.Conf == nil { + return errors.New("start arguments missing config") + } + + cm.l.startContainer(args, cm.k) + return nil +} + // Execute runs a command on a created or running sandbox. func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { + log.Debugf("containerManager.Execute") proc := control.Proc{Kernel: cm.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) @@ -152,6 +194,7 @@ func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) err // Checkpoint pauses a sandbox and saves its state. 
func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { + log.Debugf("containerManager.Checkpoint") state := control.State{ Kernel: cm.k, Watchdog: cm.watchdog, @@ -173,6 +216,7 @@ func (cm *containerManager) Resume(_, _ *struct{}) error { // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") // TODO: Use the cid and wait on the init process in that // container. Currently we just wait on PID 1 in the sandbox. tg := cm.k.TaskSet().Root.ThreadGroupWithID(1) @@ -195,6 +239,7 @@ type SignalArgs struct { // Signal sends a signal to the init process of the container. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal") // TODO: Use the cid and send the signal to the init // process in theat container. Currently we just signal PID 1 in the // sandbox. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 526e8f8bb..d1a413cc7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package boot loads the kernel and runs a container.. +// Package boot loads the kernel and runs a container. package boot import ( @@ -79,8 +79,8 @@ type Loader struct { // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() - // procArgs refers to the root container task. - procArgs kernel.CreateProcessArgs + // rootProcArgs refers to the root sandbox init task. + rootProcArgs kernel.CreateProcessArgs } func init() { @@ -117,12 +117,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in } tk.SetClocks(time.NewCalibratedClocks()) - // Create initial limits. - ls, err := createLimitSet(spec) - if err != nil { - return nil, fmt.Errorf("error creating limits: %v", err) - } - // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -154,13 +148,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in return nil, fmt.Errorf("failed to enable strace: %v", err) } - // Get the executable path, which is a bit tricky because we have to - // inspect the environment PATH which is relative to the root path. - exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env) - if err != nil { - return nil, fmt.Errorf("error getting executable path: %v", err) - } - // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside @@ -223,16 +210,56 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in return nil, fmt.Errorf("error creating control server: %v", err) } + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + } + // Ensure that most signals received in sentry context are forwarded to + // the emulated kernel. 
+ stopSignalForwarding := sighandling.StartForwarding(k) + + procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + if err != nil { + return nil, fmt.Errorf("failed to create root process: %v", err) + } + + l := &Loader{ + k: k, + ctrl: ctrl, + conf: conf, + console: console, + watchdog: watchdog, + stopSignalForwarding: stopSignalForwarding, + rootProcArgs: procArgs, + } + ctrl.manager.l = l + return l, nil +} + +// newProcess creates a process that can be run with kernel.CreateProcess. +func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { + // Create initial limits. + ls, err := createLimitSet(spec) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("error creating limits: %v", err) + } + + // Get the executable path, which is a bit tricky because we have to + // inspect the environment PATH which is relative to the root path. + exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("error getting executable path: %v", err) + } + // Create the process arguments. procArgs := kernel.CreateProcessArgs{ - Filename: exec, - Argv: spec.Process.Args, - Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, - Credentials: creds, - // Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so - // it must wait until we have a Kernel. - Umask: uint(syscall.Umask(0)), + Filename: exec, + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: spec.Process.Cwd, + Credentials: creds, + Umask: uint(0022), Limits: ls, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: utsns, @@ -240,52 +267,42 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in } ctx := procArgs.NewContext(k) - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: uint(syscall.Umask(0022)), - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - - // Create the virtual filesystem. - mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) - if err != nil { - return nil, fmt.Errorf("error creating mounts: %v", err) - } - k.SetRootMountNamespace(mns) - - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. + // Create the FD map, which will set stdin, stdout, and stderr. If + // console is true, then ioctl calls will be passed through to the host + // fd. fdm, err := createFDMap(ctx, k, ls, console) if err != nil { - return nil, fmt.Errorf("error importing fds: %v", err) + return kernel.CreateProcessArgs{}, fmt.Errorf("error importing fds: %v", err) } // CreateProcess takes a reference on FDMap if successful. We // won't need ours either way. procArgs.FDMap = fdm - // We don't care about child signals; some platforms can generate a - // tremendous number of useless ones (I'm looking at you, ptrace). - if err := sighandling.IgnoreChildStop(); err != nil { - return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + // If this is the root container, we also need to setup the root mount + // namespace. 
+ if k.RootMountNamespace() == nil { + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + // The sentry should run with a umask of 0. + Umask: uint(syscall.Umask(0)), + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + + // Create the virtual filesystem. + mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("error creating mounts: %v", err) + } + + k.SetRootMountNamespace(mns) } - // Ensure that most signals received in sentry context are forwarded to - // the emulated kernel. - stopSignalForwarding := sighandling.StartForwarding(k) - return &Loader{ - k: k, - ctrl: ctrl, - conf: conf, - console: console, - watchdog: watchdog, - stopSignalForwarding: stopSignalForwarding, - procArgs: procArgs, - }, nil + return procArgs, nil } // Destroy cleans up all resources used by the loader. @@ -350,17 +367,69 @@ func (l *Loader) run() error { } // Create the root container init task. - if _, err := l.k.CreateProcess(l.procArgs); err != nil { + if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("failed to create init process: %v", err) } // CreateProcess takes a reference on FDMap if successful. - l.procArgs.FDMap.DecRef() + l.rootProcArgs.FDMap.DecRef() l.watchdog.Start() return l.k.Start() } +func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) error { + spec := args.Spec + // Create capabilities. + caps, err := specutils.Capabilities(spec.Process.Capabilities) + if err != nil { + return fmt.Errorf("error creating capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. We reuse the root user namespace because the + // sentry currently supports only 1 mount namespace, which is tied to a + // single user namespace. Thus we must run in the same user namespace + // to access mounts. + // TODO: Create a new mount namespace for the container. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + l.k.RootUserNamespace()) + + // TODO New containers should be started in new PID namespaces + // when indicated by the spec. + + procArgs, err := newProcess( + args.Spec, + args.Conf, + nil, // ioFDs + false, // console + creds, + k.RootUTSNamespace(), + k.RootIPCNamespace(), + k) + if err != nil { + return fmt.Errorf("failed to create new process: %v", err) + } + + if _, err := l.k.CreateProcess(procArgs); err != nil { + return fmt.Errorf("failed to create process in sentry: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. + procArgs.FDMap.DecRef() + + return nil +} + // WaitForStartSignal waits for a start signal from the control server. 
func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index f221ad3ae..df65ea31d 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -76,7 +76,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandox: %v", err) + Fatalf("error loading sandbox: %v", err) } // Repeatedly get stats from the container. diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 235ed9bc6..cbce07c8e 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -104,7 +104,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandox: %v", err) + Fatalf("error loading sandbox: %v", err) } if e.WorkingDirectory == "" { diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index 9f9f4d15e..5d219bfdc 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -62,7 +62,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandox: %v", err) + Fatalf("error loading sandbox: %v", err) } pList, err := c.Processes() if err != nil { diff --git a/runsc/container/BUILD b/runsc/container/BUILD index fe477abf2..61e05e1c3 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -37,6 +37,7 @@ go_test( "//pkg/sentry/kernel/auth", "//pkg/unet", "//runsc/container", + "//runsc/specutils", "//runsc/test/testutil", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/container/container.go b/runsc/container/container.go index 571784e07..3b7f95af9 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -214,22 +214,43 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo Owner: os.Getenv("USER"), } - // TODO: If the metadata annotations indicate that this - // container should be started in another sandbox, we must do so. The - // metadata will indicate the ID of the sandbox, which is the same as - // the ID of the init container in the sandbox. We can look up that - // init container by ID to get the sandbox, then we need to expose a - // way to run a new container in the sandbox. - - // Start a new sandbox for this container. Any errors after this point - // must destroy the container. - s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) - if err != nil { - c.Destroy() - return nil, err - } + // If the metadata annotations indicate that this container should be + // started in an existing sandbox, we must do so. The metadata will + // indicate the ID of the sandbox, which is the same as the ID of the + // init container in the sandbox. + if specutils.ShouldCreateSandbox(spec) { + log.Debugf("Creating new sandbox for container %q", id) + // Start a new sandbox for this container. Any errors after this point + // must destroy the container. + s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) + if err != nil { + c.Destroy() + return nil, err + } + c.Sandbox = s + } else { + // This is sort of confusing. For a sandbox with a root + // container and a child container in it, runsc sees: + // * A container struct whose sandbox ID is equal to the + // container ID. This is the root container that is tied to + // the creation of the sandbox. 
+ // * A container struct whose sandbox ID is equal to the above + // container/sandbox ID, but that has a different container + // ID. This is the child container. + sbid, ok := specutils.SandboxID(spec) + if !ok { + return nil, fmt.Errorf("no sandbox ID found when creating container") + } + log.Debugf("Creating new container %q in sandbox %q", c.ID, sbid) - c.Sandbox = s + // Find the sandbox associated with this ID. + sb, err := Load(conf.RootDir, sbid) + if err != nil { + c.Destroy() + return nil, err + } + c.Sandbox = sb.Sandbox + } c.Status = Created // Save the metadata file. @@ -242,7 +263,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // this file is created, so it must be the last thing we do. if pidFile != "" { if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.Pid())), 0644); err != nil { - s.Destroy() + c.Destroy() return nil, fmt.Errorf("error writing pid file: %v", err) } } @@ -266,9 +287,16 @@ func (c *Container) Start(conf *boot.Config) error { } } - if err := c.Sandbox.Start(c.ID, c.Spec, conf); err != nil { - c.Destroy() - return err + if specutils.ShouldCreateSandbox(c.Spec) { + if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil { + c.Destroy() + return err + } + } else { + if err := c.Sandbox.Start(c.Spec, conf); err != nil { + c.Destroy() + return err + } } // "If any poststart hook fails, the runtime MUST log a warning, but diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 7f87ea5ab..1116ca170 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -36,6 +36,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/container" + "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -51,7 +52,7 @@ func waitForProcessList(s *container.Container, expected []*control.Process) err var got []*control.Process for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { var err error - got, err := s.Processes() + got, err = s.Processes() if err != nil { return fmt.Errorf("error getting process data from container: %v", err) } @@ -946,3 +947,73 @@ func TestAbbreviatedIDs(t *testing.T) { } } } + +// TestMultiContainerSanity checks that it is possible to run 2 dead-simple +// containers in the same sandbox. +func TestMultiContainerSanity(t *testing.T) { + containerIDs := []string{ + testutil.UniqueContainerID(), + testutil.UniqueContainerID(), + } + containerAnnotations := []map[string]string{ + // The first container creates a sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + }, + // The second container creates a container within the first + // container's sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: containerIDs[0], + }, + } + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Setup the containers. 
+ containers := make([]*container.Container, 0, len(containerIDs)) + for i, annotations := range containerAnnotations { + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Annotations = annotations + bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: 0, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Check via ps that multiple processes are running. + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 0181dc9d4..90b46e247 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -81,9 +81,9 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return s, nil } -// Start starts running the containerized process inside the sandbox. -func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error { - log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid) +// StartRoot starts running the root container process inside the sandbox. +func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { + log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid) conn, err := s.connect() if err != nil { return err @@ -96,9 +96,7 @@ func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error { } // Send a message to the sandbox control server to start the root - // container.. - // - // TODO: We need a way to start non-root containers. + // container. if err := conn.Call(boot.RootContainerStart, nil, nil); err != nil { return fmt.Errorf("error starting root container %v: %v", spec.Process.Args, err) } @@ -106,6 +104,26 @@ func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error { return nil } +// Start starts running a non-root container inside the sandbox. +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config) error { + log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() + + args := boot.StartArgs{ + Spec: spec, + Conf: conf, + } + if err := conn.Call(boot.ContainerStart, args, nil); err != nil { + return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) + } + + return nil +} + // Processes retrieves the list of processes and associated metadata for a // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { @@ -130,11 +148,11 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) conn, err := s.connect() if err != nil { - return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + return 0, s.connError(err) } defer conn.Close() - // Send a message to the sandbox control server to start the container.. + // Send a message to the sandbox control server to start the container. 
var waitStatus uint32 // TODO: Pass in the container id (cid) here. The sandbox // should execute in the context of that container. @@ -168,11 +186,15 @@ func (s *Sandbox) connect() (*urpc.Client, error) { log.Debugf("Connecting to sandbox %q", s.ID) conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) if err != nil { - return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + return nil, s.connError(err) } return conn, nil } +func (s *Sandbox) connError(err error) error { + return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) +} + func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) ([]*os.File, error) { if conf.FileAccess != boot.FileAccessProxy { // Don't start a gofer. The sandbox will access host FS directly. @@ -266,7 +288,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } // If the console control socket file is provided, then create a new - // pty master/slave pair and set the tty on the sandox process. + // pty master/slave pair and set the tty on the sandbox process. if consoleEnabled { // setupConsole will send the master on the socket, and return // the slave. diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index e25290d5e..fee2de283 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -44,7 +44,7 @@ func TestGoferExits(t *testing.T) { t.Fatalf("error creating container: %v", err) } defer s.Destroy() - if err := s.Start("123", spec, conf); err != nil { + if err := s.StartRoot(spec, conf); err != nil { t.Fatalf("error starting container: %v", err) } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 8dae3efb1..c552111f2 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -63,6 +63,26 @@ func ValidateSpec(spec *specs.Spec) error { if spec.Linux != nil && spec.Linux.Seccomp != nil { log.Warningf("Seccomp spec is being ignored") } + + // 2 annotations are use by containerd to support multi-container pods. + // "io.kubernetes.cri.container-type" + // "io.kubernetes.cri.sandbox-id" + containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation] + _, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation] + switch { + // Non-containerd use won't set a container type. + case !hasContainerType: + case containerType == ContainerdContainerTypeSandbox: + // When starting a container in an existing sandbox, the sandbox ID + // must be set. + case containerType == ContainerdContainerTypeContainer: + if !hasSandboxID { + return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType) + } + default: + return fmt.Errorf("unknown container-type: %s", containerType) + } + return nil } @@ -82,7 +102,7 @@ func ReadSpec(bundleDir string) (*specs.Spec, error) { } // GetExecutablePath returns the absolute path to the executable, relative to -// the root. It searches the environment PATH for the first file that exists +// the root. It searches the environment PATH for the first file that exists // with the given name. 
func GetExecutablePath(exec, root string, env []string) (string, error) { exec = filepath.Clean(exec) @@ -246,6 +266,39 @@ func BinPath() (string, error) { return binPath, nil } +const ( + // ContainerdContainerTypeAnnotation is the OCI annotation set by + // containerd to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" + // ContainerdContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + ContainerdContainerTypeContainer = "container" + // ContainerdContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + ContainerdContainerTypeSandbox = "sandbox" + + // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" +) + +// ShouldCreateSandbox returns true if the spec indicates that a new sandbox +// should be created for the container. If false, the container should be +// started in an existing sandbox. +func ShouldCreateSandbox(spec *specs.Spec) bool { + t, ok := spec.Annotations[ContainerdContainerTypeAnnotation] + return !ok || t == ContainerdContainerTypeSandbox +} + +// SandboxID returns the ID of the sandbox to join and whether an ID was found +// in the spec. +func SandboxID(spec *specs.Spec) (string, bool) { + id, ok := spec.Annotations[ContainerdSandboxIDAnnotation] + return id, ok +} + // WaitForReady waits for a process to become ready. The process is ready when // the 'ready' function returns true. It continues to wait if 'ready' returns // false. It returns error on timeout, if the process stops or if 'ready' fails. -- cgit v1.2.3 From af6f9f56f80027a89ee517b79502ca6183094a39 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 20 Jun 2018 13:00:21 -0700 Subject: Add tool to configure runtime settings in docker This will be used with the upcoming e2e image tests. PiperOrigin-RevId: 201400832 Change-Id: I49509314e16ea54655ea8060dbf511a04a7a8f79 --- runsc/tools/dockercfg/BUILD | 12 +++ runsc/tools/dockercfg/dockercfg.go | 189 +++++++++++++++++++++++++++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 runsc/tools/dockercfg/BUILD create mode 100644 runsc/tools/dockercfg/dockercfg.go (limited to 'runsc') diff --git a/runsc/tools/dockercfg/BUILD b/runsc/tools/dockercfg/BUILD new file mode 100644 index 000000000..5abb0c90a --- /dev/null +++ b/runsc/tools/dockercfg/BUILD @@ -0,0 +1,12 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + +go_binary( + name = "dockercfg", + srcs = ["dockercfg.go"], + visibility = [ + "//runsc/test:__subpackages__", + ], + deps = ["@com_github_google_subcommands//:go_default_library"], +) diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go new file mode 100644 index 000000000..0bd6cad93 --- /dev/null +++ b/runsc/tools/dockercfg/dockercfg.go @@ -0,0 +1,189 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Helper tool to configure Docker daemon. +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "log" + "os" + + "context" + "flag" + "github.com/google/subcommands" +) + +var ( + configFile = flag.String("config_file", "/etc/docker/daemon.json", "path to Docker daemon config file") +) + +func main() { + subcommands.Register(subcommands.HelpCommand(), "") + subcommands.Register(subcommands.FlagsCommand(), "") + subcommands.Register(&runtimeAdd{}, "") + subcommands.Register(&runtimeRemove{}, "") + + // All subcommands must be registered before flag parsing. + flag.Parse() + + exitCode := subcommands.Execute(context.Background()) + os.Exit(int(exitCode)) +} + +type runtime struct { + Path string `json:"path,omitempty"` + RuntimeArgs []string `json:"runtimeArgs,omitempty"` +} + +// runtimeAdd implements subcommands.Command. +type runtimeAdd struct { +} + +// Name implements subcommands.Command.Name. +func (*runtimeAdd) Name() string { + return "runtime-add" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*runtimeAdd) Synopsis() string { + return "adds a runtime to docker daemon configuration" +} + +// Usage implements subcommands.Command.Usage. +func (*runtimeAdd) Usage() string { + return `runtime-add [flags] [args...] -- if provided, args are passed as arguments to the runtime +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*runtimeAdd) SetFlags(*flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. +func (r *runtimeAdd) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() < 2 { + f.Usage() + return subcommands.ExitUsageError + } + name := f.Arg(0) + path := f.Arg(1) + runtimeArgs := f.Args()[2:] + + fmt.Printf("Adding runtime %q to file %q\n", name, *configFile) + c, err := readConfig(*configFile) + if err != nil { + log.Fatalf("Error reading config file %q: %v", *configFile, err) + } + + var rts map[string]interface{} + if i, ok := c["runtimes"]; ok { + rts = i.(map[string]interface{}) + } else { + rts = make(map[string]interface{}) + c["runtimes"] = rts + } + rts[name] = runtime{Path: path, RuntimeArgs: runtimeArgs} + + if err := writeConfig(c, *configFile); err != nil { + log.Fatalf("Error writing config file %q: %v", *configFile, err) + } + return subcommands.ExitSuccess +} + +// runtimeRemove implements subcommands.Command. +type runtimeRemove struct { +} + +// Name implements subcommands.Command.Name. +func (*runtimeRemove) Name() string { + return "runtime-rm" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*runtimeRemove) Synopsis() string { + return "removes a runtime from docker daemon configuration" +} + +// Usage implements subcommands.Command.Usage. +func (*runtimeRemove) Usage() string { + return `runtime-rm [flags] +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*runtimeRemove) SetFlags(*flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. 
+func (r *runtimeRemove) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + name := f.Arg(0) + + fmt.Printf("Removing runtime %q from file %q\n", name, *configFile) + c, err := readConfig(*configFile) + if err != nil { + log.Fatalf("Error reading config file %q: %v", *configFile, err) + } + + var rts map[string]interface{} + if i, ok := c["runtimes"]; ok { + rts = i.(map[string]interface{}) + } else { + log.Fatalf("runtime %q not found", name) + } + if _, ok := rts[name]; !ok { + log.Fatalf("runtime %q not found", name) + } + delete(rts, name) + + if err := writeConfig(c, *configFile); err != nil { + log.Fatalf("Error writing config file %q: %v", *configFile, err) + } + return subcommands.ExitSuccess +} + +func readConfig(path string) (map[string]interface{}, error) { + configBytes, err := ioutil.ReadFile(path) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + c := make(map[string]interface{}) + if len(configBytes) > 0 { + if err := json.Unmarshal(configBytes, &c); err != nil { + return nil, err + } + } + return c, nil +} + +func writeConfig(c map[string]interface{}, path string) error { + b, err := json.MarshalIndent(c, "", " ") + if err != nil { + return err + } + + if err := os.Rename(path, path+"~"); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("error renaming config file %q: %v", path, err) + } + if err := ioutil.WriteFile(path, b, 0644); err != nil { + return fmt.Errorf("error writing config file %q: %v", path, err) + } + return nil +} -- cgit v1.2.3 From 4ad7315b6759afa81f492ec119080deb9a224101 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 20 Jun 2018 13:30:39 -0700 Subject: Add 'runsc debug' command It prints sandbox stacks to the log to help debug stuckness. I expect that many more options will be added in the future. PiperOrigin-RevId: 201405931 Change-Id: I87e560800cd5a5a7b210dc25a5661363c8c3a16e --- pkg/log/log.go | 8 ++-- runsc/boot/BUILD | 1 + runsc/boot/controller.go | 5 +++ runsc/boot/debug.go | 29 +++++++++++++ runsc/cmd/BUILD | 1 + runsc/cmd/debug.go | 108 +++++++++++++++++++++++++++++++++++++++++++++++ runsc/main.go | 1 + runsc/sandbox/sandbox.go | 16 +++++++ 8 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 runsc/boot/debug.go create mode 100644 runsc/cmd/debug.go (limited to 'runsc') diff --git a/pkg/log/log.go b/pkg/log/log.go index cdfc0601a..c496e86e4 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -251,8 +251,8 @@ const defaultStackSize = 1 << 16 // 64KB // maxStackSize is the maximum buffer size to allocate for stack traces. const maxStackSize = 1 << 26 // 64MB -// stacks returns goroutine stacks, like panic. -func stacks(all bool) []byte { +// Stacks returns goroutine stacks, like panic. +func Stacks(all bool) []byte { var trace []byte for s := defaultStackSize; s <= maxStackSize; s *= 4 { trace = make([]byte, s) @@ -271,7 +271,7 @@ func stacks(all bool) []byte { // // This will be print a traceback, tb, as Warningf(format+":\n%s", v..., tb). func Traceback(format string, v ...interface{}) { - v = append(v, stacks(false)) + v = append(v, Stacks(false)) Warningf(format+":\n%s", v...) } @@ -279,7 +279,7 @@ func Traceback(format string, v ...interface{}) { // // This will be print a traceback, tb, as Warningf(format+":\n%s", v..., tb). func TracebackAll(format string, v ...interface{}) { - v = append(v, stacks(true)) + v = append(v, Stacks(true)) Warningf(format+":\n%s", v...) 
} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 8b3b09a22..924cc2b90 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "config.go", "controller.go", + "debug.go", "events.go", "fds.go", "fs.go", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 1a598199d..ec24c4dad 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -68,6 +68,9 @@ const ( // RootContainerStart is the URPC endpoint for starting a new sandbox // with root container. RootContainerStart = "containerManager.StartRoot" + + // SandboxStacks collects sandbox stacks for debugging. + SandboxStacks = "debug.Stacks" ) // ControlSocketAddr generates an abstract unix socket name for the given id. @@ -107,6 +110,8 @@ func newController(fd int, k *kernel.Kernel, w *watchdog.Watchdog) (*controller, srv.Register(net) } + srv.Register(&debug{}) + if err := srv.StartServing(); err != nil { return nil, err } diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go new file mode 100644 index 000000000..971962c91 --- /dev/null +++ b/runsc/boot/debug.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.googlesource.com/gvisor/pkg/log" +) + +type debug struct { +} + +// Stacks collects all sandbox stacks and copies them to 'stacks'. +func (*debug) Stacks(_ *struct{}, stacks *string) error { + buf := log.Stacks(true) + *stacks = string(buf) + return nil +} diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index fffb6f359..747793efc 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -10,6 +10,7 @@ go_library( "checkpoint.go", "cmd.go", "create.go", + "debug.go", "delete.go", "events.go", "exec.go", diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go new file mode 100644 index 000000000..87ad21c9a --- /dev/null +++ b/runsc/cmd/debug.go @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" +) + +// Debug implements subcommands.Command for the "debug" command. +type Debug struct { + pid int + stacks bool +} + +// Name implements subcommands.Command. +func (*Debug) Name() string { + return "debug" +} + +// Synopsis implements subcommands.Command. 
+func (*Debug) Synopsis() string { + return "shows a variety of debug information" +} + +// Usage implements subcommands.Command. +func (*Debug) Usage() string { + return `debug [flags] ` +} + +// SetFlags implements subcommands.Command. +func (d *Debug) SetFlags(f *flag.FlagSet) { + f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set") + f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") +} + +// Execute implements subcommands.Command.Execute. +func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + var c *container.Container + conf := args[0].(*boot.Config) + + if d.pid == 0 { + // No pid, container ID must have been provided. + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + var err error + c, err = container.Load(conf.RootDir, f.Arg(0)) + if err != nil { + Fatalf("error loading container %q: %v", f.Arg(0), err) + } + } else { + if f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + // Go over all sandboxes and find the one that matches PID. + ids, err := container.List(conf.RootDir) + if err != nil { + Fatalf("error listing containers: %v", err) + } + for _, id := range ids { + candidate, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading container %q: %v", id, err) + } + if candidate.Pid() == d.pid { + c = candidate + break + } + } + if c == nil { + Fatalf("container with PID %d not found", d.pid) + } + } + + log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) + if !c.Sandbox.IsRunning() { + Fatalf("sandbox %q is not running", c.Sandbox.ID) + } + + if d.stacks { + log.Infof("Retrieving sandbox stacks") + stacks, err := c.Sandbox.Stacks() + if err != nil { + Fatalf("error retrieving stacks: %v", err) + } + log.Infof(" *** Stack dump ***\n%s", stacks) + } + return subcommands.ExitSuccess +} diff --git a/runsc/main.go b/runsc/main.go index 4d69f5803..cd906e191 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -88,6 +88,7 @@ func main() { // The string below will be printed above the commands. const internalGroup = "internal use only" subcommands.Register(new(cmd.Boot), internalGroup) + subcommands.Register(new(cmd.Debug), internalGroup) subcommands.Register(new(cmd.Gofer), internalGroup) // All subcommands must be registered before flag parsing. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 90b46e247..652910efa 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -537,6 +537,22 @@ func (s *Sandbox) IsRunning() bool { return false } +// Stacks collects and returns all stacks for the sandbox. +func (s *Sandbox) Stacks() (string, error) { + log.Debugf("Stacks sandbox %q", s.ID) + conn, err := s.connect() + if err != nil { + return "", err + } + defer conn.Close() + + var stacks string + if err := conn.Call(boot.SandboxStacks, nil, &stacks); err != nil { + return "", fmt.Errorf("err getting sandbox %q stacks: %v", s.ID, err) + } + return stacks, nil +} + // killProcess sends a signal to the host process (i.e. a sandbox or gofer // process). Sandbox.Signal should be used to send a signal to a process // running inside the sandbox. 
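The Stacks facility added above is also reachable programmatically outside the new runsc debug command: Sandbox.Stacks issues the debug.Stacks URPC to the boot process, which replies with the output of log.Stacks(true) collected inside the sentry. A rough sketch (illustrative only; the root directory and container ID are placeholders):

// stacks_example.go: dumps a sandbox's goroutine stacks, mirroring what
// "runsc debug --stacks" does for a given container.
package main

import (
	"fmt"
	"log"

	"gvisor.googlesource.com/gvisor/runsc/container"
)

func main() {
	const (
		rootDir = "/var/run/runsc" // placeholder for the runsc --root directory
		id      = "my-container"   // placeholder container ID
	)
	c, err := container.Load(rootDir, id)
	if err != nil {
		log.Fatalf("error loading container %q: %v", id, err)
	}
	// Sandbox.Stacks connects to the control server and calls boot.SandboxStacks.
	stacks, err := c.Sandbox.Stacks()
	if err != nil {
		log.Fatalf("error retrieving stacks: %v", err)
	}
	fmt.Println(stacks)
}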
-- cgit v1.2.3 From 2b5bdb525e99fc1ef099b2ef083a09772241ea58 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 20 Jun 2018 14:37:56 -0700 Subject: Add end-to-end image tests PiperOrigin-RevId: 201418619 Change-Id: I7961b027394d98422642f829bc54745838c138bd --- runsc/test/image/BUILD | 23 ++++ runsc/test/image/image.go | 16 +++ runsc/test/image/image_test.go | 230 ++++++++++++++++++++++++++++++++++++++++ runsc/test/image/latin10k.txt | 33 ++++++ runsc/test/testutil/testutil.go | 19 ++++ 5 files changed, 321 insertions(+) create mode 100644 runsc/test/image/BUILD create mode 100644 runsc/test/image/image.go create mode 100644 runsc/test/image/image_test.go create mode 100644 runsc/test/image/latin10k.txt (limited to 'runsc') diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD new file mode 100644 index 000000000..2876d4256 --- /dev/null +++ b/runsc/test/image/BUILD @@ -0,0 +1,23 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_test( + name = "image_test", + size = "small", + srcs = ["image_test.go"], + data = ["latin10k.txt"], + embed = [":image"], + tags = [ + # Requires docker and runsc to be configured before the test runs. + "manual", + "local", + ], + deps = ["//runsc/test/testutil"], +) + +go_library( + name = "image", + srcs = ["image.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/test/image", +) diff --git a/runsc/test/image/image.go b/runsc/test/image/image.go new file mode 100644 index 000000000..069d08013 --- /dev/null +++ b/runsc/test/image/image.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package image is empty. See image_test.go for description. +package image diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go new file mode 100644 index 000000000..08b1bf279 --- /dev/null +++ b/runsc/test/image/image_test.go @@ -0,0 +1,230 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package image provides end-to-end image tests for runsc. These tests require +// docker and runsc to be installed on the machine. +// +// The tests expect the runtime name to be provided in the RUNSC_RUNTIME +// environment variable (default: runsc-test). +// +// Each test calls docker commands to start up a container, and tests that it is +// behaving properly, like connecting to a port or looking at the output. The +// container is killed and deleted at the end. 
+package image + +import ( + "fmt" + "io/ioutil" + "log" + "math/rand" + "net/http" + "os" + "os/exec" + "path" + "regexp" + "strconv" + "strings" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +func init() { + rand.Seed(time.Now().UnixNano()) +} + +func runtime() string { + r := os.Getenv("RUNSC_RUNTIME") + if r == "" { + return "runsc-test" + } + return r +} + +func mountArg(source, target string) string { + return fmt.Sprintf("%s:%s", source, target) +} + +func getLocalPath(file string) string { + return path.Join(".", file) +} + +type docker struct { + runtime string + name string +} + +func makeDocker(namePrefix string) docker { + suffix := fmt.Sprintf("-%06d", rand.Int())[:7] + return docker{name: namePrefix + suffix, runtime: runtime()} +} + +// do executes docker command. +func (d *docker) do(args ...string) (string, error) { + cmd := exec.Command("docker", args...) + out, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("error executing docker %s: %v", args, err) + } + return string(out), nil +} + +// run calls 'docker run' with the arguments provided. +func (d *docker) run(args ...string) (string, error) { + a := []string{"run", "--runtime", d.runtime, "--name", d.name, "-d"} + a = append(a, args...) + return d.do(a...) +} + +// cleanUp kills and deletes the container. +func (d *docker) cleanUp() error { + if _, err := d.do("kill", d.name); err != nil { + return fmt.Errorf("error killing container %q: %v", d.name, err) + } + if _, err := d.do("rm", d.name); err != nil { + return fmt.Errorf("error deleting container %q: %v", d.name, err) + } + return nil +} + +// findPort returns the host port that is mapped to 'sandboxPort'. This calls +// docker to allocate a free port in the host and prevent conflicts. +func (d *docker) findPort(sandboxPort int) (int, error) { + format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort) + out, err := d.do("inspect", "-f", format, d.name) + if err != nil { + return -1, fmt.Errorf("error retrieving port: %v", err) + } + port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + return -1, fmt.Errorf("error parsing port %q: %v", out, err) + } + return port, nil +} + +// waitForOutput calls 'docker logs' to retrieve containers output and searches +// for the given pattern. +func (d *docker) waitForOutput(pattern string, timeout time.Duration) error { + re := regexp.MustCompile(pattern) + for exp := time.Now().Add(timeout); time.Now().Before(exp); { + out, err := d.do("logs", d.name) + if err != nil { + return err + } + if re.MatchString(out) { + return nil + } + time.Sleep(10 * time.Millisecond) + } + return fmt.Errorf("timeout waiting for output %q", re.String()) +} + +func TestHelloWorld(t *testing.T) { + d := makeDocker("hello-test") + if out, err := d.run("hello-world"); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) + } + defer d.cleanUp() + + if err := d.waitForOutput(".*Hello from Docker!.*", 5*time.Second); err != nil { + t.Fatalf("docker didn't say hello: %v", err) + } +} + +func TestHttpd(t *testing.T) { + d := makeDocker("http-test") + + // Create temp directory to copy htdocs files. The sandbox doesn't have access + // to files in the test dir. 
+ dir, err := ioutil.TempDir("", "httpd") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + if err := os.Chmod(dir, 0777); err != nil { + t.Fatalf("os.Chmod(%q, 0777) failed: %v", dir, err) + } + src := getLocalPath("latin10k.txt") + dst := path.Join(dir, "latin10k.txt") + if err := testutil.Copy(src, dst); err != nil { + t.Fatalf("testutil.Copy(%q, %q) failed: %v", src, dst, err) + } + + // Start the container. + if out, err := d.run("-p", "80", "-v", mountArg(dir, "/usr/local/apache2/htdocs"), "httpd"); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) + } + defer d.cleanUp() + + // Find where port 80 is mapped to. + port, err := d.findPort(80) + if err != nil { + t.Fatalf("docker.findPort(80) failed: %v", err) + } + + // Wait until it's up and running. + if err := d.waitForOutput(".*'httpd -D FOREGROUND'.*", 5*time.Second); err != nil { + t.Fatalf("docker.WaitForOutput() timeout: %v", err) + } + + url := fmt.Sprintf("http://localhost:%d/not-found", port) + resp, err := http.Get(url) + if err != nil { + t.Fatalf("error reaching http server: %v", err) + } + if want := http.StatusNotFound; resp.StatusCode != want { + t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } + + url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port) + resp, err = http.Get(url) + if err != nil { + t.Fatalf("Error reaching http server: %v", err) + } + if want := http.StatusOK; resp.StatusCode != want { + t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatalf("Error reading http response: %v", err) + } + defer resp.Body.Close() + + // READALL is the last word in the file. Ensures everything was read. + if want := "READALL"; strings.HasSuffix(string(body), want) { + t.Errorf("response doesn't contain %q, resp: %q", want, body) + } +} + +func MainTest(m *testing.M) { + // Check correct docker is installed. + cmd := exec.Command("docker", "version") + out, err := cmd.CombinedOutput() + if err != nil { + log.Fatalf("Error running %q: %v", "docker version", err) + } + re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`) + matches := re.FindStringSubmatch(string(out)) + if len(matches) != 3 { + log.Fatalf("Invalid docker output: %s", out) + } + major, _ := strconv.Atoi(matches[1]) + minor, _ := strconv.Atoi(matches[2]) + if major < 17 || (major == 17 && minor < 9) { + log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor) + } + + os.Exit(m.Run()) +} diff --git a/runsc/test/image/latin10k.txt b/runsc/test/image/latin10k.txt new file mode 100644 index 000000000..61341e00b --- /dev/null +++ b/runsc/test/image/latin10k.txt @@ -0,0 +1,33 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras ut placerat felis. Maecenas urna est, auctor a efficitur sit amet, egestas et augue. Curabitur dignissim scelerisque nunc vel cursus. Ut vehicula est pretium, consectetur nunc non, pharetra ligula. Curabitur ut ultricies metus. Suspendisse pulvinar, orci sed fermentum vestibulum, eros turpis molestie lectus, nec elementum risus dolor mattis felis. Donec ultrices ipsum sem, at pretium lacus convallis at. Mauris nulla enim, tincidunt non bibendum at, vehicula pulvinar mauris. + +Duis in dapibus turpis. Pellentesque maximus magna odio, ac congue libero laoreet quis. Maecenas euismod risus in justo aliquam accumsan. Nunc quis ornare arcu, sit amet sodales elit. Phasellus nec scelerisque nisl, a tincidunt arcu. 
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 9be4407e0..25535ea37 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -18,6 +18,7 @@ package testutil import ( "encoding/json" "fmt" + "io" "io/ioutil" "os" "path/filepath" @@ -146,3 +147,21 @@ func writeSpec(dir string, spec *specs.Spec) error { func UniqueContainerID() string { return fmt.Sprintf("test-container-%d", time.Now().UnixNano()) } + +// Copy copies file from src to dst. 
+func Copy(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + _, err = io.Copy(out, in) + return err +} -- cgit v1.2.3 From 2f59ba0e2d2169cf429b73a39a920f8d615f8eca Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 20 Jun 2018 15:27:06 -0700 Subject: Include image test as part of kokoro tests PiperOrigin-RevId: 201427731 Change-Id: I5cbee383ec51c02b7892ec7812cbbdc426be8991 --- kokoro/gcp_ubuntu/run_tests.sh | 13 +++++++ runsc/test/image/image_test.go | 4 +- runsc/test/image/install.sh | 85 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100755 runsc/test/image/install.sh (limited to 'runsc') diff --git a/kokoro/gcp_ubuntu/run_tests.sh b/kokoro/gcp_ubuntu/run_tests.sh index ce458be9c..2f5e375eb 100755 --- a/kokoro/gcp_ubuntu/run_tests.sh +++ b/kokoro/gcp_ubuntu/run_tests.sh @@ -31,6 +31,10 @@ cd git/repo # Build everything. bazel build //... +# Test use this variable to determine what runtime to use. +runtime=runsc_test_$((RANDOM)) +sudo -n ./runsc/test/image/install.sh --runtime ${runtime} + # Run the tests and upload results. # # We turn off "-e" flag because we must move the log files even if the test @@ -38,6 +42,15 @@ bazel build //... set +e bazel test --test_output=errors //... exit_code=${?} + +if [[ ${exit_code} -eq 0 ]]; then + # image_test is tagged manual + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime} //runsc/test/image:image_test + exit_code=${?} +fi + +# Best effort to uninstall +sudo -n ./runsc/test/image/install.sh -u --runtime ${runtime} set -e # Find and rename all test xml and log files so that Sponge can pick them up. diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 08b1bf279..5034411e5 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -13,7 +13,9 @@ // limitations under the License. // Package image provides end-to-end image tests for runsc. These tests require -// docker and runsc to be installed on the machine. +// docker and runsc to be installed on the machine. To set it up, run: +// +// ./runsc/test/image/install.sh [--runtime ] // // The tests expect the runtime name to be provided in the RUNSC_RUNTIME // environment variable (default: runsc-test). diff --git a/runsc/test/image/install.sh b/runsc/test/image/install.sh new file mode 100755 index 000000000..94832dbe4 --- /dev/null +++ b/runsc/test/image/install.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Fail on any error +set -e + +# Defaults +declare runtime=runsc-test +declare uninstall=0 + +function findExe() { + local exe=${1} + + local path=$(find bazel-bin/runsc -type f -executable -name "${exe}" | head -n1) + if [[ "${path}" == "" ]]; then + echo "Location of ${exe} not found in bazel-bin" >&2 + exit 1 + fi + echo "${path}" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --runtime) + shift + [ "$#" -le 0 ] && echo "No runtime provided" && exit 1 + runtime=$1 + ;; + -u) + uninstall=1 + ;; + *) + echo "Unknown option: ${1}" + echo "" + echo "Usage: ${0} [--runtime ] [-u]" + echo " --runtime sets the runtime name, default: runsc-test" + echo " -u uninstall the runtime" + exit 1 + esac + shift +done + +# Find location of executables. +declare -r dockercfg=$(findExe dockercfg) +[[ "${dockercfg}" == "" ]] && exit 1 + +declare runsc=$(findExe runsc) +[[ "${runsc}" == "" ]] && exit 1 + +if [[ ${uninstall} == 0 ]]; then + rm -rf /tmp/${runtime} + mkdir -p /tmp/${runtime} + cp "${runsc}" /tmp/${runtime}/runsc + runsc=/tmp/${runtime}/runsc + + # Make tmp dir and runsc binary readable and executable to all users, since it + # will run in an empty user namespace. + chmod a+rx "${runsc}" $(dirname "${runsc}") + + # Make log dir executable and writable to all users for the same reason. + declare logdir=/tmp/"${runtime?}/logs" + mkdir -p "${logdir}" + sudo -n chmod a+wx "${logdir}" + + sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" --debug-log-dir "${logdir}" --debug --strace --log-packets + +else + sudo -n "${dockercfg}" runtime-rm "${runtime}" +fi + +echo "Restarting docker service..." +sudo -n /etc/init.d/docker restart -- cgit v1.2.3 From 95cb01e0a9517f7119e3d848728500692a4f5cba Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 20 Jun 2018 15:31:12 -0700 Subject: Reduce test sleep time PiperOrigin-RevId: 201428433 Change-Id: I72de1e46788ec84f61513416bb690956e515907e --- runsc/container/container_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 1116ca170..5128f5946 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -205,7 +205,9 @@ func TestLifecycle(t *testing.T) { // Wait on the container. var wg sync.WaitGroup wg.Add(1) + ch := make(chan struct{}) go func() { + ch <- struct{}{} ws, err := s.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) @@ -218,7 +220,8 @@ func TestLifecycle(t *testing.T) { // Wait a bit to ensure that we've started waiting on the container // before we signal. - time.Sleep(5 * time.Second) + <-ch + time.Sleep(100 * time.Millisecond) // Send the container a SIGTERM which will cause it to stop. if err := s.Signal(syscall.SIGTERM); err != nil { t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) -- cgit v1.2.3 From ef4f239c793a1a202d3249c6a8139e0602d94d94 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 21 Jun 2018 08:33:46 -0700 Subject: Fix typo in runsc gofer flag description PiperOrigin-RevId: 201529295 Change-Id: I55eb516ec6d14fbcd48593a3d61f724adc253a23 --- runsc/cmd/gofer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 39803bde5..c0b747737 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -44,7 +44,7 @@ func (*Gofer) Name() string { // Synopsis implements subcommands.Command. 
func (*Gofer) Synopsis() string { - return "launch a gofer process that server files over 9P protocol (internal use only)" + return "launch a gofer process that serves files over 9P protocol (internal use only)" } // Usage implements subcommands.Command. -- cgit v1.2.3 From 81d13fbd4d2f14b61e89faa0c9888be568f97168 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 21 Jun 2018 09:42:17 -0700 Subject: runsc: Default umask should be 0. PiperOrigin-RevId: 201539050 Change-Id: I36cbf270fa5ad25de507ecb919e4005eda6aa16d --- runsc/boot/loader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index d1a413cc7..e1194bd03 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -259,7 +259,7 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds Envv: spec.Process.Env, WorkingDirectory: spec.Process.Cwd, Credentials: creds, - Umask: uint(0022), + Umask: 0, Limits: ls, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: utsns, -- cgit v1.2.3 From 7d6149063a0bb6e563885a8f199756e7af5e69cf Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Thu, 21 Jun 2018 09:57:33 -0700 Subject: Restore implementation added to runsc. Restore creates a new container and uses the given image-path to load a saved image of a previous container. Restore command is plumbed through container and sandbox. This command does not work yet - more to come. PiperOrigin-RevId: 201541229 Change-Id: I864a14c799ce3717d99bcdaaebc764281863d06f --- runsc/cmd/create.go | 2 +- runsc/cmd/restore.go | 55 ++++++++++++++++++++++++++++++++++++--- runsc/container/container.go | 6 ++--- runsc/container/container_test.go | 22 ++++++++-------- runsc/sandbox/sandbox.go | 26 +++++++++++++++--- runsc/sandbox/sandbox_test.go | 2 +- 6 files changed, 90 insertions(+), 23 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 94a889077..5a887c73c 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -87,7 +87,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} // Create the container. A new sandbox will be created for the // container unless the metadata specifies that it should be run in an // existing container. - if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile); err != nil { + if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, ""); err != nil { Fatalf("error creating container: %v", err) } return subcommands.ExitSuccess diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index a535197a4..0589a36bf 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -15,13 +15,23 @@ package cmd import ( + "syscall" + "context" "flag" "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) // Restore implements subcommands.Command for the "restore" command. type Restore struct { + // Restore flags are a super-set of those for Create. + Create + + // imagePath is the path to the saved container image + imagePath string } // Name implements subcommands.Command.Name. @@ -36,16 +46,55 @@ func (*Restore) Synopsis() string { // Usage implements subcommands.Command.Usage. func (*Restore) Usage() string { - return `restore [flags] - restore last saved state of container. + return `restore [flags] - restore saved state of container. 
` } // SetFlags implements subcommands.Command.SetFlags. func (r *Restore) SetFlags(f *flag.FlagSet) { + r.Create.SetFlags(f) + f.StringVar(&r.imagePath, "image-path", "", "path to saved container image") } // Execute implements subcommands.Command.Execute. func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - Fatalf("restore not implemented") - return subcommands.ExitFailure + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + bundleDir := r.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + specutils.LogSpec(spec) + + if r.imagePath == "" { + Fatalf("image-path flag must be provided") + } + + cont, err := container.Create(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.imagePath) + if err != nil { + Fatalf("error restoring container: %v", err) + } + + if err := cont.Start(conf); err != nil { + Fatalf("error starting container: %v", err) + } + + ws, err := cont.Wait() + if err != nil { + Fatalf("error running container: %v", err) + } + *waitStatus = ws + + return subcommands.ExitSuccess } diff --git a/runsc/container/container.go b/runsc/container/container.go index 3b7f95af9..604708e2c 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -188,7 +188,7 @@ func List(rootDir string) ([]string, error) { // Create creates the container in a new Sandbox process, unless the metadata // indicates that an existing Sandbox should be used. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (*Container, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, restoreFile string) (*Container, error) { log.Debugf("Create container %q in root dir: %s", id, conf.RootDir) if err := validateID(id); err != nil { return nil, err @@ -222,7 +222,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo log.Debugf("Creating new sandbox for container %q", id) // Start a new sandbox for this container. Any errors after this point // must destroy the container. - s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) + s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, restoreFile) if err != nil { c.Destroy() return nil, err @@ -313,7 +313,7 @@ func (c *Container) Start(conf *boot.Config) error { // Run is a helper that calls Create + Start + Wait. func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (syscall.WaitStatus, error) { log.Debugf("Run container %q in root dir: %s", id, conf.RootDir) - c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile) + c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, "") if err != nil { return 0, fmt.Errorf("error creating container: %v", err) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 5128f5946..a8320f614 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -117,7 +117,7 @@ func run(spec *specs.Spec) error { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. 
- s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { return fmt.Errorf("error creating container: %v", err) } @@ -162,7 +162,7 @@ func TestLifecycle(t *testing.T) { } // Create the container. id := testutil.UniqueContainerID() - if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { + if _, err := container.Create(id, spec, conf, bundleDir, "", "", ""); err != nil { t.Fatalf("error creating container: %v", err) } @@ -360,7 +360,7 @@ func TestExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -445,7 +445,7 @@ func TestCheckpoint(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -497,7 +497,7 @@ func TestPauseResume(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -599,7 +599,7 @@ func TestPauseResumeStatus(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -666,7 +666,7 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -765,7 +765,7 @@ func TestConsoleSocket(t *testing.T) { // Create the container and pass the socket name. id := testutil.UniqueContainerID() - s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") + s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -828,7 +828,7 @@ func TestSpecUnsupported(t *testing.T) { defer os.RemoveAll(bundleDir) id := testutil.UniqueContainerID() - _, err = container.Create(id, spec, conf, bundleDir, "", "") + _, err = container.Create(id, spec, conf, bundleDir, "", "", "") if err == nil || !strings.Contains(err.Error(), "is not supported") { t.Errorf("container.Create() wrong error, got: %v, want: *is not supported, spec.Process: %+v", err, spec.Process) } @@ -917,7 +917,7 @@ func TestAbbreviatedIDs(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. 
- cont, err := container.Create(cid, spec, conf, bundleDir, "", "") + cont, err := container.Create(cid, spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -987,7 +987,7 @@ func TestMultiContainerSanity(t *testing.T) { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(bundleDir) - cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 652910efa..870a0ccd3 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -54,7 +54,9 @@ type Sandbox struct { } // Create creates the sandbox process. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, error) { +// +// If restoreFile is not empty, the sandbox will be restored from file. +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, restoreFile string) (*Sandbox, error) { s := &Sandbox{ID: id} binPath, err := specutils.BinPath() @@ -69,7 +71,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } // Create the sandbox process. - if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles); err != nil { + if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles, restoreFile); err != nil { return nil, err } @@ -251,7 +253,7 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File, restoreFile string) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -273,12 +275,27 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund "--bundle", bundleDir, "--controller-fd="+strconv.Itoa(nextFD), fmt.Sprintf("--console=%t", consoleEnabled)) - nextFD++ controllerFile := os.NewFile(uintptr(fd), "control_server_socket") defer controllerFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + // If a restore filename was given, open the file and append its FD to Args + // and the file to ExtraFiles. + if restoreFile != "" { + // Create the image file and open for reading. + rF, err := os.Open(restoreFile) + if err != nil { + return fmt.Errorf("os.Open(%q) failed: %v", restoreFile, err) + } + defer rF.Close() + + nextFD++ + cmd.Args = append(cmd.Args, "--restore-fd="+strconv.Itoa(nextFD)) + cmd.ExtraFiles = append(cmd.ExtraFiles, rF) + } + nextFD++ + // If there is a gofer, sends all socket ends to the sandbox. 
for _, f := range ioFiles { defer f.Close() @@ -379,6 +396,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } s.Pid = cmd.Process.Pid log.Infof("Sandbox started, pid: %d", s.Pid) + return nil } diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index fee2de283..9db90ef07 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -39,7 +39,7 @@ func TestGoferExits(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "") + s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } -- cgit v1.2.3 From f2a687001ded18a4343c1aa3bfba18b08c6a816a Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Thu, 21 Jun 2018 10:17:19 -0700 Subject: Added functionality to create a RestoreEnvironment. Before a container can be restored, the mounts must be configured. The root and submounts and their key information is compiled into a RestoreEnvironment. Future code will be added to set this created environment before restoring a container. Tests to ensure the correct environment were added. PiperOrigin-RevId: 201544637 Change-Id: Ia894a8b0f80f31104d1c732e113b1d65a4697087 --- runsc/boot/BUILD | 1 + runsc/boot/fs.go | 91 ++++++++++++++++++-- runsc/boot/loader_test.go | 206 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 289 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 924cc2b90..e96722069 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -86,6 +86,7 @@ go_test( "//pkg/control/server", "//pkg/log", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 7ebf22de8..7731763de 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -220,12 +220,13 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) } -func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { - // Map mount type to filesystem name, and parse out the options that we are - // capable of dealing with. - var data []string +// getMountNameAndOptions retrieves the fsName, data, and useOverlay values +// used for mounts. +func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) { var fsName string + var data []string var useOverlay bool + var err error switch m.Type { case "devpts", "devtmpfs", "proc", "sysfs": fsName = m.Type @@ -235,11 +236,8 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. fsName = m.Type // tmpfs has some extra supported options that we must pass through. - var err error data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - if err != nil { - return err - } + case "bind": switch conf.FileAccess { case FileAccessProxy: @@ -250,7 +248,7 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. 
fsName = "whitelistfs" data = []string{"root=" + m.Source, "dont_translate_ownership=true"} default: - return fmt.Errorf("invalid file access type: %v", conf.FileAccess) + err = fmt.Errorf("invalid file access type: %v", conf.FileAccess) } // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly @@ -261,6 +259,20 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. // them, so this is a warning for now. // we do not support. log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, data, useOverlay, err +} + +func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, data, useOverlay, err := getMountNameAndOptions(conf, m, fds) + + // Return the error or nil that corresponds to the default case in getMountNameAndOptions. + if err != nil { + return err + } + if fsName == "" { return nil } @@ -388,6 +400,67 @@ func destinations(mounts []specs.Mount, extra ...string) []string { return append(ds, extra...) } +// mountDevice returns a device string based on the fs type and target +// of the mount. +func mountDevice(m specs.Mount) string { + if m.Type == "bind" { + // Make a device string that includes the target, which is consistent across + // S/R and uniquely identifies the connection. + return "p9fs-" + m.Destination + } + // All other fs types use device "none". + return "none" +} + +// addRestoreMount adds a mount to the MountSources map used for restoring a +// checkpointed container. +func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { + fsName, data, _, err := getMountNameAndOptions(conf, m, fds) + dataString := strings.Join(data, ",") + if err != nil { + return err + } + renv.MountSources[fsName] = append(renv.MountSources[fsName], fs.MountArgs{ + Dev: mountDevice(m), + Flags: mountFlags(m.Options), + Data: dataString, + }) + return nil +} + +// createRestoreEnviroment builds a fs.RestoreEnvironment called renv by adding the mounts +// to the environment. +func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) { + if conf.FileAccess == FileAccessDirect { + return nil, fmt.Errorf("host filesystem with whitelist not supported with S/R") + } + renv := &fs.RestoreEnvironment{ + MountSources: make(map[string][]fs.MountArgs), + } + + // Add root mount. 
+ fd := fds.remove() + dataString := strings.Join([]string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}, ",") + mf := fs.MountSourceFlags{} + if spec.Root.Readonly { + mf.ReadOnly = true + } + const rootFSName = "9p" + renv.MountSources[rootFSName] = append(renv.MountSources[rootFSName], fs.MountArgs{ + Dev: "p9fs-/", + Flags: mf, + Data: dataString, + }) + + // Add submounts + for _, m := range spec.Mounts { + if err := addRestoreMount(conf, renv, m, fds); err != nil { + return nil, err + } + } + return renv, nil +} + func mountFlags(opts []string) fs.MountSourceFlags { mf := fs.MountSourceFlags{} for _, o := range opts { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index dab7ad0c5..5ec1084db 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -19,6 +19,7 @@ import ( "io/ioutil" "math/rand" "os" + "reflect" "sync" "testing" "time" @@ -27,6 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ) func init() { @@ -319,3 +321,207 @@ func TestCreateMountNamespace(t *testing.T) { } } } + +// TestRestoreEnvironment tests that the correct mounts are collected from the spec and config +// in order to build the environment for restoring. +func TestRestoreEnvironment(t *testing.T) { + testCases := []struct { + name string + spec *specs.Spec + conf *Config + ioFDs []int + errorExpected bool + expectedRenv fs.RestoreEnvironment + }{ + { + name: "basic spec test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/some/very/very/deep/path", + Type: "tmpfs", + }, + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + conf: &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessProxy, + DisableSeccomp: true, + }, + ioFDs: []int{0}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "p9fs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + }, + }, + "tmpfs": { + { + Dev: "none", + }, + { + Dev: "none", + }, + }, + }, + }, + }, + { + name: "bind type test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "bind", + }, + }, + }, + conf: &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessProxy, + DisableSeccomp: true, + }, + ioFDs: []int{0, 1}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "p9fs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + }, + { + Dev: "p9fs-/dev/fd-foo", + Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true", + }, + }, + }, + }, + }, + { + name: "options test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "tmpfs", + Options: []string{"uid=1022", "noatime"}, + }, + }, + }, + conf: &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessProxy, + DisableSeccomp: true, + }, + ioFDs: []int{0}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + 
MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "p9fs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + }, + }, + "tmpfs": { + { + Dev: "none", + Flags: fs.MountSourceFlags{NoAtime: true}, + Data: "uid=1022", + }, + }, + }, + }, + }, + { + name: "whitelist error test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "bind", + }, + }, + }, + conf: &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessDirect, + DisableSeccomp: true, + }, + ioFDs: []int{0, 1}, + errorExpected: true, + }, + { + name: "bad options test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "tmpfs", + Options: []string{"invalid_option=true"}, + }, + }, + }, + conf: &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessDirect, + DisableSeccomp: true, + }, + ioFDs: []int{0}, + errorExpected: true, + }, + } + + for _, tc := range testCases { + fds := &fdDispenser{fds: tc.ioFDs} + + actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) + if !tc.errorExpected && err != nil { + t.Fatalf("could not create restore environment for test:%s", tc.name) + } else if tc.errorExpected { + if err == nil { + t.Fatalf("expected an error, but no error occurred.") + } + } else { + if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { + t.Fatalf("restore environments did not match for test:%s", tc.name) + } + } + } +} -- cgit v1.2.3 From f6be5fe6193163ad46722bc36209572da4a15ad0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 21 Jun 2018 13:21:25 -0700 Subject: Forward SIGUSR2 to the sandbox too SIGUSR2 was being masked out to be used as a way to dump sentry stacks. This could cause compatibility problems in cases anyone uses SIGUSR2 to communicate with the container init process. PiperOrigin-RevId: 201575374 Change-Id: I312246e828f38ad059139bb45b8addc2ed055d74 --- pkg/sentry/sighandling/sighandling.go | 10 ++-------- runsc/boot/loader.go | 5 ++--- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 0c3a14da5..ef6f7f617 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -95,7 +95,7 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha // PrepareForwarding ensures that synchronous signals are forwarded to k and // returns a callback that starts signal delivery, which itself returns a // callback that stops signal forwarding. -func PrepareForwarding(k *kernel.Kernel) func() func() { +func PrepareForwarding(k *kernel.Kernel, enablePanicSignal bool) func() func() { start := make(chan struct{}) stop := make(chan struct{}) @@ -112,7 +112,7 @@ func PrepareForwarding(k *kernel.Kernel) func() func() { sigchans = append(sigchans, sigchan) // SignalPanic is handled by Run. - if linux.Signal(sig) == kernel.SignalPanic { + if enablePanicSignal && linux.Signal(sig) == kernel.SignalPanic { continue } @@ -128,9 +128,3 @@ func PrepareForwarding(k *kernel.Kernel) func() func() { } } } - -// StartForwarding ensures that synchronous signals are forwarded to k and -// returns a callback that stops signal forwarding. 
-func StartForwarding(k *kernel.Kernel) func() { - return PrepareForwarding(k)() -} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index e1194bd03..a0a28dc43 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -215,9 +215,8 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in if err := sighandling.IgnoreChildStop(); err != nil { return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) } - // Ensure that most signals received in sentry context are forwarded to - // the emulated kernel. - stopSignalForwarding := sighandling.StartForwarding(k) + // Ensure that signals received are forwarded to the emulated kernel. + stopSignalForwarding := sighandling.PrepareForwarding(k, false)() procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) if err != nil { -- cgit v1.2.3 From e1aee51d09d650cca8d098050665c2d49d859e26 Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Fri, 22 Jun 2018 09:40:21 -0700 Subject: Modified Checkpoint/Restore flags to improve compatibility with Docker. Added a number of unimplemented flags required for using runsc's Checkpoint and Restore with Docker. Modified the "image-path" flag to require a directory instead of a file. PiperOrigin-RevId: 201697486 Change-Id: I55883df2f1bbc3ec3c395e0ca160ce189e5e7eba --- runsc/cmd/checkpoint.go | 17 +++++++++++++++-- runsc/cmd/restore.go | 17 +++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 927027c2b..e5fc7bdc4 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -16,6 +16,7 @@ package cmd import ( "os" + "path/filepath" "context" "flag" @@ -24,6 +25,9 @@ import ( "gvisor.googlesource.com/gvisor/runsc/container" ) +// File containing the container's saved image/state within the given image-path's directory. +const checkpointFileName = "checkpoint.img" + // Checkpoint implements subcommands.Command for the "checkpoint" command. type Checkpoint struct { imagePath string @@ -48,6 +52,13 @@ func (*Checkpoint) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (c *Checkpoint) SetFlags(f *flag.FlagSet) { f.StringVar(&c.imagePath, "image-path", "", "path to saved container image") + + // Unimplemented flags necessary for compatibility with docker. + var wp string + f.StringVar(&wp, "work-path", "", "ignored") + + var lr bool + f.BoolVar(&lr, "leave-running", false, "ignored") } // Execute implements subcommands.Command.Execute. @@ -70,10 +81,12 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("image-path flag must be provided") } + fullImagePath := filepath.Join(c.imagePath, checkpointFileName) + // Create the image file and open for writing. - file, err := os.OpenFile(c.imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + file, err := os.OpenFile(fullImagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) if err != nil { - Fatalf("os.OpenFile(%q) failed: %v", c.imagePath, err) + Fatalf("os.OpenFile(%q) failed: %v", fullImagePath, err) } defer file.Close() diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 0589a36bf..cc55beeaf 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -15,6 +15,7 @@ package cmd import ( + "path/filepath" "syscall" "context" @@ -53,7 +54,17 @@ func (*Restore) Usage() string { // SetFlags implements subcommands.Command.SetFlags. 
func (r *Restore) SetFlags(f *flag.FlagSet) { r.Create.SetFlags(f) - f.StringVar(&r.imagePath, "image-path", "", "path to saved container image") + f.StringVar(&r.imagePath, "image-path", "", "directory path to saved container image") + + // Unimplemented flags necessary for compatibility with docker. + var d bool + f.BoolVar(&d, "detach", false, "ignored") + + var nsr bool + f.BoolVar(&nsr, "no-subreaper", false, "ignored") + + var wp string + f.StringVar(&wp, "work-path", "", "ignored") } // Execute implements subcommands.Command.Execute. @@ -81,7 +92,9 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ Fatalf("image-path flag must be provided") } - cont, err := container.Create(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.imagePath) + restoreFile := filepath.Join(r.imagePath, checkpointFileName) + + cont, err := container.Create(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, restoreFile) if err != nil { Fatalf("error restoring container: %v", err) } -- cgit v1.2.3 From 04bdcc7b65ac03eeca9b14608a12067e1205081b Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 22 Jun 2018 14:30:33 -0700 Subject: runsc: Enable waiting on individual containers within a sandbox. PiperOrigin-RevId: 201742160 Change-Id: Ia9fa1442287c5f9e1196fb117c41536a80f6bb31 --- runsc/boot/controller.go | 26 +++++++----- runsc/boot/loader.go | 67 ++++++++++++++++++++++++++--- runsc/boot/loader_test.go | 3 +- runsc/container/container.go | 2 +- runsc/container/container_test.go | 89 +++++++++++++++++++++++++++++++++++++++ runsc/sandbox/sandbox.go | 5 ++- 6 files changed, 172 insertions(+), 20 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ec24c4dad..56829c605 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -145,10 +145,11 @@ type containerManager struct { } // StartRoot will start the root container process. -func (cm *containerManager) StartRoot(_, _ *struct{}) error { +func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { log.Debugf("containerManager.StartRoot") // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} + cm.l.setRootContainerID(*cid) return <-cm.startResultChan } @@ -166,6 +167,9 @@ type StartArgs struct { // TODO: Separate sandbox and container configs. // Config is the runsc-specific configuration for the sandbox. Conf *Config + + // CID is the ID of the container to start. + CID string } // Start runs a created container within a sandbox. @@ -182,8 +186,16 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.Conf == nil { return errors.New("start arguments missing config") } + if args.CID == "" { + return errors.New("start argument missing container ID") + } + + tgid, err := cm.l.startContainer(args, cm.k) + if err != nil { + return err + } + log.Debugf("Container %q started with root PID of %d", args.CID, tgid) - cm.l.startContainer(args, cm.k) return nil } @@ -222,15 +234,7 @@ func (cm *containerManager) Resume(_, _ *struct{}) error { // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { log.Debugf("containerManager.Wait") - // TODO: Use the cid and wait on the init process in that - // container. Currently we just wait on PID 1 in the sandbox. 
- tg := cm.k.TaskSet().Root.ThreadGroupWithID(1) - if tg == nil { - return fmt.Errorf("cannot wait: no thread group with id 1") - } - tg.WaitExited() - *waitStatus = tg.ExitStatus().Status() - return nil + return cm.l.wait(cid, waitStatus) } // SignalArgs are arguments to the Signal method. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a0a28dc43..7097f220b 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -16,10 +16,12 @@ package boot import ( + "errors" "fmt" "math/rand" "os" "runtime" + "sync" "sync/atomic" "syscall" gtime "time" @@ -81,6 +83,16 @@ type Loader struct { // rootProcArgs refers to the root sandbox init task. rootProcArgs kernel.CreateProcessArgs + + // mu guards containerRootTGIDs. + mu sync.Mutex + + // containerRootTGIDs maps container IDs to their root processes. It + // can be used to determine which process to manipulate when clients + // call methods on particular containers. + // + // containerRootTGIDs is guarded by mu. + containerRootTGIDs map[string]kernel.ThreadID } func init() { @@ -377,12 +389,14 @@ func (l *Loader) run() error { return l.k.Start() } -func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) error { +// startContainer starts a child container. It returns the thread group ID of +// the newly created process. +func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.ThreadID, error) { spec := args.Spec // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { - return fmt.Errorf("error creating capabilities: %v", err) + return 0, fmt.Errorf("error creating capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. @@ -416,19 +430,62 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) error { k.RootIPCNamespace(), k) if err != nil { - return fmt.Errorf("failed to create new process: %v", err) + return 0, fmt.Errorf("failed to create new process: %v", err) + } + + tg, err := l.k.CreateProcess(procArgs) + if err != nil { + return 0, fmt.Errorf("failed to create process in sentry: %v", err) } - if _, err := l.k.CreateProcess(procArgs); err != nil { - return fmt.Errorf("failed to create process in sentry: %v", err) + ts := k.TaskSet() + tgid := ts.Root.IDOfThreadGroup(tg) + if tgid == 0 { + return 0, errors.New("failed to get thread group ID of new process") } // CreateProcess takes a reference on FDMap if successful. procArgs.FDMap.DecRef() + l.mu.Lock() + defer l.mu.Unlock() + l.containerRootTGIDs[args.CID] = tgid + + return tgid, nil +} + +// wait waits for the init process in the given container. +func (l *Loader) wait(cid *string, waitStatus *uint32) error { + l.mu.Lock() + defer l.mu.Unlock() + tgid, ok := l.containerRootTGIDs[*cid] + if !ok { + return fmt.Errorf("can't find process for container %q in %v", *cid, l.containerRootTGIDs) + } + + // TODO: Containers don't map 1:1 with their root + // processes. Container exits should be managed explicitly + // rather than via PID. + // If the thread either has already exited or exits during waiting, + // consider the container exited. + defer delete(l.containerRootTGIDs, *cid) + + tg := l.k.TaskSet().Root.ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("no thread group with ID %d", tgid) + } + tg.WaitExited() + *waitStatus = tg.ExitStatus().Status() return nil } +func (l *Loader) setRootContainerID(cid string) { + l.mu.Lock() + defer l.mu.Unlock() + // The root container has PID 1. 
+ l.containerRootTGIDs = map[string]kernel.ThreadID{cid: 1} +} + // WaitForStartSignal waits for a start signal from the control server. func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 5ec1084db..15ced0601 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -133,7 +133,8 @@ func TestStartSignal(t *testing.T) { } // Trigger the control server StartRoot method. - if err := s.ctrl.manager.StartRoot(nil, nil); err != nil { + cid := "foo" + if err := s.ctrl.manager.StartRoot(&cid, nil); err != nil { t.Errorf("error calling StartRoot: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 604708e2c..9c0169ca8 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -293,7 +293,7 @@ func (c *Container) Start(conf *boot.Config) error { return err } } else { - if err := c.Sandbox.Start(c.Spec, conf); err != nil { + if err := c.Sandbox.Start(c.Spec, conf, c.ID); err != nil { c.Destroy() return err } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index a8320f614..de487ea97 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1020,3 +1020,92 @@ func TestMultiContainerSanity(t *testing.T) { t.Errorf("failed to wait for sleep to start: %v", err) } } + +func TestMultiContainerWait(t *testing.T) { + containerIDs := []string{ + testutil.UniqueContainerID(), + testutil.UniqueContainerID(), + } + containerAnnotations := []map[string]string{ + // The first container creates a sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + }, + // The second container creates a container within the first + // container's sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: containerIDs[0], + }, + } + args := [][]string{ + // The first container should run the entire duration of the + // test. + {"sleep", "100"}, + // We'll wait on the second container, which is much shorter + // lived. + {"sleep", "1"}, + } + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Setup the containers. + containers := make([]*container.Container, 0, len(containerIDs)) + for i, annotations := range containerAnnotations { + spec := testutil.NewSpecWithArgs(args[i][0], args[i][1]) + spec.Annotations = annotations + bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: 0, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Check via ps that multiple processes are running. + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Wait on the short lived container. 
+ if ws, err := containers[1].Wait(); err != nil { + t.Fatalf("failed to wait for process %q: %v", strings.Join(containers[1].Spec.Process.Args, " "), err) + } else if es := ws.ExitStatus(); es != 0 { + t.Fatalf("process %q exited with non-zero status %d", strings.Join(containers[1].Spec.Process.Args, " "), es) + } + + // After Wait returns, ensure that the root container is running and + // the child has finished. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) + } +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 870a0ccd3..ed2c40e57 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -99,7 +99,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { // Send a message to the sandbox control server to start the root // container. - if err := conn.Call(boot.RootContainerStart, nil, nil); err != nil { + if err := conn.Call(boot.RootContainerStart, &s.ID, nil); err != nil { return fmt.Errorf("error starting root container %v: %v", spec.Process.Args, err) } @@ -107,7 +107,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { } // Start starts running a non-root container inside the sandbox. -func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config) error { +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) conn, err := s.connect() if err != nil { @@ -118,6 +118,7 @@ func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config) error { args := boot.StartArgs{ Spec: spec, Conf: conf, + CID: cid, } if err := conn.Call(boot.ContainerStart, args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) -- cgit v1.2.3 From cecc1e472cc893738a070be00d303dd888b9f325 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 25 Jun 2018 10:40:24 -0700 Subject: Fix lint errors PiperOrigin-RevId: 201978212 Change-Id: Ie3df1fd41d5293fff66b546a0c68c3bf98126067 --- runsc/boot/config.go | 4 ++-- runsc/boot/fs.go | 2 +- runsc/cmd/capability.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index c13ac150d..3fca16cce 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -24,10 +24,10 @@ import ( type PlatformType int const ( - // Ptrace runs the sandbox with the ptrace platform. + // PlatformPtrace runs the sandbox with the ptrace platform. PlatformPtrace PlatformType = iota - // KVM runs the sandbox with the KVM platform. + // PlatformKVM runs the sandbox with the KVM platform. PlatformKVM ) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 7731763de..e0d7fc769 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -428,7 +428,7 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f return nil } -// createRestoreEnviroment builds a fs.RestoreEnvironment called renv by adding the mounts +// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts // to the environment. 
func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) { if conf.FileAccess == FileAccessDirect { diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go index 0209feb1b..e2410d4ad 100644 --- a/runsc/cmd/capability.go +++ b/runsc/cmd/capability.go @@ -78,7 +78,7 @@ func trimCaps(names []string, setter capability.Capabilities) ([]capability.Cap, var caps []capability.Cap for _, c := range wantedCaps { // Capability rules are more complicated than this, but this catches most - // problems with tests running with non-priviledged user. + // problems with tests running with non-privileged user. if setter.Get(capability.PERMITTED, c) { caps = append(caps, c) } else { -- cgit v1.2.3 From e8ae2b85e90fc27e74de032698224e7972673cec Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 25 Jun 2018 12:07:50 -0700 Subject: runsc: add a `multi-container` flag to enable multi-container support. PiperOrigin-RevId: 201995800 Change-Id: I770190d135e14ec7da4b3155009fe10121b2a502 --- runsc/boot/config.go | 5 +++++ runsc/container/container.go | 4 ++-- runsc/main.go | 34 ++++++++++++++++++---------------- runsc/test/testutil/testutil.go | 13 +++++++------ 4 files changed, 32 insertions(+), 24 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 3fca16cce..24be82906 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -176,6 +176,10 @@ type Config struct { // DisableSeccomp indicates whether seccomp syscall filters should be // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool + + // MultiContainer enables multiple containers support inside one sandbox. + // TODO: Remove this when multiple container is fully supported. + MultiContainer bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -188,6 +192,7 @@ func (c *Config) ToFlags() []string { "--debug-log-dir=" + c.DebugLogDir, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), + "--multi-container=" + strconv.FormatBool(c.MultiContainer), "--network=" + c.Network.String(), "--log-packets=" + strconv.FormatBool(c.LogPackets), "--platform=" + c.Platform.String(), diff --git a/runsc/container/container.go b/runsc/container/container.go index 9c0169ca8..428aa5c62 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -218,7 +218,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // started in an existing sandbox, we must do so. The metadata will // indicate the ID of the sandbox, which is the same as the ID of the // init container in the sandbox. - if specutils.ShouldCreateSandbox(spec) { + if specutils.ShouldCreateSandbox(spec) || !conf.MultiContainer { log.Debugf("Creating new sandbox for container %q", id) // Start a new sandbox for this container. Any errors after this point // must destroy the container. @@ -287,7 +287,7 @@ func (c *Container) Start(conf *boot.Config) error { } } - if specutils.ShouldCreateSandbox(c.Spec) { + if specutils.ShouldCreateSandbox(c.Spec) || !conf.MultiContainer { if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil { c.Destroy() return err diff --git a/runsc/main.go b/runsc/main.go index cd906e191..aa5796d42 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -55,10 +55,11 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. 
- platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. Using a proxy is more secure because it disallows the sandbox from opennig files directly in the host.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. Using a proxy is more secure because it disallows the sandbox from opennig files directly in the host.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") ) var gitRevision = "" @@ -111,18 +112,19 @@ func main() { // Create a new Config from the flags. conf := &boot.Config{ - RootDir: *rootDir, - Debug: *debug, - LogFilename: *logFilename, - LogFormat: *logFormat, - DebugLogDir: *debugLogDir, - FileAccess: fsAccess, - Overlay: *overlay, - Network: netType, - LogPackets: *logPackets, - Platform: platformType, - Strace: *strace, - StraceLogSize: *straceLogSize, + RootDir: *rootDir, + Debug: *debug, + LogFilename: *logFilename, + LogFormat: *logFormat, + DebugLogDir: *debugLogDir, + FileAccess: fsAccess, + Overlay: *overlay, + Network: netType, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + MultiContainer: *multiContainer, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 25535ea37..9d70d29f2 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -118,12 +118,13 @@ func SetupContainerInRoot(rootDir string, spec *specs.Spec) (bundleDir string, c } conf = &boot.Config{ - Debug: true, - LogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - RootDir: rootDir, - Strace: true, + Debug: true, + LogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + RootDir: rootDir, + Strace: true, + MultiContainer: true, } return bundleDir, conf, nil -- cgit v1.2.3 From 000fd8d1e4530b4063eead26dda4843ff0d71cbd Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Tue, 26 Jun 2018 13:39:07 -0700 Subject: runsc: set gofer umask to 0. 
PiperOrigin-RevId: 202185642 Change-Id: I2eefcc0b2ffadc6ef21d177a8a4ab0cda91f3399 --- runsc/boot/loader.go | 10 ++++------ runsc/cmd/boot.go | 7 +++++++ runsc/cmd/gofer.go | 5 +++++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 7097f220b..69b982ff8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -23,7 +23,6 @@ import ( "runtime" "sync" "sync/atomic" - "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -270,7 +269,7 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds Envv: spec.Process.Env, WorkingDirectory: spec.Process.Cwd, Credentials: creds, - Umask: 0, + Umask: 0022, Limits: ls, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: utsns, @@ -296,10 +295,9 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds // Use root user to configure mounts. The current user might not have // permission to do so. rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - // The sentry should run with a umask of 0. - Umask: uint(syscall.Umask(0)), + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, } rootCtx := rootProcArgs.NewContext(k) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 86f597c09..0d0e6b63f 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -107,6 +107,13 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) + // sentry should run with a umask of 0 when --file-access=direct, because we want + // to preserve file modes exactly as set by the sentry, which will have applied + // its own umask. + if conf.FileAccess == boot.FileAccessDirect { + syscall.Umask(0) + } + if b.applyCaps { caps := spec.Process.Capabilities if conf.Platform == boot.PlatformPtrace { diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index c0b747737..8e1060a35 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -17,6 +17,7 @@ package cmd import ( "os" "sync" + "syscall" "context" "flag" @@ -66,6 +67,10 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitUsageError } + // fsgofer should run with a umask of 0, because we want to preserve file + // modes exactly as sent by the sandbox, which will have applied its own umask. 
+ syscall.Umask(0) + spec, err := specutils.ReadSpec(g.bundleDir) if err != nil { Fatalf("error reading spec: %v", err) -- cgit v1.2.3 From c186e408cc61cbefd6d72c2ff3e9d629572570db Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 26 Jun 2018 19:04:51 -0700 Subject: Add KVM, overlay and host network to image tests PiperOrigin-RevId: 202236006 Change-Id: I4ea964a70fc49e8b51c9da27d77301c4eadaae71 --- kokoro/gcp_ubuntu/run_tests.sh | 3 +++ pkg/sentry/fs/overlay.go | 2 +- runsc/test/image/install.sh | 9 ++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/kokoro/gcp_ubuntu/run_tests.sh b/kokoro/gcp_ubuntu/run_tests.sh index 2f5e375eb..5554350da 100755 --- a/kokoro/gcp_ubuntu/run_tests.sh +++ b/kokoro/gcp_ubuntu/run_tests.sh @@ -46,6 +46,9 @@ exit_code=${?} if [[ ${exit_code} -eq 0 ]]; then # image_test is tagged manual bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime} //runsc/test/image:image_test + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-kvm //runsc/test/image:image_test + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-nethost //runsc/test/image:image_test + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-overlay //runsc/test/image:image_test exit_code=${?} fi diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 90d21642e..a63f00e0e 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -113,7 +113,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount // - lower must not require that file objects be revalidated. // - lower must not have dynamic file/directory content. func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) { - if IsRegular(lower.StableAttr) { + if !IsRegular(lower.StableAttr) { return nil, fmt.Errorf("lower Inode is not a regular file") } msrc := newOverlayMountSource(upperMS, lower.MountSource, flags) diff --git a/runsc/test/image/install.sh b/runsc/test/image/install.sh index 94832dbe4..c110d96f9 100755 --- a/runsc/test/image/install.sh +++ b/runsc/test/image/install.sh @@ -75,10 +75,17 @@ if [[ ${uninstall} == 0 ]]; then mkdir -p "${logdir}" sudo -n chmod a+wx "${logdir}" - sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" --debug-log-dir "${logdir}" --debug --strace --log-packets + declare -r args="--debug-log-dir "${logdir}" --debug --strace --log-packets" + sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-kvm "${runsc}" --platform=kvm ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-hostnet "${runsc}" --network=host ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-overlay "${runsc}" --overlay ${args} else sudo -n "${dockercfg}" runtime-rm "${runtime}" + sudo -n "${dockercfg}" runtime-rm "${runtime}"-kvm + sudo -n "${dockercfg}" runtime-rm "${runtime}"-hostnet + sudo -n "${dockercfg}" runtime-rm "${runtime}"-overlay fi echo "Restarting docker service..." -- cgit v1.2.3 From f93043615f31214193c9079d38498028331c72ca Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Wed, 27 Jun 2018 13:31:46 -0700 Subject: Added MkdirAll capabilities for Checkpoint's image-path. Now able to save the state file (checkpoint.img) at an image-path that had previously not existed. 
This is important because there can only be one checkpoint.img file per directory so this will enable users to create as many directories as needed for proper organization. PiperOrigin-RevId: 202360414 Change-Id: If5dd2b72e08ab52834a2b605571186d107b64526 --- runsc/cmd/checkpoint.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'runsc') diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index e5fc7bdc4..a28eb0f02 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -81,6 +81,10 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("image-path flag must be provided") } + if err := os.MkdirAll(c.imagePath, 0755); err != nil { + Fatalf("error making directories at path provided: %v", err) + } + fullImagePath := filepath.Join(c.imagePath, checkpointFileName) // Create the image file and open for writing. -- cgit v1.2.3 From 1f207de315430fb178b7025a5afd419afdc31449 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 28 Jun 2018 09:45:52 -0700 Subject: Add option to configure watchdog action PiperOrigin-RevId: 202494747 Change-Id: I4d4a18e71468690b785060e580a5f83c616bd90f --- runsc/boot/config.go | 17 +++++++++++++++++ runsc/boot/loader.go | 2 +- runsc/main.go | 7 +++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 24be82906..074cd6a63 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -18,6 +18,8 @@ import ( "fmt" "strconv" "strings" + + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" ) // PlatformType tells which platform to use. @@ -130,6 +132,18 @@ func (n NetworkType) String() string { } } +// MakeWatchdogAction converts type from string. +func MakeWatchdogAction(s string) (watchdog.Action, error) { + switch strings.ToLower(s) { + case "log", "logwarning": + return watchdog.LogWarning, nil + case "panic": + return watchdog.Panic, nil + default: + return 0, fmt.Errorf("invalid watchdog action %q", s) + } +} + // Config holds configuration that is not part of the runtime spec. type Config struct { // RootDir is the runtime root directory. @@ -180,6 +194,8 @@ type Config struct { // MultiContainer enables multiple containers support inside one sandbox. // TODO: Remove this when multiple container is fully supported. MultiContainer bool + + WatchdogAction watchdog.Action } // ToFlags returns a slice of flags that correspond to the given Config. @@ -199,5 +215,6 @@ func (c *Config) ToFlags() []string { "--strace=" + strconv.FormatBool(c.Strace), "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), + "--watchdog-action=" + c.WatchdogAction.String(), } } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 69b982ff8..da95fa0e7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -205,7 +205,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in } // Create a watchdog. - watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning) + watchdog := watchdog.New(k, watchdog.DefaultTimeout, conf.WatchdogAction) // Create the control server using the provided FD. // diff --git a/runsc/main.go b/runsc/main.go index aa5796d42..563ef8c67 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -60,6 +60,7 @@ var ( fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. 
Using a proxy is more secure because it disallows the sandbox from opennig files directly in the host.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") ) var gitRevision = "" @@ -110,6 +111,11 @@ func main() { cmd.Fatalf("%v", err) } + wa, err := boot.MakeWatchdogAction(*watchdogAction) + if err != nil { + cmd.Fatalf("%v", err) + } + // Create a new Config from the flags. conf := &boot.Config{ RootDir: *rootDir, @@ -125,6 +131,7 @@ func main() { Strace: *strace, StraceLogSize: *straceLogSize, MultiContainer: *multiContainer, + WatchdogAction: wa, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") -- cgit v1.2.3 From 8459390cdd81ef1c8180948566e893b06233923c Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 28 Jun 2018 09:56:23 -0700 Subject: Error out if spec is invalid Closes #66 PiperOrigin-RevId: 202496258 Change-Id: Ib9287c5bf1279ffba1db21ebd9e6b59305cddf34 --- runsc/boot/loader.go | 2 +- runsc/cmd/boot.go | 6 +- runsc/cmd/cmd.go | 2 +- runsc/cmd/gofer.go | 2 +- runsc/container/container.go | 9 ++- runsc/container/container_test.go | 22 -------- runsc/specutils/BUILD | 1 + runsc/specutils/specutils.go | 27 ++++++++- runsc/specutils/specutils_test.go | 113 ++++++++++++++++++++++++++++++++++++++ 9 files changed, 150 insertions(+), 34 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index da95fa0e7..f359a0eb0 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -267,7 +267,7 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds Filename: exec, Argv: spec.Process.Args, Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, + WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. Credentials: creds, Umask: 0022, Limits: ls, diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 0d0e6b63f..685cb6f00 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -23,6 +23,7 @@ import ( "context" "flag" "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -116,6 +117,9 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if b.applyCaps { caps := spec.Process.Capabilities + if caps == nil { + caps = &specs.LinuxCapabilities{} + } if conf.Platform == boot.PlatformPtrace { // Ptrace platform requires extra capabilities. 
const c = "CAP_SYS_PTRACE" @@ -131,7 +135,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) args = append(args, arg) } } - if err := setCapsAndCallSelf(spec, args, caps); err != nil { + if err := setCapsAndCallSelf(args, caps); err != nil { Fatalf("%v", err) } panic("setCapsAndCallSelf must never return success") diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index 940c8cd14..44ebd7165 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -72,7 +72,7 @@ func (i *intFlags) Set(s string) error { // setCapsAndCallSelf sets capabilities to the current thread and then execve's // itself again with the arguments specified in 'args' to restart the process // with the desired capabilities. -func setCapsAndCallSelf(spec *specs.Spec, args []string, caps *specs.LinuxCapabilities) error { +func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error { // Keep thread locked while capabilities are changed. runtime.LockOSThread() defer runtime.UnlockOSThread() diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 8e1060a35..55315c0e8 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -95,7 +95,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Note: minimal argument handling for the default case to keep it simple. args := os.Args args = append(args, "--apply-caps=false") - if err := setCapsAndCallSelf(spec, args, lc); err != nil { + if err := setCapsAndCallSelf(args, lc); err != nil { Fatalf("Unable to apply caps: %v", err) } panic("unreachable") diff --git a/runsc/container/container.go b/runsc/container/container.go index 428aa5c62..c7dc6ec10 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -193,9 +193,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo if err := validateID(id); err != nil { return nil, err } - if err := specutils.ValidateSpec(spec); err != nil { - return nil, err - } containerRoot := filepath.Join(conf.RootDir, id) if _, err := os.Stat(containerRoot); err == nil { @@ -434,8 +431,10 @@ func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) // First stop the container. - if err := c.Sandbox.Stop(c.ID); err != nil { - return err + if c.Sandbox != nil { + if err := c.Sandbox.Stop(c.ID); err != nil { + return err + } } // "If any poststop hook fails, the runtime MUST log a warning, but the diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index de487ea97..11285a123 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -812,28 +812,6 @@ func TestConsoleSocket(t *testing.T) { } } -func TestSpecUnsupported(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/true") - spec.Process.SelinuxLabel = "somelabel" - - // These are normally set by docker and will just cause warnings to be logged. 
- spec.Process.ApparmorProfile = "someprofile" - spec.Linux = &specs.Linux{Seccomp: &specs.LinuxSeccomp{}} - - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - id := testutil.UniqueContainerID() - _, err = container.Create(id, spec, conf, bundleDir, "", "", "") - if err == nil || !strings.Contains(err.Error(), "is not supported") { - t.Errorf("container.Create() wrong error, got: %v, want: *is not supported, spec.Process: %+v", err, spec.Process) - } -} - // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 1b6d265bc..34c952bdf 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -22,4 +22,5 @@ go_test( size = "small", srcs = ["specutils_test.go"], embed = [":specutils"], + deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"], ) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index c552111f2..0d9e09e9d 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -47,10 +47,28 @@ func LogSpec(spec *specs.Spec) { // ValidateSpec validates that the spec is compatible with runsc. func ValidateSpec(spec *specs.Spec) error { + // Mandatory fields. if spec.Process == nil { - return fmt.Errorf("Process must be defined") + return fmt.Errorf("Spec.Process must be defined: %+v", spec) } - if spec.Process.SelinuxLabel != "" { + if len(spec.Process.Args) == 0 { + return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process) + } + if spec.Root == nil { + return fmt.Errorf("Spec.Root must be defined: %+v", spec) + } + if len(spec.Root.Path) == 0 { + return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root) + } + + // Unsupported fields. + if spec.Solaris != nil { + return fmt.Errorf("Spec.Solaris is not supported: %+v", spec) + } + if spec.Windows != nil { + return fmt.Errorf("Spec.Windows is not supported: %+v", spec) + } + if len(spec.Process.SelinuxLabel) != 0 { return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) } @@ -64,7 +82,7 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("Seccomp spec is being ignored") } - // 2 annotations are use by containerd to support multi-container pods. + // Two annotations are use by containerd to support multi-container pods. 
// "io.kubernetes.cri.container-type" // "io.kubernetes.cri.sandbox-id" containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation] @@ -98,6 +116,9 @@ func ReadSpec(bundleDir string) (*specs.Spec, error) { if err := json.Unmarshal(specBytes, &spec); err != nil { return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes)) } + if err := ValidateSpec(&spec); err != nil { + return nil, err + } return &spec, nil } diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index ef293e608..959be3af3 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -20,6 +20,8 @@ import ( "strings" "testing" "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" ) func TestWaitForReadyHappy(t *testing.T) { @@ -94,3 +96,114 @@ func TestWaitForReadyTimeout(t *testing.T) { } cmd.Process.Kill() } + +func TestSpecInvalid(t *testing.T) { + for _, test := range []struct { + name string + spec specs.Spec + error string + }{ + { + name: "valid", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + }, + error: "", + }, + { + name: "valid+warning", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + // This is normally set by docker and will just cause warnings to be logged. + ApparmorProfile: "someprofile", + }, + // This is normally set by docker and will just cause warnings to be logged. + Linux: &specs.Linux{Seccomp: &specs.LinuxSeccomp{}}, + }, + error: "", + }, + { + name: "no root", + spec: specs.Spec{ + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + }, + error: "must be defined", + }, + { + name: "empty root", + spec: specs.Spec{ + Root: &specs.Root{}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + }, + error: "must be defined", + }, + { + name: "no process", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + }, + error: "must be defined", + }, + { + name: "empty args", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{}, + }, + error: "must be defined", + }, + { + name: "selinux", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + SelinuxLabel: "somelabel", + }, + }, + error: "is not supported", + }, + { + name: "solaris", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Solaris: &specs.Solaris{}, + }, + error: "is not supported", + }, + { + name: "windows", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Windows: &specs.Windows{}, + }, + error: "is not supported", + }, + } { + err := ValidateSpec(&test.spec) + if len(test.error) == 0 { + if err != nil { + t.Errorf("ValidateSpec(%q) failed, err: %v", test.name, err) + } + } else { + if err == nil || !strings.Contains(err.Error(), test.error) { + t.Errorf("ValidateSpec(%q) wrong error, got: %v, want: .*%s.*", test.name, err, test.error) + } + } + } +} -- cgit v1.2.3 From bb31a119035dd5266737b41456d967789693cf20 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 28 Jun 2018 13:22:12 -0700 Subject: Wait for sandbox process when waiting for root container Closes #71 PiperOrigin-RevId: 202532762 Change-Id: I80a446ff638672ff08e6fd853cd77e28dd05d540 --- runsc/container/container.go | 2 +- runsc/sandbox/sandbox.go | 20 
++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index c7dc6ec10..b2ea78084 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -449,7 +449,7 @@ func (c *Container) Destroy() error { // If we are the first container in the sandbox, take the sandbox down // as well. - if c.Sandbox != nil && c.Sandbox.ID == c.ID { + if c.Sandbox != nil && c.Sandbox.IsRootContainer(c.ID) { if err := c.Sandbox.Destroy(); err != nil { log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ed2c40e57..e1e7b39d1 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -434,9 +434,29 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { if err := conn.Call(boot.ContainerWait, &cid, &ws); err != nil { return ws, fmt.Errorf("err waiting on container %q: %v", cid, err) } + + if s.IsRootContainer(cid) { + // If waiting for the root, give some time for the sandbox process to exit + // to prevent races with resources that might still be in use. + timeout := time.Now().Add(time.Second) + log.Debugf("Waiting for the sandbox process to exit") + for s.IsRunning() { + if time.Now().After(timeout) { + log.Debugf("Timeout waiting for sandbox process to exit") + break + } + time.Sleep(100 * time.Millisecond) + } + } return ws, nil } +// IsRootContainer returns true if the specified container ID belongs to the +// root container. +func (s *Sandbox) IsRootContainer(cid string) bool { + return s.ID == cid +} + // Stop stops the container in the sandbox. func (s *Sandbox) Stop(cid string) error { // TODO: This should stop the container with the given ID -- cgit v1.2.3 From 5a8e014c3d424abfe931b8493d06a129c3fdd388 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 28 Jun 2018 13:53:08 -0700 Subject: Add more image tests PiperOrigin-RevId: 202537696 Change-Id: I900fe8fd36cc7a4edb44fe2d03f8ba6768db53cb --- runsc/test/image/BUILD | 5 +- runsc/test/image/image_test.go | 175 +++++++++++++++++++++++++++++++++-------- runsc/test/image/mysql.sql | 23 ++++++ 3 files changed, 169 insertions(+), 34 deletions(-) create mode 100644 runsc/test/image/mysql.sql (limited to 'runsc') diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index 2876d4256..fb9db4d4c 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -6,7 +6,10 @@ go_test( name = "image_test", size = "small", srcs = ["image_test.go"], - data = ["latin10k.txt"], + data = [ + "latin10k.txt", + "mysql.sql", + ], embed = [":image"], tags = [ # Requires docker and runsc to be configured before the test runs. diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 5034411e5..849bf76a2 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -59,6 +59,30 @@ func mountArg(source, target string) string { return fmt.Sprintf("%s:%s", source, target) } +func linkArg(source *docker, target string) string { + return fmt.Sprintf("%s:%s", source.name, target) +} + +// prepareFiles creates temp directory to copy files there. The sandbox doesn't +// have access to files in the test dir. 
+func prepareFiles(names ...string) (string, error) { + dir, err := ioutil.TempDir("", "image-test") + if err != nil { + return "", fmt.Errorf("ioutil.TempDir failed: %v", err) + } + if err := os.Chmod(dir, 0777); err != nil { + return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err) + } + for _, name := range names { + src := getLocalPath(name) + dst := path.Join(dir, name) + if err := testutil.Copy(src, dst); err != nil { + return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err) + } + } + return dir, nil +} + func getLocalPath(file string) string { return path.Join(".", file) } @@ -75,6 +99,7 @@ func makeDocker(namePrefix string) docker { // do executes docker command. func (d *docker) do(args ...string) (string, error) { + fmt.Printf("Running: docker %s\n", args) cmd := exec.Command("docker", args...) out, err := cmd.CombinedOutput() if err != nil { @@ -120,17 +145,32 @@ func (d *docker) findPort(sandboxPort int) (int, error) { // for the given pattern. func (d *docker) waitForOutput(pattern string, timeout time.Duration) error { re := regexp.MustCompile(pattern) + var out string for exp := time.Now().Add(timeout); time.Now().Before(exp); { - out, err := d.do("logs", d.name) + var err error + out, err = d.do("logs", d.name) if err != nil { return err } if re.MatchString(out) { + // Success! + return nil + } + time.Sleep(10 * time.Millisecond) + } + return fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) +} + +func (d *docker) waitForHTTP(port int, timeout time.Duration) error { + for exp := time.Now().Add(timeout); time.Now().Before(exp); { + url := fmt.Sprintf("http://localhost:%d/", port) + if _, err := http.Get(url); err == nil { + // Success! return nil } time.Sleep(10 * time.Millisecond) } - return fmt.Errorf("timeout waiting for output %q", re.String()) + return fmt.Errorf("timeout waiting for HTTP server on port %d", port) } func TestHelloWorld(t *testing.T) { @@ -140,31 +180,53 @@ func TestHelloWorld(t *testing.T) { } defer d.cleanUp() - if err := d.waitForOutput(".*Hello from Docker!.*", 5*time.Second); err != nil { + if err := d.waitForOutput("Hello from Docker!", 5*time.Second); err != nil { t.Fatalf("docker didn't say hello: %v", err) } } -func TestHttpd(t *testing.T) { - d := makeDocker("http-test") +func testHTTPServer(port int) error { + url := fmt.Sprintf("http://localhost:%d/not-found", port) + resp, err := http.Get(url) + if err != nil { + return fmt.Errorf("error reaching http server: %v", err) + } + if want := http.StatusNotFound; resp.StatusCode != want { + return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } - // Create temp directory to copy htdocs files. The sandbox doesn't have access - // to files in the test dir. 
- dir, err := ioutil.TempDir("", "httpd") + url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port) + resp, err = http.Get(url) if err != nil { - t.Fatalf("ioutil.TempDir failed: %v", err) + return fmt.Errorf("Error reaching http server: %v", err) } - if err := os.Chmod(dir, 0777); err != nil { - t.Fatalf("os.Chmod(%q, 0777) failed: %v", dir, err) + if want := http.StatusOK; resp.StatusCode != want { + return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("Error reading http response: %v", err) } - src := getLocalPath("latin10k.txt") - dst := path.Join(dir, "latin10k.txt") - if err := testutil.Copy(src, dst); err != nil { - t.Fatalf("testutil.Copy(%q, %q) failed: %v", src, dst, err) + defer resp.Body.Close() + + // READALL is the last word in the file. Ensures everything was read. + if want := "READALL"; strings.HasSuffix(string(body), want) { + return fmt.Errorf("response doesn't contain %q, resp: %q", want, body) + } + return nil +} + +func TestHttpd(t *testing.T) { + d := makeDocker("http-test") + + dir, err := prepareFiles("latin10k.txt") + if err != nil { + t.Fatalf("prepareFiles() failed: %v", err) } // Start the container. - if out, err := d.run("-p", "80", "-v", mountArg(dir, "/usr/local/apache2/htdocs"), "httpd"); err != nil { + if out, err := d.run("-p", "80", "-v", mountArg(dir, "/usr/local/apache2/htdocs:ro"), "httpd"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } defer d.cleanUp() @@ -176,37 +238,84 @@ func TestHttpd(t *testing.T) { } // Wait until it's up and running. - if err := d.waitForOutput(".*'httpd -D FOREGROUND'.*", 5*time.Second); err != nil { + if err := d.waitForOutput("'httpd -D FOREGROUND'", 5*time.Second); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } - url := fmt.Sprintf("http://localhost:%d/not-found", port) - resp, err := http.Get(url) + if err := testHTTPServer(port); err != nil { + t.Fatalf("testHTTPServer(%d) failed: %v", port, err) + } +} + +func TestNginx(t *testing.T) { + d := makeDocker("net-test") + + dir, err := prepareFiles("latin10k.txt") if err != nil { - t.Fatalf("error reaching http server: %v", err) + t.Fatalf("prepareFiles() failed: %v", err) } - if want := http.StatusNotFound; resp.StatusCode != want { - t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + + // Start the container. + if out, err := d.run("-p", "80", "-v", mountArg(dir, "/usr/share/nginx/html:ro"), "nginx"); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) } + defer d.cleanUp() - url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port) - resp, err = http.Get(url) + // Find where port 80 is mapped to. + port, err := d.findPort(80) if err != nil { - t.Fatalf("Error reaching http server: %v", err) + t.Fatalf("docker.findPort(80) failed: %v", err) } - if want := http.StatusOK; resp.StatusCode != want { - t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + + // Wait until it's up and running. + if err := d.waitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("docker.WaitForHTTP() timeout: %v", err) } - body, err := ioutil.ReadAll(resp.Body) + if err := testHTTPServer(port); err != nil { + t.Fatalf("testHTTPServer(%d) failed: %v", port, err) + } +} + +func TestMysql(t *testing.T) { + d := makeDocker("mysql-test") + + // Start the container. 
+ if out, err := d.run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) + } + defer d.cleanUp() + + // Wait until it's up and running. + if err := d.waitForOutput("port: 3306 MySQL Community Server", 30*time.Second); err != nil { + t.Fatalf("docker.WaitForOutput() timeout: %v", err) + } + + client := makeDocker("mysql-client-test") + dir, err := prepareFiles("mysql.sql") if err != nil { - t.Fatalf("Error reading http response: %v", err) + t.Fatalf("prepareFiles() failed: %v", err) } - defer resp.Body.Close() - // READALL is the last word in the file. Ensures everything was read. - if want := "READALL"; strings.HasSuffix(string(body), want) { - t.Errorf("response doesn't contain %q, resp: %q", want, body) + // Tell mysql client to connect to the server and execute the file in verbose + // mode to verify the output. + args := []string{ + "--link", linkArg(&d, "mysql"), + "-v", mountArg(dir, "/sql"), + "mysql", + "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql", + } + if out, err := client.run(args...); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) + } + defer client.cleanUp() + + // Ensure file executed to the end and shutdown mysql. + if err := client.waitForOutput("--------------\nshutdown\n--------------", 5*time.Second); err != nil { + t.Fatalf("docker.WaitForOutput() timeout: %v", err) + } + if err := d.waitForOutput("mysqld: Shutdown complete", 15*time.Second); err != nil { + t.Fatalf("docker.WaitForOutput() timeout: %v", err) } } diff --git a/runsc/test/image/mysql.sql b/runsc/test/image/mysql.sql new file mode 100644 index 000000000..dd5bfaa4e --- /dev/null +++ b/runsc/test/image/mysql.sql @@ -0,0 +1,23 @@ +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SHOW databases; +USE mysql; + +CREATE TABLE foo (id int); +INSERT INTO foo VALUES(1); +SELECT * FROM foo; +DROP TABLE foo; + +shutdown; -- cgit v1.2.3 From 16d37973ebc8f36ef613c0885879648cceaf1c45 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Thu, 28 Jun 2018 14:55:46 -0700 Subject: runsc: Add the "wait" subcommand. Users can now call "runsc wait " to wait on a particular process inside the container. -pid can also be used to wait on a specific PID. Manually tested the wait subcommand for a single waiter and multiple waiters (simultaneously 2 processes waiting on the container and 2 processes waiting on a PID within the container). 
PiperOrigin-RevId: 202548978 Change-Id: Idd507c2cdea613c3a14879b51cfb0f7ea3fb3d4c --- runsc/boot/controller.go | 21 +++++++- runsc/boot/loader.go | 49 ++++++++++++++---- runsc/cmd/BUILD | 1 + runsc/cmd/wait.go | 103 ++++++++++++++++++++++++++++++++++++++ runsc/container/container.go | 14 ++++++ runsc/container/container_test.go | 49 +++++++++++++----- runsc/main.go | 1 + runsc/sandbox/sandbox.go | 24 ++++++++- 8 files changed, 236 insertions(+), 26 deletions(-) create mode 100644 runsc/cmd/wait.go (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 56829c605..ff75a382e 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -61,6 +61,10 @@ const ( // and return its ExitStatus. ContainerWait = "containerManager.Wait" + // ContainerWaitPID is used to wait on a process with a certain PID in + // the sandbox and return its ExitStatus. + ContainerWaitPID = "containerManager.WaitPID" + // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links // and routes in a network stack. NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" @@ -234,7 +238,22 @@ func (cm *containerManager) Resume(_, _ *struct{}) error { // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { log.Debugf("containerManager.Wait") - return cm.l.wait(cid, waitStatus) + return cm.l.waitContainer(*cid, waitStatus) +} + +// WaitPIDArgs are arguments to the WaitPID method. +type WaitPIDArgs struct { + // PID is the PID in the container's PID namespace. + PID int32 + + // CID is the container ID. + CID string +} + +// WaitPID waits for the process with PID 'pid' in the sandbox. +func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) } // SignalArgs are arguments to the Signal method. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f359a0eb0..014908179 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -83,6 +83,9 @@ type Loader struct { // rootProcArgs refers to the root sandbox init task. rootProcArgs kernel.CreateProcessArgs + // sandboxID is the ID for the whole sandbox. + sandboxID string + // mu guards containerRootTGIDs. mu sync.Mutex @@ -452,23 +455,46 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa return tgid, nil } -// wait waits for the init process in the given container. -func (l *Loader) wait(cid *string, waitStatus *uint32) error { +// TODO: Per-container namespaces must be supported +// for -pid. + +// waitContainer waits for the root process of a container to exit. +func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { + // Don't defer unlock, as doing so would make it impossible for + // multiple clients to wait on the same container. l.mu.Lock() - defer l.mu.Unlock() - tgid, ok := l.containerRootTGIDs[*cid] + tgid, ok := l.containerRootTGIDs[cid] + l.mu.Unlock() if !ok { - return fmt.Errorf("can't find process for container %q in %v", *cid, l.containerRootTGIDs) + return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGIDs) } - - // TODO: Containers don't map 1:1 with their root - // processes. Container exits should be managed explicitly - // rather than via PID. // If the thread either has already exited or exits during waiting, // consider the container exited. 
- defer delete(l.containerRootTGIDs, *cid) + defer func() { + l.mu.Lock() + defer l.mu.Unlock() + // TODO: Containers don't map 1:1 with their root + // processes. Container exits should be managed explicitly + // rather than via PID. + delete(l.containerRootTGIDs, cid) + }() + return l.wait(tgid, cid, waitStatus) +} + +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { + // TODO: Containers all currently share a PID namespace. + // When per-container PID namespaces are supported, wait should use cid + // to find the appropriate PID namespace. + if cid != l.sandboxID { + return errors.New("non-sandbox PID namespaces are not yet implemented") + } + return l.wait(tgid, cid, waitStatus) +} - tg := l.k.TaskSet().Root.ThreadGroupWithID(tgid) +// wait waits for the process with TGID 'tgid' in a container's PID namespace +// to exit. +func (l *Loader) wait(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { + tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) if tg == nil { return fmt.Errorf("no thread group with ID %d", tgid) } @@ -482,6 +508,7 @@ func (l *Loader) setRootContainerID(cid string) { defer l.mu.Unlock() // The root container has PID 1. l.containerRootTGIDs = map[string]kernel.ThreadID{cid: 1} + l.sandboxID = cid } // WaitForStartSignal waits for a start signal from the control server. diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 747793efc..18e95284b 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -25,6 +25,7 @@ go_library( "run.go", "start.go", "state.go", + "wait.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/cmd", visibility = [ diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go new file mode 100644 index 000000000..8437457c4 --- /dev/null +++ b/runsc/cmd/wait.go @@ -0,0 +1,103 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "syscall" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" +) + +const ( + unsetPID = -1 +) + +// Wait implements subcommands.Command for the "wait" command. +type Wait struct { + rootPID int + pid int +} + +// Name implements subcommands.Command.Name. +func (*Wait) Name() string { + return "wait" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Wait) Synopsis() string { + return "wait on a process inside a container" +} + +// Usage implements subcommands.Command.Usage. +func (*Wait) Usage() string { + return `wait [flags] ` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (wt *Wait) SetFlags(f *flag.FlagSet) { + f.IntVar(&wt.rootPID, "rootpid", unsetPID, "select a PID in the sandbox root PID namespace to wait on instead of the container's root process") + f.IntVar(&wt.pid, "pid", unsetPID, "select a PID in the container's PID namespace to wait on instead of the container's root process") +} + +// Execute implements subcommands.Command.Execute. 
It waits for a process in a +// container to exit before returning. +func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + // You can't specify both -pid and -rootpid. + if wt.rootPID != unsetPID && wt.pid != unsetPID { + Fatalf("only up to one of -pid and -rootPid can be set") + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading container: %v", err) + } + + waitStatus := args[1].(*syscall.WaitStatus) + switch { + // Wait on the whole container. + case wt.rootPID == unsetPID && wt.pid == unsetPID: + ws, err := c.Wait() + if err != nil { + Fatalf("error waiting on container %q: %v", c.ID, err) + } + *waitStatus = ws + // Wait on a PID in the root PID namespace. + case wt.rootPID != unsetPID: + ws, err := c.WaitRootPID(int32(wt.rootPID)) + if err != nil { + Fatalf("error waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) + } + *waitStatus = ws + // Wait on a PID in the container's PID namespace. + case wt.pid != unsetPID: + ws, err := c.WaitPID(int32(wt.pid)) + if err != nil { + Fatalf("error waiting on PID %d in container %q: %v", wt.pid, c.ID, err) + } + *waitStatus = ws + } + return subcommands.ExitSuccess +} diff --git a/runsc/container/container.go b/runsc/container/container.go index b2ea78084..042c76577 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -353,6 +353,20 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { return c.Sandbox.Wait(c.ID) } +// WaitRootPID waits for process 'pid' in the sandbox's PID namespace and +// returns its WaitStatus. +func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { + log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) + return c.Sandbox.WaitPID(pid, c.Sandbox.ID) +} + +// WaitPID waits for process 'pid' in the container's PID namespace and returns +// its WaitStatus. +func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { + log.Debugf("Wait on pid %d in container %q", pid, c.ID) + return c.Sandbox.WaitPID(pid, c.ID) +} + // Signal sends the signal to the container. func (c *Container) Signal(sig syscall.Signal) error { log.Debugf("Signal container %q", c.ID) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 11285a123..ae500e7d0 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -75,9 +75,9 @@ func procListsEqual(got, want []*control.Process) bool { pd1 := got[i] pd2 := want[i] // Zero out unimplemented and timing dependant fields. - pd1.Time, pd2.Time = "", "" - pd1.STime, pd2.STime = "", "" - pd1.C, pd2.C = 0, 0 + pd1.Time = "" + pd1.STime = "" + pd1.C = 0 if *pd1 != *pd2 { return false } @@ -1074,16 +1074,39 @@ func TestMultiContainerWait(t *testing.T) { t.Errorf("failed to wait for sleep to start: %v", err) } - // Wait on the short lived container. - if ws, err := containers[1].Wait(); err != nil { - t.Fatalf("failed to wait for process %q: %v", strings.Join(containers[1].Spec.Process.Args, " "), err) - } else if es := ws.ExitStatus(); es != 0 { - t.Fatalf("process %q exited with non-zero status %d", strings.Join(containers[1].Spec.Process.Args, " "), es) - } + // Wait on the short lived container from multiple goroutines. 
+ wg := sync.WaitGroup{} + for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + defer wg.Done() + if ws, err := containers[1].Wait(); err != nil { + t.Errorf("failed to wait for process %q: %v", strings.Join(containers[1].Spec.Process.Args, " "), err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %q exited with non-zero status %d", strings.Join(containers[1].Spec.Process.Args, " "), es) + } - // After Wait returns, ensure that the root container is running and - // the child has finished. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { - t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) + // After Wait returns, ensure that the root container is running and + // the child has finished. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) + } + }() + } + + // Also wait via PID. + for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + defer wg.Done() + const pid = 2 + if ws, err := containers[0].WaitPID(pid); err != nil { + t.Errorf("failed to wait for PID %d: %v", pid, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("PID %d exited with non-zero status %d", pid, es) + } + }() } + + wg.Wait() } diff --git a/runsc/main.go b/runsc/main.go index 563ef8c67..dfb338b0f 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -84,6 +84,7 @@ func main() { subcommands.Register(new(cmd.Run), "") subcommands.Register(new(cmd.Start), "") subcommands.Register(new(cmd.State), "") + subcommands.Register(new(cmd.Wait), "") // Register internal commands with the internal group name. This causes // them to be sorted below the user-facing commands with empty group. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e1e7b39d1..9200fbee9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -432,7 +432,29 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { defer conn.Close() if err := conn.Call(boot.ContainerWait, &cid, &ws); err != nil { - return ws, fmt.Errorf("err waiting on container %q: %v", cid, err) + return ws, fmt.Errorf("error waiting on container %q: %v", cid, err) + } + return ws, nil +} + +// WaitPID waits for process 'pid' in the container's sandbox and returns its +// WaitStatus. +func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { + log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) + var ws syscall.WaitStatus + conn, err := s.connect() + if err != nil { + return ws, err + } + defer conn.Close() + + args := &boot.WaitPIDArgs{ + PID: pid, + CID: cid, + } + + if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { + return ws, fmt.Errorf("error waiting on PID %d in sandbox %q: %v", pid, s.ID, err) } if s.IsRootContainer(cid) { -- cgit v1.2.3 From 25e315c2e1764a9b0a1b70196e1108c00d172f48 Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Fri, 29 Jun 2018 13:08:41 -0700 Subject: Added leave-running flag for checkpoint. The leave-running flag allows the container to continue running after a checkpoint has occurred by doing an immediate restore into a new container with the same container ID after the old container is destroyed. Updates #80. 
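A rough invocation sketch (the image directory and container ID are placeholders):

    runsc checkpoint --image-path=/tmp/ckpt-dir --leave-running my-container

The state file (checkpoint.img) is written into the given directory; with --leave-running, the container is destroyed and immediately restored under the same ID so it keeps running after the checkpoint completes.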
PiperOrigin-RevId: 202695426 Change-Id: Iac50437f5afda018dc18b24bb8ddb935983cf336 --- runsc/cmd/checkpoint.go | 57 ++++++++++++++++++++++++++++++++++++++++---- runsc/container/BUILD | 1 + runsc/container/container.go | 57 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 109 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index a28eb0f02..9348289ca 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -17,12 +17,15 @@ package cmd import ( "os" "path/filepath" + "syscall" "context" "flag" "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/container" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) // File containing the container's saved image/state within the given image-path's directory. @@ -30,7 +33,8 @@ const checkpointFileName = "checkpoint.img" // Checkpoint implements subcommands.Command for the "checkpoint" command. type Checkpoint struct { - imagePath string + imagePath string + leaveRunning bool } // Name implements subcommands.Command.Name. @@ -51,14 +55,12 @@ func (*Checkpoint) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (c *Checkpoint) SetFlags(f *flag.FlagSet) { - f.StringVar(&c.imagePath, "image-path", "", "path to saved container image") + f.StringVar(&c.imagePath, "image-path", "", "directory path to saved container image") + f.BoolVar(&c.leaveRunning, "leave-running", false, "restart the container after checkpointing") // Unimplemented flags necessary for compatibility with docker. var wp string f.StringVar(&wp, "work-path", "", "ignored") - - var lr bool - f.BoolVar(&lr, "leave-running", false, "ignored") } // Execute implements subcommands.Command.Execute. @@ -71,6 +73,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa id := f.Arg(0) conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) cont, err := container.Load(conf.RootDir, id) if err != nil { @@ -98,5 +101,49 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("checkpoint failed: %v", err) } + if !c.leaveRunning { + return subcommands.ExitSuccess + } + + // TODO: Make it possible to restore into same container. + // For now, we can fake it by destroying the container and making a + // new container with the same ID. This hack does not work with docker + // which uses the container pid to ensure that the restore-container is + // actually the same as the checkpoint-container. By restoring into + // the same container, we will solve the docker incompatibility. + + // Restore into new container with same ID. 
+ bundleDir := cont.BundleDir + if bundleDir == "" { + Fatalf("error setting bundleDir") + } + + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + + specutils.LogSpec(spec) + + if cont.ConsoleSocket != "" { + log.Warningf("ignoring console socket since it cannot be restored") + } + + if err := cont.DestroyAndWait(); err != nil { + Fatalf("error destroying container: %v", err) + } + + cont, err = container.Create(id, spec, conf, bundleDir, "", "", fullImagePath) + if err != nil { + Fatalf("error restoring container: %v", err) + } + + if err := cont.Start(conf); err != nil { + Fatalf("error starting container: %v", err) + } + + ws, err := cont.Wait() + *waitStatus = ws + return subcommands.ExitSuccess } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 61e05e1c3..679d7e097 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -16,6 +16,7 @@ go_library( deps = [ "//pkg/log", "//pkg/sentry/control", + "//pkg/syserror", "//runsc/boot", "//runsc/sandbox", "//runsc/specutils", diff --git a/runsc/container/container.go b/runsc/container/container.go index 042c76577..8dca721f6 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -30,6 +30,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/sandbox" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -100,11 +101,12 @@ type Container struct { func Load(rootDir, id string) (*Container, error) { log.Debugf("Load container %q %q", rootDir, id) if err := validateID(id); err != nil { - return nil, err + return nil, fmt.Errorf("error validating id: %v", err) } cRoot, err := findContainerRoot(rootDir, id) if err != nil { + // Preserve error so that callers can distinguish 'not found' errors. return nil, err } @@ -471,6 +473,32 @@ func (c *Container) Destroy() error { c.Sandbox = nil c.Status = Stopped + + return nil +} + +// DestroyAndWait frees all resources associated with the container +// and waits for destroy to finish before returning. +func (c *Container) DestroyAndWait() error { + sandboxPid := c.Sandbox.Pid + goferPid := c.Sandbox.GoferPid + + if err := c.Destroy(); err != nil { + return fmt.Errorf("error destroying container %v: %v", c, err) + } + + if sandboxPid != 0 { + if err := waitForDeath(sandboxPid, 5*time.Second); err != nil { + return fmt.Errorf("error waiting for sandbox death: %v", err) + } + } + + if goferPid != 0 { + if err := waitForDeath(goferPid, 5*time.Second); err != nil { + return fmt.Errorf("error waiting for gofer death: %v", err) + } + } + return nil } @@ -490,3 +518,30 @@ func (c *Container) save() error { } return nil } + +// waitForDeath ensures that process is dead before proceeding. +// +// This is racy because the kernel can potentially reuse the pid in the time +// between the process' death and the first check after the process has ended. +func waitForDeath(pid int, timeout time.Duration) error { + backoff := 1 * time.Millisecond + for start := time.Now(); time.Now().Sub(start) < timeout; { + + if err := syscall.Kill(pid, 0); err != nil { + if err == syserror.ESRCH { + // pid does not exist so process must have died + return nil + } + return fmt.Errorf("error killing pid (%d): %v", pid, err) + } + // pid is still alive. + + // Process continues to run, backoff and retry. 
+ time.Sleep(backoff) + backoff *= 2 + if backoff > 1*time.Second { + backoff = 1 * time.Second + } + } + return fmt.Errorf("timed out waiting for process (%d)", pid) +} -- cgit v1.2.3 From 80bdf8a4068de3ac4a73b6b61a0cdcfe3e3571af Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 29 Jun 2018 14:46:45 -0700 Subject: Sets the restore environment for restoring a container. Updated how restoring occurs through boot.go with a separate Restore function. This prevents a new process and new mounts from being created. Added tests to ensure the container is restored. Registered checkpoint and restore commands so they can be used. Docker support for these commands is still limited. Working on #80. PiperOrigin-RevId: 202710950 Change-Id: I2b893ceaef6b9442b1ce3743bd112383cb92af0c --- pkg/sentry/fs/gofer/inode_state.go | 27 ++++++- runsc/boot/fs.go | 103 +++++++++++++++---------- runsc/boot/loader.go | 146 ++++++++++++++++++++++-------------- runsc/boot/loader_test.go | 74 +++++++++++++++++- runsc/cmd/boot.go | 2 + runsc/cmd/checkpoint.go | 2 +- runsc/cmd/restore.go | 2 +- runsc/container/container_test.go | 150 ++++++++++++++++++++++++++++++++++--- runsc/main.go | 2 + runsc/specutils/specutils.go | 11 --- 10 files changed, 391 insertions(+), 128 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 82d1dd4da..33ec33364 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -17,6 +17,7 @@ package gofer import ( "errors" "fmt" + "path/filepath" "strings" "gvisor.googlesource.com/gvisor/pkg/p9" @@ -77,6 +78,29 @@ func (i *inodeFileState) saveLoading() struct{} { return struct{}{} } +// splitAbsolutePath splits the path on slashes ignoring the leading slash. +func splitAbsolutePath(path string) []string { + if len(path) == 0 { + panic("There is no path!") + } + if path != filepath.Clean(path) { + panic(fmt.Sprintf("path %q is not clean", path)) + } + // This case is to return {} rather than {""} + if path == "/" { + return []string{} + } + if path[0] != '/' { + panic(fmt.Sprintf("path %q is not absolute", path)) + } + + s := strings.Split(path, "/") + + // Since p is absolute, the first component of s + // is an empty string. We must remove that. + return s[1:] +} + // loadLoading is invoked by stateify. func (i *inodeFileState) loadLoading(_ struct{}) { i.loading.Lock() @@ -98,7 +122,8 @@ func (i *inodeFileState) afterLoad() { // TODO: Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} var err error - _, i.file, err = i.s.attach.walk(ctx, strings.Split(name, "/")) + + _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) if err != nil { return fmt.Errorf("failed to walk to %q: %v", name, err) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e0d7fc769..a9b2f225a 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -38,6 +38,14 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) +const ( + // Filesystem name for 9p gofer mounts. + rootFsName = "9p" + + // Device name for root mount. 
+ rootDevice = "9pfs-/" +) + type fdDispenser struct { fds []int } @@ -64,7 +72,8 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - if err := configureMounts(rootCtx, spec, conf, mns, fds); err != nil { + mounts := compileMounts(spec) + if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } if !fds.empty() { @@ -73,27 +82,23 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec return mns, nil } -// configureMounts iterates over Spec.Mounts and mounts them in the specified -// mount namespace. -func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser) error { +// compileMounts returns the supported mounts from the mount spec, adding any +// additional mounts that are required by the OCI specification. +func compileMounts(spec *specs.Spec) []specs.Mount { // Keep track of whether proc, sys, and tmp were mounted. var procMounted, sysMounted, tmpMounted bool + var mounts []specs.Mount // Always mount /dev. - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "devtmpfs", Destination: "/dev", - }); err != nil { - return err - } + }) - // Always mount /dev/pts. - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "devpts", Destination: "/dev/pts", - }); err != nil { - return err - } + }) // Mount all submounts from the spec. for _, m := range spec.Mounts { @@ -101,6 +106,7 @@ func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *f log.Warningf("ignoring dev mount at %q", m.Destination) continue } + mounts = append(mounts, m) switch filepath.Clean(m.Destination) { case "/proc": procMounted = true @@ -109,43 +115,45 @@ func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *f case "/tmp": tmpMounted = true } - - if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil { - return err - } } // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. if !procMounted { - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "proc", Destination: "/proc", - }); err != nil { - return err - } + }) } if !sysMounted { - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "sysfs", Destination: "/sys", - }); err != nil { - return err - } + }) } // Technically we don't have to mount tmpfs at /tmp, as we could just // rely on the host /tmp, but this is a nice optimization, and fixes // some apps that call mknod in /tmp. if !tmpMounted { - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "tmpfs", Destination: "/tmp", - }); err != nil { + }) + } + return mounts +} + +// setMounts iterates over mounts and mounts them in the specified +// mount namespace. +func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { + + // Mount all submounts from mounts. 
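// A condensed sketch of the compileMounts pattern above: record which
// destinations the user's spec already provides, then append the mandatory
// mounts it is missing. This is an illustration only with invented helper
// names; the real code also always adds /dev and /dev/pts and keeps a fixed
// ordering.
package main

import (
	"fmt"
	"path/filepath"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func addMandatoryMounts(user []specs.Mount) []specs.Mount {
	mounts := append([]specs.Mount(nil), user...)
	seen := make(map[string]bool)
	for _, m := range user {
		seen[filepath.Clean(m.Destination)] = true
	}
	for _, want := range []specs.Mount{
		{Type: "proc", Destination: "/proc"},
		{Type: "sysfs", Destination: "/sys"},
		{Type: "tmpfs", Destination: "/tmp"},
	} {
		if !seen[want.Destination] {
			mounts = append(mounts, want)
		}
	}
	return mounts
}

func main() {
	user := []specs.Mount{{Type: "proc", Destination: "/proc"}}
	for _, m := range addMandatoryMounts(user) {
		fmt.Printf("%-6s %s\n", m.Type, m.Destination)
	}
}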
+ for _, m := range mounts { + if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { return err } } - return nil } @@ -158,19 +166,20 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f rootInode *fs.Inode err error ) + switch conf.FileAccess { case FileAccessProxy: fd := fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) hostFS := mustFindFilesystem("9p") - rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) + rootInode, err = hostFS.Mount(ctx, rootDevice, mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } case FileAccessDirect: hostFS := mustFindFilesystem("whitelistfs") - rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true") + rootInode, err = hostFS.Mount(ctx, rootDevice, mf, "root="+spec.Root.Path+",dont_translate_ownership=true") if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } @@ -263,7 +272,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, data, useOverlay, err } -func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, data, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -285,14 +294,13 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(data, ",")) if err != nil { return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) } // If there are submounts, we need to overlay the mount on top of a // ramfs with stub directories for submount paths. - mounts := specutils.SupportedMounts(spec.Mounts) submounts := subtargets(m.Destination, mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) @@ -406,7 +414,7 @@ func mountDevice(m specs.Mount) string { if m.Type == "bind" { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. - return "p9fs-" + m.Destination + return "9pfs-" + m.Destination } // All other fs types use device "none". return "none" @@ -417,14 +425,24 @@ func mountDevice(m specs.Mount) string { func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { fsName, data, _, err := getMountNameAndOptions(conf, m, fds) dataString := strings.Join(data, ",") + + // Return the error or nil that corresponds to the default case in getMountNameAndOptions. if err != nil { return err } - renv.MountSources[fsName] = append(renv.MountSources[fsName], fs.MountArgs{ + // TODO: Fix this when we support all the mount types and make this a + // fatal error. 
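// The mountDevice change above standardizes the device name for 9P-proxied
// bind mounts to "9pfs-" plus the destination, so it stays stable across
// save/restore; everything else uses "none". A tiny usage illustration
// (sketch only, not code from this patch):
package main

import (
	"fmt"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func deviceName(m specs.Mount) string {
	if m.Type == "bind" {
		// Bind mounts are keyed by destination for a stable identity.
		return "9pfs-" + m.Destination
	}
	return "none"
}

func main() {
	fmt.Println(deviceName(specs.Mount{Type: "bind", Destination: "/dev/fd-foo"})) // 9pfs-/dev/fd-foo
	fmt.Println(deviceName(specs.Mount{Type: "tmpfs", Destination: "/tmp"}))       // none
}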
+ if fsName == "" { + return nil + } + + newMount := fs.MountArgs{ Dev: mountDevice(m), Flags: mountFlags(m.Options), Data: dataString, - }) + } + renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) + log.Infof("Added mount at %q: %+v", fsName, newMount) return nil } @@ -438,6 +456,8 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) MountSources: make(map[string][]fs.MountArgs), } + mounts := compileMounts(spec) + // Add root mount. fd := fds.remove() dataString := strings.Join([]string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}, ",") @@ -445,15 +465,16 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) if spec.Root.Readonly { mf.ReadOnly = true } - const rootFSName = "9p" - renv.MountSources[rootFSName] = append(renv.MountSources[rootFSName], fs.MountArgs{ - Dev: "p9fs-/", + + rootMount := fs.MountArgs{ + Dev: rootDevice, Flags: mf, Data: dataString, - }) + } + renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) // Add submounts - for _, m := range spec.Mounts { + for _, m := range mounts { if err := addRestoreMount(conf, renv, m, fds); err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 014908179..6fcfba5cb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -80,6 +81,9 @@ type Loader struct { // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() + // restore is set to true if we are restoring a container. + restore bool + // rootProcArgs refers to the root sandbox init task. rootProcArgs kernel.CreateProcessArgs @@ -106,7 +110,17 @@ func init() { } // New initializes a new kernel loader configured by spec. +// New also handles setting up a kernel for restoring a container. func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []int, console bool) (*Loader, error) { + var ( + tk *kernel.Timekeeper + creds *auth.Credentials + vdso *loader.VDSO + utsns *kernel.UTSNamespace + ipcns *kernel.IPCNamespace + restoreFile *os.File + procArgs kernel.CreateProcessArgs + ) // Create kernel and platform. p, err := createPlatform(conf) if err != nil { @@ -116,47 +130,60 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in Platform: p, } - // Create VDSO. - // - // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err := loader.PrepareVDSO(k) - if err != nil { - return nil, fmt.Errorf("error creating vdso: %v", err) - } + if restoreFD == -1 { + // Create VDSO. + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) + if err != nil { + return nil, fmt.Errorf("error creating vdso: %v", err) + } - // Create timekeeper. - tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) - if err != nil { - return nil, fmt.Errorf("error creating timekeeper: %v", err) - } - tk.SetClocks(time.NewCalibratedClocks()) + // Create timekeeper. 
+ tk, err = kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("error creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) - // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) - if err != nil { - return nil, fmt.Errorf("error creating capabilities: %v", err) - } + // Create capabilities. + caps, err := specutils.Capabilities(spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } - // Convert the spec's additional GIDs to KGIDs. - extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) - for _, GID := range spec.Process.User.AdditionalGids { - extraKGIDs = append(extraKGIDs, auth.KGID(GID)) - } + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } - // Create credentials. - creds := auth.NewUserCredentials( - auth.KUID(spec.Process.User.UID), - auth.KGID(spec.Process.User.GID), - extraKGIDs, - caps, - auth.NewRootUserNamespace()) + // Create credentials. + creds = auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) - // Create user namespace. - // TODO: Not clear what domain name should be here. It is - // not configurable from runtime spec. - utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) + // Create user namespace. + // TODO: Not clear what domain name should be here. It is + // not configurable from runtime spec. + utsns = kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - ipcns := kernel.NewIPCNamespace(creds.UserNamespace) + ipcns = kernel.NewIPCNamespace(creds.UserNamespace) + } else { + // Create and set RestoreEnvironment + fds := &fdDispenser{fds: ioFDs} + renv, err := createRestoreEnvironment(spec, conf, fds) + if err != nil { + return nil, fmt.Errorf("error creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) + + restoreFile = os.NewFile(uintptr(restoreFD), "restore_file") + defer restoreFile.Close() + } if err := enableStrace(conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) @@ -168,19 +195,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Run(). networkStack := newEmptyNetworkStack(conf, k) - // Check if we need to restore the kernel - if restoreFD != -1 { - restoreFile := os.NewFile(uintptr(restoreFD), "restore_file") - defer restoreFile.Close() - - // Load the state. - loadOpts := state.LoadOpts{ - Source: restoreFile, - } - if err := loadOpts.Load(k, p, networkStack); err != nil { - return nil, err - } - } else { + if restoreFile == nil { // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. if err = k.Init(kernel.InitKernelArgs{ @@ -196,6 +211,17 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in }); err != nil { return nil, fmt.Errorf("error initializing kernel: %v", err) } + } else { + // Load the state. + loadOpts := state.LoadOpts{ + Source: restoreFile, + } + if err := loadOpts.Load(k, p, networkStack); err != nil { + return nil, err + } + + // Set timekeeper. + k.Timekeeper().SetClocks(time.NewCalibratedClocks()) } // Turn on packet logging if enabled. 
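// The branch above keys everything off a single convention: restoreFD is -1
// for a fresh boot, otherwise it is an inherited descriptor for the
// checkpoint image. A minimal sketch of that FD-handoff pattern, with
// invented names and ioutil.Discard standing in for the real state-loading
// step:
package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"os"
)

func bootOrRestore(restoreFD int) error {
	if restoreFD == -1 {
		fmt.Println("fresh boot: create VDSO, timekeeper, credentials, namespaces")
		return nil
	}
	// Wrap the inherited descriptor; os.NewFile takes ownership of it.
	f := os.NewFile(uintptr(restoreFD), "restore_file")
	defer f.Close()
	n, err := io.Copy(ioutil.Discard, f)
	if err != nil {
		return fmt.Errorf("error reading restore state: %v", err)
	}
	fmt.Printf("restore: consumed %d bytes of saved state\n", n)
	return nil
}

func main() {
	_ = bootOrRestore(-1)
}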
@@ -232,9 +258,11 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Ensure that signals received are forwarded to the emulated kernel. stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) - if err != nil { - return nil, fmt.Errorf("failed to create root process: %v", err) + if restoreFile == nil { + procArgs, err = newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + if err != nil { + return nil, fmt.Errorf("failed to create root process: %v", err) + } } l := &Loader{ @@ -245,6 +273,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in watchdog: watchdog, stopSignalForwarding: stopSignalForwarding, rootProcArgs: procArgs, + restore: restoreFile != nil, } ctrl.manager.l = l return l, nil @@ -378,13 +407,16 @@ func (l *Loader) run() error { } } - // Create the root container init task. - if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { - return fmt.Errorf("failed to create init process: %v", err) - } + // If we are restoring, we do not want to create a process. + if !l.restore { + // Create the root container init task. + if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { + return fmt.Errorf("failed to create init process: %v", err) + } - // CreateProcess takes a reference on FDMap if successful. - l.rootProcArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDMap if successful. + l.rootProcArgs.FDMap.DecRef() + } l.watchdog.Start() return l.k.Start() diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 15ced0601..28d45b54b 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -364,7 +364,7 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "p9fs-/", + Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, @@ -376,6 +376,24 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "none", }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, }, }, }, @@ -406,15 +424,40 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "p9fs-/", + Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, { - Dev: "p9fs-/dev/fd-foo", + Dev: "9pfs-/dev/fd-foo", Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true", }, }, + "tmpfs": { + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, }, }, }, @@ -445,7 +488,7 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "p9fs-/", + Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, @@ -456,6 +499,29 @@ func TestRestoreEnvironment(t *testing.T) { Flags: fs.MountSourceFlags{NoAtime: true}, Data: "uid=1022", }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, }, }, }, diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 685cb6f00..b19da315f 100644 --- a/runsc/cmd/boot.go +++ 
b/runsc/cmd/boot.go @@ -142,7 +142,9 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. + l, err := boot.New(spec, conf, b.controllerFD, b.restoreFD, b.ioFDs.GetArray(), b.console) + if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 9348289ca..94efc3517 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -44,7 +44,7 @@ func (*Checkpoint) Name() string { // Synopsis implements subcommands.Command.Synopsis. func (*Checkpoint) Synopsis() string { - return "checkpoint current state of container" + return "checkpoint current state of container (experimental)" } // Usage implements subcommands.Command.Usage. diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index cc55beeaf..69cdb35c1 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -42,7 +42,7 @@ func (*Restore) Name() string { // Synopsis implements subcommands.Command.Synopsis. func (*Restore) Synopsis() string { - return "restore a saved state of container" + return "restore a saved state of container (experimental)" } // Usage implements subcommands.Command.Usage. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index ae500e7d0..a6bb39c5d 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -22,6 +22,7 @@ import ( "path" "path/filepath" "reflect" + "strconv" "strings" "sync" "syscall" @@ -106,6 +107,56 @@ func procListToString(pl []*control.Process) string { return fmt.Sprintf("[%s]", strings.Join(strs, ",")) } +// createWriteableOutputFile creates an output file that can be read and written to in the sandbox. +func createWriteableOutputFile(path string) (*os.File, error) { + outputFile, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) + if err != nil { + return nil, fmt.Errorf("error creating file: %q, %v", path, err) + } + + // Chmod to allow writing after umask. + if err := outputFile.Chmod(0666); err != nil { + return nil, fmt.Errorf("error chmoding file: %q, %v", path, err) + } + return outputFile, nil +} + +func readOutputNum(outputFile *os.File, path string, first bool) (int, error) { + var num int + time.Sleep(1 * time.Second) + + // Check that outputFile exists and contains counting data. + fileInfo, err := os.Stat(path) + if err != nil { + return 0, fmt.Errorf("error creating output file: %v", err) + } + + if fileInfo.Size() == 0 { + return 0, fmt.Errorf("failed to write to file, file still appears empty") + } + + // Read the first number in the new file + outputFileContent, err := ioutil.ReadAll(outputFile) + if err != nil { + return 0, fmt.Errorf("error reading file: %v", err) + } + if len(outputFileContent) == 0 { + return 0, fmt.Errorf("error no content was read") + } + + nums := strings.Split(string(outputFileContent), "\n") + + if first { + num, err = strconv.Atoi(nums[0]) + } else { + num, err = strconv.Atoi(nums[len(nums)-2]) + } + if err != nil { + return 0, fmt.Errorf("error getting number from file: %v", err) + } + return num, nil +} + // run starts the sandbox and waits for it to exit, checking that the // application succeeded. func run(spec *specs.Spec) error { @@ -429,13 +480,28 @@ func TestExec(t *testing.T) { } } -// TestCheckpoint verifies that calling checkpoint with an image-path flag succeeds. -// Since there is no current default image path, confirming that calling -// checkpoint without an image path fails. 
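// readOutputNum above takes nums[len(nums)-2] for the last value because the
// output file ends with a newline, so strings.Split leaves a trailing empty
// element. A standalone sketch of that detail (illustration only):
package main

import (
	"fmt"
	"strconv"
	"strings"
)

func lastNumber(content string) (int, error) {
	lines := strings.Split(content, "\n")
	if len(lines) < 2 {
		return 0, fmt.Errorf("no complete line in %q", content)
	}
	// lines[len(lines)-1] is "" for newline-terminated content; the last
	// complete value sits just before it.
	return strconv.Atoi(lines[len(lines)-2])
}

func main() {
	n, err := lastNumber("0\n1\n2\n")
	fmt.Println(n, err) // 2 <nil>
}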
-// Checks that there is a file with the name and location given by image path. -func TestCheckpoint(t *testing.T) { - // Container will succeed. - spec := testutil.NewSpecWithArgs("sleep", "100") +// TestCheckpointRestore creates a container that continuously writes successive integers +// to a file. To test checkpoint and restore functionality, the container is +// checkpointed and the last number printed to the file is recorded. Then, it is restored in two +// new containers and the first number printed from these containers is checked. Both should +// be the next consecutive number after the last number from the checkpointed container. +func TestCheckpointRestore(t *testing.T) { + outputPath := filepath.Join(os.TempDir(), "output") + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() + + outputFileSandbox := strings.Replace(outputPath, os.TempDir(), "/tmp2", -1) + + script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %s; sleep 1; done", outputFileSandbox) + spec := testutil.NewSpecWithArgs("bash", "-c", script) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: "/tmp2", + Source: os.TempDir(), + }) rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { @@ -464,20 +530,80 @@ func TestCheckpoint(t *testing.T) { } defer file.Close() + time.Sleep(1 * time.Second) + // Checkpoint running container; save state into new file. if err := cont.Checkpoint(file); err != nil { t.Fatalf("error checkpointing container to empty file: %v", err) } defer os.RemoveAll(imagePath) - // Check to see if file exists and contains data. - fileInfo, err := os.Stat(imagePath) + lastNum, err := readOutputNum(outputFile, outputPath, false) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() + + // Restore into a new container. + cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", imagePath) if err != nil { - t.Fatalf("error checkpointing container: %v", err) + t.Fatalf("error creating container: %v", err) + } + defer cont2.Destroy() + if err := cont2.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + firstNum, err := readOutputNum(outputFile2, outputPath, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) } - if size := fileInfo.Size(); size == 0 { - t.Fatalf("failed checkpoint, file still appears empty: %v", err) + + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + if lastNum+1 != firstNum { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) } + + // Restore into another container! + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile3, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile3.Close() + + // Restore into a new container. 
+ cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", imagePath) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont3.Destroy() + if err := cont3.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + firstNum2, err := readOutputNum(outputFile3, outputPath, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + if lastNum+1 != firstNum2 { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) + } + } // TestPauseResume tests that we can successfully pause and resume a container. diff --git a/runsc/main.go b/runsc/main.go index dfb338b0f..10ae44b5e 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -71,6 +71,7 @@ func main() { subcommands.Register(subcommands.FlagsCommand(), "") // Register user-facing runsc commands. + subcommands.Register(new(cmd.Checkpoint), "") subcommands.Register(new(cmd.Create), "") subcommands.Register(new(cmd.Delete), "") subcommands.Register(new(cmd.Events), "") @@ -80,6 +81,7 @@ func main() { subcommands.Register(new(cmd.List), "") subcommands.Register(new(cmd.Pause), "") subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Restore), "") subcommands.Register(new(cmd.Resume), "") subcommands.Register(new(cmd.Run), "") subcommands.Register(new(cmd.Start), "") diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 0d9e09e9d..34243e623 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -266,17 +266,6 @@ func IsSupportedDevMount(m specs.Mount) bool { return true } -// SupportedMounts filters out unsupported mounts. -func SupportedMounts(mounts []specs.Mount) []specs.Mount { - var newMounts []specs.Mount - for _, m := range mounts { - if IsSupportedDevMount(m) { - newMounts = append(newMounts, m) - } - } - return newMounts -} - // BinPath returns the real path to self, resolving symbolink links. This is done // to make the process name appears as 'runsc', instead of 'exe'. func BinPath() (string, error) { -- cgit v1.2.3 From ca353b53ed09c94ecf655dba991c7b587026d12a Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Fri, 29 Jun 2018 15:50:38 -0700 Subject: Fix typo. PiperOrigin-RevId: 202720658 Change-Id: Iff42fd23f831ee7f29ddd6eb867020b76ed1eb23 --- runsc/container/container.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 8dca721f6..30323138a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -135,7 +135,7 @@ func Load(rootDir, id string) (*Container, error) { // see if this particular container is still running. } else { // Sandbox no longer exists, so this container - // definitly does not exist. + // definitely does not exist. 
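// Registering the new Checkpoint and Restore commands in main.go (shown
// above) is all the CLI wiring runsc needs, because the
// github.com/google/subcommands package drives flag parsing and dispatch. A
// minimal, self-contained example of the same pattern using a hypothetical
// "hello" command (not part of runsc):
package main

import (
	"context"
	"flag"
	"fmt"
	"os"

	"github.com/google/subcommands"
)

type hello struct{ name string }

func (*hello) Name() string     { return "hello" }
func (*hello) Synopsis() string { return "print a greeting" }
func (*hello) Usage() string    { return "hello [-name NAME]\n" }
func (h *hello) SetFlags(f *flag.FlagSet) {
	f.StringVar(&h.name, "name", "world", "who to greet")
}

func (h *hello) Execute(_ context.Context, _ *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
	fmt.Printf("hello, %s\n", h.name)
	return subcommands.ExitSuccess
}

func main() {
	subcommands.Register(subcommands.HelpCommand(), "")
	subcommands.Register(new(hello), "")
	flag.Parse()
	os.Exit(int(subcommands.Execute(context.Background())))
}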
c.Status = Stopped c.Sandbox = nil } -- cgit v1.2.3 From fa64c2a1517d20c08447bb2230f2903ec3baade9 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 2 Jul 2018 12:50:37 -0700 Subject: Make default limits the same as with runc Closes #2 PiperOrigin-RevId: 202997196 Change-Id: I0c9f6f5a8a1abe1ae427bca5f590bdf9f82a6675 --- README.md | 34 ++++++++++++++-------------------- runsc/boot/limits.go | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 30 deletions(-) (limited to 'runsc') diff --git a/README.md b/README.md index 1432c09d8..ed989b0ed 100644 --- a/README.md +++ b/README.md @@ -361,10 +361,20 @@ Then restart the Docker daemon. ## FAQ & Known Issues +### Will my container work with gVisor? + +gVisor implements a large portion of the Linux surface and while we strive to +make it broadly compatible, there are (and always will be) unimplemented +features and bugs. The only real way to know if it will work is to try. If you +find a container that doesn’t work and there is no known issue, please [file a +bug][bug] indicating the full command you used to run the image. Providing the +debug logs is also helpful. + ### What works? The following applications/images have been tested: +* elasticsearch * golang * httpd * java8 @@ -384,33 +394,17 @@ The following applications/images have been tested: * tomcat * wordpress -### What doesn't work yet? - -The following applications have been tested and may not yet work: - -* elasticsearch: Requires unimplemented socket ioctls. See [bug - #2](https://github.com/google/gvisor/issues/2). - -### Will my container work with gVisor? +### My container runs fine with *runc* but fails with *runsc*. -gVisor implements a large portion of the Linux surface and while we strive to -make it broadly compatible, there are (and always will be) unimplemented -features and bugs. The only real way to know if it will work is to try. If you -find a container that doesn’t work and there is no known issue, please [file a -bug][bug] indicating the full command you used to run the image. Providing the -debug logs is also helpful. +If you’re having problems running a container with `runsc` it’s most likely due +to a compatibility issue or a missing feature in gVisor. See **Debugging**, +above. ### When I run my container, docker fails with `flag provided but not defined: -console` You're using an old version of Docker. Refer to the [Requirements](#requirements) section for the minimum version supported. -### My container runs fine with *runc* but fails with *runsc*. - -If you’re having problems running a container with `runsc` it’s most likely due -to a compatibility issue or a missing feature in gVisor. See **Debugging**, -above. - ### I can’t see a file copied with `docker cp` or `kubectl cp`. For performance reasons, gVisor caches directory contents, and therefore it may diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index ea72de8e9..510497eba 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -23,29 +23,50 @@ import ( // Mapping from linux resource names to limits.LimitType. 
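// The map that follows translates RLIMIT_* names from the OCI runtime spec
// into sentry limit types, and createLimitSet then layers the spec's values
// over runc-style defaults. For reference, a spec rlimit entry looks roughly
// like this; the values are invented and the field names follow the
// runtime-spec Go bindings:
package main

import (
	"fmt"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	process := specs.Process{
		Rlimits: []specs.POSIXRlimit{
			// Overrides the default number of open files; anything not
			// listed keeps the default set in createLimitSet.
			{Type: "RLIMIT_NOFILE", Hard: 4096, Soft: 1024},
		},
	}
	for _, rl := range process.Rlimits {
		fmt.Printf("%s: soft=%d hard=%d\n", rl.Type, rl.Soft, rl.Hard)
	}
}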
var fromLinuxResource = map[string]limits.LimitType{ + "RLIMIT_AS": limits.AS, + "RLIMIT_CORE": limits.Core, "RLIMIT_CPU": limits.CPU, - "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_DATA": limits.Data, - "RLIMIT_STACK": limits.Stack, - "RLIMIT_CORE": limits.Core, - "RLIMIT_RSS": limits.Rss, - "RLIMIT_NPROC": limits.ProcessCount, - "RLIMIT_NOFILE": limits.NumberOfFiles, - "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, - "RLIMIT_AS": limits.AS, + "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_SIGPENDING": limits.SignalsPending, + "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, + "RLIMIT_NOFILE": limits.NumberOfFiles, + "RLIMIT_NPROC": limits.ProcessCount, + "RLIMIT_RSS": limits.Rss, "RLIMIT_RTPRIO": limits.RealTimePriority, "RLIMIT_RTTIME": limits.Rttime, + "RLIMIT_SIGPENDING": limits.SignalsPending, + "RLIMIT_STACK": limits.Stack, } func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { - ls, err := limits.NewLinuxDistroLimitSet() + ls, err := limits.NewLinuxLimitSet() if err != nil { return nil, err } + + // Set default limits based on what containers get by default, ex: + // $ docker run --rm debian prlimit + ls.SetUnchecked(limits.AS, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Core, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.CPU, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) + ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) + ls.SetUnchecked(limits.ProcessCount, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Rss, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.RealTimePriority, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.Rttime, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}) + + // Then apply overwrites on top of defaults. for _, rl := range spec.Process.Rlimits { lt, ok := fromLinuxResource[rl.Type] if !ok { -- cgit v1.2.3 From 126296ce2adce615005ae16edb8b80e3bfae56cd Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 2 Jul 2018 14:51:20 -0700 Subject: runsc: fix panic for `runsc wait` on stopped container. PiperOrigin-RevId: 203016694 Change-Id: Ic51ef754aa6d7d1b3b35491aff96a63d7992e122 --- runsc/container/container.go | 9 +++++++++ runsc/container/container_test.go | 6 ++++++ 2 files changed, 15 insertions(+) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 30323138a..c4e5bf9f6 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -352,6 +352,9 @@ func (c *Container) Pid() int { // Wait waits for the container to exit, and returns its WaitStatus. 
func (c *Container) Wait() (syscall.WaitStatus, error) { log.Debugf("Wait on container %q", c.ID) + if c.Status == Stopped { + return 0, fmt.Errorf("container is stopped") + } return c.Sandbox.Wait(c.ID) } @@ -359,6 +362,9 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) + if c.Status == Stopped { + return 0, fmt.Errorf("container is stopped") + } return c.Sandbox.WaitPID(pid, c.Sandbox.ID) } @@ -366,6 +372,9 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { // its WaitStatus. func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in container %q", pid, c.ID) + if c.Status == Stopped { + return 0, fmt.Errorf("container is stopped") + } return c.Sandbox.WaitPID(pid, c.ID) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index a6bb39c5d..d2f3cc14a 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1211,6 +1211,9 @@ func TestMultiContainerWait(t *testing.T) { } else if es := ws.ExitStatus(); es != 0 { t.Errorf("process %q exited with non-zero status %d", strings.Join(containers[1].Spec.Process.Args, " "), es) } + if _, err := containers[1].Wait(); err == nil { + t.Errorf("wait for stopped process %q should fail", strings.Join(containers[1].Spec.Process.Args, " ")) + } // After Wait returns, ensure that the root container is running and // the child has finished. @@ -1231,6 +1234,9 @@ func TestMultiContainerWait(t *testing.T) { } else if es := ws.ExitStatus(); es != 0 { t.Errorf("PID %d exited with non-zero status %d", pid, es) } + if _, err := containers[0].WaitPID(pid); err == nil { + t.Errorf("wait for stopped PID %d should fail", pid) + } }() } -- cgit v1.2.3 From 614475196201a380d969ed269d99a8ad70ca1885 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 2 Jul 2018 17:46:35 -0700 Subject: runsc/boot/filter: permit SYS_TIME for race glibc's malloc also uses SYS_TIME. Permit it. #0 0x0000000000de6267 in time () #1 0x0000000000db19d8 in get_nprocs () #2 0x0000000000d8a31a in arena_get2.part () #3 0x0000000000d8ab4a in malloc () #4 0x0000000000d3c6b5 in __sanitizer::InternalAlloc(unsigned long, __sanitizer::SizeClassAllocatorLocalCache<__sanitizer::SizeClassAllocator32<0ul, 140737488355328ull, 0ul, __sanitizer::SizeClassMap<3ul, 4ul, 8ul, 17ul, 64ul, 14ul>, 20ul, __sanitizer::TwoLevelByteMap<32768ull, 4096ull, __sanitizer::NoOpMapUnmapCallback>, __sanitizer::NoOpMapUnmapCallback> >*, unsigned long) () #5 0x0000000000d4cd70 in __tsan_go_start () #6 0x00000000004617a3 in racecall () #7 0x00000000010f4ea0 in runtime.findfunctab () #8 0x000000000043f193 in runtime.racegostart () Signed-off-by: Dmitry Vyukov [mpratt@google.com: updated comments and commit message] Signed-off-by: Michael Pratt Change-Id: Ibe2d0dc3035bf5052d5fb802cfaa37c5e0e7a09a PiperOrigin-RevId: 203042627 --- runsc/boot/filter/extra_filters_race.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'runsc') diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index c810773df..ebd56c553 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -34,5 +34,7 @@ func instrumentationFilters() seccomp.SyscallRules { syscall.SYS_NANOSLEEP: {}, syscall.SYS_OPEN: {}, syscall.SYS_SET_ROBUST_LIST: {}, + // Used within glibc's malloc. 
+ syscall.SYS_TIME: {}, } } -- cgit v1.2.3 From 4500155ffc5edfc2d417297d3367f5656dbea5a7 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 3 Jul 2018 10:35:27 -0700 Subject: runsc: Mount "mandatory" mounts right after mounting the root. The /proc and /sys mounts are "mandatory" in the sense that they should be mounted in the sandbox even when they are not included in the spec. Runsc treats /tmp similarly, because it is faster to use the internal tmpfs implementation instead of proxying to the host. However, the spec may contain submounts of these mandatory mounts (particularly for /tmp). In those cases, we must mount our mandatory mounts before the submount, otherwise the submount will be masked. Since the mandatory mounts are all top-level directories, we can mount them right after the root. PiperOrigin-RevId: 203145635 Change-Id: Id69bae771d32c1a5b67e08c8131b73d9b42b2fbf --- runsc/boot/fs.go | 22 ++++++++++++++++------ runsc/boot/loader_test.go | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index a9b2f225a..f36bcdc2e 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -83,7 +83,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec } // compileMounts returns the supported mounts from the mount spec, adding any -// additional mounts that are required by the OCI specification. +// mandatory mounts that are required by the OCI specification. func compileMounts(spec *specs.Spec) []specs.Mount { // Keep track of whether proc, sys, and tmp were mounted. var procMounted, sysMounted, tmpMounted bool @@ -119,14 +119,15 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. + var mandatoryMounts []specs.Mount if !procMounted { - mounts = append(mounts, specs.Mount{ + mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: "proc", Destination: "/proc", }) } if !sysMounted { - mounts = append(mounts, specs.Mount{ + mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: "sysfs", Destination: "/sys", }) @@ -136,11 +137,20 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // rely on the host /tmp, but this is a nice optimization, and fixes // some apps that call mknod in /tmp. if !tmpMounted { - mounts = append(mounts, specs.Mount{ + // TODO: If the host /tmp (or a mount at /tmp) has + // files in it, we should overlay our tmpfs implementation over + // that. Until then, the /tmp mount will always appear empty at + // container creation. + mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: "tmpfs", Destination: "/tmp", }) } + + // The mandatory mounts should be ordered right after the root, in case + // there are submounts of these mandatory mounts already in the spec. + mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...) + return mounts } @@ -430,8 +440,8 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f if err != nil { return err } - // TODO: Fix this when we support all the mount types and make this a - // fatal error. + // TODO: Fix this when we support all the mount types and + // make this a fatal error. 
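// The append expression above,
//   mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
// is Go's usual prepend idiom: the inner append copies the mandatory mounts
// followed by the existing ones into a fresh slice, so they end up first and
// get mounted right after the root. A tiny standalone demonstration with
// illustrative values (not the runsc code itself):
package main

import "fmt"

func main() {
	mounts := []string{"/dev", "/dev/pts", "/tmp/baz"}
	mandatory := []string{"/proc", "/sys", "/tmp"}

	// The inner append builds its result in a separate backing array, so
	// reusing mounts[:0] on the outside simply reassigns the combined slice.
	mounts = append(mounts[:0], append(mandatory, mounts[0:]...)...)

	fmt.Println(mounts) // [/proc /sys /tmp /dev /dev/pts /tmp/baz]
}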
if fsName == "" { return nil } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 28d45b54b..30ec236e4 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -304,6 +304,37 @@ func TestCreateMountNamespace(t *testing.T) { }, expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, }, + { + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + // We don't include /sys, and /tmp in + // the spec, since they will be added + // automatically. + // + // Instead, add submounts inside these + // directories and make sure they are + // visible under the mandatory mounts. + { + Destination: "/sys/bar", + Type: "tmpfs", + }, + { + Destination: "/tmp/baz", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"}, + }, } for _, tc := range testCases { @@ -494,14 +525,14 @@ func TestRestoreEnvironment(t *testing.T) { }, }, "tmpfs": { + { + Dev: "none", + }, { Dev: "none", Flags: fs.MountSourceFlags{NoAtime: true}, Data: "uid=1022", }, - { - Dev: "none", - }, }, "devtmpfs": { { @@ -587,7 +618,7 @@ func TestRestoreEnvironment(t *testing.T) { } } else { if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { - t.Fatalf("restore environments did not match for test:%s", tc.name) + t.Fatalf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv) } } } -- cgit v1.2.3 From c1b4c1ffee850aea2880f5f91a1e48e840933c71 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 3 Jul 2018 11:33:20 -0700 Subject: Fix flaky image_test - Some failures were being ignored in run_tests.sh - Give more time for mysql to setup - Fix typo with network=host tests - Change httpd test to wait on http server being available, not only output PiperOrigin-RevId: 203156896 Change-Id: Ie1801dcd76e9b5fe4722c4d8695c76e40988dd74 --- kokoro/gcp_ubuntu/run_tests.sh | 15 +++++++++------ runsc/test/image/BUILD | 2 +- runsc/test/image/image_test.go | 14 +++++++------- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'runsc') diff --git a/kokoro/gcp_ubuntu/run_tests.sh b/kokoro/gcp_ubuntu/run_tests.sh index 5554350da..978a21599 100755 --- a/kokoro/gcp_ubuntu/run_tests.sh +++ b/kokoro/gcp_ubuntu/run_tests.sh @@ -44,12 +44,15 @@ bazel test --test_output=errors //... 
exit_code=${?} if [[ ${exit_code} -eq 0 ]]; then - # image_test is tagged manual - bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime} //runsc/test/image:image_test - bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-kvm //runsc/test/image:image_test - bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-nethost //runsc/test/image:image_test - bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-overlay //runsc/test/image:image_test - exit_code=${?} + declare -a variations=("" "-kvm" "-hostnet" "-overlay") + for v in "${variations[@]}"; do + # image_test is tagged manual + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}${v} //runsc/test/image:image_test + exit_code=${?} + if [[ ${exit_code} -ne 0 ]]; then + break + fi + done fi # Best effort to uninstall diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index fb9db4d4c..8fdaa1e5c 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -4,7 +4,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_test( name = "image_test", - size = "small", + size = "large", srcs = ["image_test.go"], data = [ "latin10k.txt", diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 849bf76a2..ed2111107 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -156,7 +156,7 @@ func (d *docker) waitForOutput(pattern string, timeout time.Duration) error { // Success! return nil } - time.Sleep(10 * time.Millisecond) + time.Sleep(100 * time.Millisecond) } return fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) } @@ -168,7 +168,7 @@ func (d *docker) waitForHTTP(port int, timeout time.Duration) error { // Success! return nil } - time.Sleep(10 * time.Millisecond) + time.Sleep(100 * time.Millisecond) } return fmt.Errorf("timeout waiting for HTTP server on port %d", port) } @@ -238,8 +238,8 @@ func TestHttpd(t *testing.T) { } // Wait until it's up and running. - if err := d.waitForOutput("'httpd -D FOREGROUND'", 5*time.Second); err != nil { - t.Fatalf("docker.WaitForOutput() timeout: %v", err) + if err := d.waitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("docker.WaitForHTTP() timeout: %v", err) } if err := testHTTPServer(port); err != nil { @@ -287,7 +287,7 @@ func TestMysql(t *testing.T) { defer d.cleanUp() // Wait until it's up and running. - if err := d.waitForOutput("port: 3306 MySQL Community Server", 30*time.Second); err != nil { + if err := d.waitForOutput("port: 3306 MySQL Community Server", 3*time.Minute); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } @@ -311,10 +311,10 @@ func TestMysql(t *testing.T) { defer client.cleanUp() // Ensure file executed to the end and shutdown mysql. 
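// The waitForOutput/waitForHTTP helpers above poll with a short sleep until a
// deadline rather than sleeping for a fixed time, so raising their timeouts
// only slows down the failure case. A generic, standalone sketch of that
// readiness loop (assumed shape, not the test helper itself):
package main

import (
	"fmt"
	"net/http"
	"time"
)

func waitForHTTP(port int, timeout time.Duration) error {
	url := fmt.Sprintf("http://localhost:%d/", port)
	for start := time.Now(); time.Since(start) < timeout; {
		if resp, err := http.Get(url); err == nil {
			resp.Body.Close()
			return nil // Server is up.
		}
		time.Sleep(100 * time.Millisecond)
	}
	return fmt.Errorf("timeout waiting for HTTP server on port %d", port)
}

func main() {
	fmt.Println(waitForHTTP(8080, 2*time.Second))
}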
- if err := client.waitForOutput("--------------\nshutdown\n--------------", 5*time.Second); err != nil { + if err := client.waitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } - if err := d.waitForOutput("mysqld: Shutdown complete", 15*time.Second); err != nil { + if err := d.waitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } } -- cgit v1.2.3 From 0ef606616732e1daf9d424658c31e5fed8f1ee4a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 3 Jul 2018 11:38:08 -0700 Subject: Resend packets back to netstack if destined to itself Add option to redirect packet back to netstack if it's destined to itself. This fixes the problem where connecting to the local NIC address would not work, e.g.: echo bar | nc -l -p 8080 & echo foo | nc 192.168.0.2 8080 PiperOrigin-RevId: 203157739 Change-Id: I31c9f7c501e3f55007f25e1852c27893a16ac6c4 --- pkg/tcpip/link/fdbased/endpoint.go | 51 +++++++++++++++++++++++--------------- pkg/tcpip/stack/nic.go | 3 +-- pkg/tcpip/stack/route.go | 16 ++++++++---- pkg/tcpip/stack/stack.go | 2 +- runsc/boot/network.go | 9 +++---- 5 files changed, 48 insertions(+), 33 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 668514454..0c844c05b 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -45,10 +45,14 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - vv *buffer.VectorisedView - iovecs []syscall.Iovec - views []buffer.View - attached bool + vv *buffer.VectorisedView + iovecs []syscall.Iovec + views []buffer.View + dispatcher stack.NetworkDispatcher + + // egressLocal indicates whether packets destined to itself should be + // forwarded to the FD endpoint (true) or be sent back to netstack (false). + egressLocal bool } // Options specify the details about the fd-based endpoint to be created. @@ -59,6 +63,7 @@ type Options struct { ChecksumOffload bool ClosedFunc func(*tcpip.Error) Address tcpip.LinkAddress + EgressLocal bool } // New creates a new fd-based endpoint. @@ -80,14 +85,15 @@ func New(opts *Options) tcpip.LinkEndpointID { } e := &endpoint{ - fd: opts.FD, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: hdrSize, - views: make([]buffer.View, len(BufConfig)), - iovecs: make([]syscall.Iovec, len(BufConfig)), + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + views: make([]buffer.View, len(BufConfig)), + iovecs: make([]syscall.Iovec, len(BufConfig)), + egressLocal: opts.EgressLocal, } vv := buffer.NewVectorisedView(0, e.views) e.vv = &vv @@ -97,13 +103,13 @@ func New(opts *Options) tcpip.LinkEndpointID { // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { - e.attached = true - go e.dispatchLoop(dispatcher) // S/R-FIXME + e.dispatcher = dispatcher + go e.dispatchLoop() // S/R-FIXME } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *endpoint) IsAttached() bool { - return e.attached + return e.dispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. 
It returns the value initialized @@ -130,6 +136,12 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress { // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r *stack.Route, hdr *buffer.Prependable, payload buffer.View, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { + if !e.egressLocal && r.LocalAddress != "" && r.LocalAddress == r.RemoteAddress { + hdrView := hdr.View() + vv := buffer.NewVectorisedView(len(hdrView)+len(payload), []buffer.View{hdrView, payload}) + e.dispatcher.DeliverNetworkPacket(e, r.RemoteLinkAddress, protocol, &vv) + return nil + } if e.hdrSize > 0 { // Add ethernet header if needed. eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) @@ -142,7 +154,6 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr *buffer.Prependable, payload if len(payload) == 0 { return rawfile.NonBlockingWrite(e.fd, hdr.UsedBytes()) - } return rawfile.NonBlockingWrite2(e.fd, hdr.UsedBytes(), payload) @@ -175,7 +186,7 @@ func (e *endpoint) allocateViews(bufConfig []int) { } // dispatch reads one packet from the file descriptor and dispatches it. -func (e *endpoint) dispatch(d stack.NetworkDispatcher, largeV buffer.View) (bool, *tcpip.Error) { +func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { e.allocateViews(BufConfig) n, err := rawfile.BlockingReadv(e.fd, e.iovecs) @@ -211,7 +222,7 @@ func (e *endpoint) dispatch(d stack.NetworkDispatcher, largeV buffer.View) (bool e.vv.SetSize(n) e.vv.TrimFront(e.hdrSize) - d.DeliverNetworkPacket(e, addr, p, e.vv) + e.dispatcher.DeliverNetworkPacket(e, addr, p, e.vv) // Prepare e.views for another packet: release used views. for i := 0; i < used; i++ { @@ -223,10 +234,10 @@ func (e *endpoint) dispatch(d stack.NetworkDispatcher, largeV buffer.View) (bool // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. -func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) *tcpip.Error { +func (e *endpoint) dispatchLoop() *tcpip.Error { v := buffer.NewView(header.MaxIPPacketSize) for { - cont, err := e.dispatch(d, v) + cont, err := e.dispatch(v) if err != nil || !cont { if e.closed != nil { e.closed(err) diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 8ff4310d5..06bb5abc5 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -317,8 +317,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remoteLinkAddr tcpip.Lin return } - r := makeRoute(protocol, dst, src, ref) - r.LocalLinkAddress = linkEP.LinkAddress() + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) r.RemoteLinkAddress = remoteLinkAddr ref.ep.HandlePacket(&r, vv) ref.decRef() diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 12f5efba5..e4f10cfa1 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -40,12 +40,13 @@ type Route struct { // makeRoute initializes a new route. It takes ownership of the provided // reference to a network endpoint. 
-func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, ref *referencedNetworkEndpoint) Route { +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint) Route { return Route{ - NetProto: netProto, - LocalAddress: localAddr, - RemoteAddress: remoteAddr, - ref: ref, + NetProto: netProto, + LocalAddress: localAddr, + LocalLinkAddress: localLinkAddr, + RemoteAddress: remoteAddr, + ref: ref, } } @@ -82,6 +83,11 @@ func (r *Route) Resolve(waker *sleep.Waker) *tcpip.Error { nextAddr := r.NextHop if nextAddr == "" { + // Local link address is already known. + if r.RemoteAddress == r.LocalAddress { + r.RemoteLinkAddress = r.LocalLinkAddress + return nil + } nextAddr = r.RemoteAddress } linkAddr, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker) diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 3976f585c..d1d762a8e 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -647,7 +647,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n remoteAddr = ref.ep.ID().LocalAddress } - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, ref) + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref) r.NextHop = s.routeTable[i].Gateway return r, nil } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index d2b52c823..df45218b9 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -134,11 +134,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } linkEP := fdbased.New(&fdbased.Options{ - FD: newFD, - MTU: uint32(link.MTU), - ChecksumOffload: false, - EthernetHeader: true, - Address: tcpip.LinkAddress(generateRndMac()), + FD: newFD, + MTU: uint32(link.MTU), + EthernetHeader: true, + Address: tcpip.LinkAddress(generateRndMac()), }) log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) -- cgit v1.2.3 From 138cb8da5043c1c8f59f4c27b727383e5ad8254e Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Tue, 3 Jul 2018 11:57:04 -0700 Subject: runsc: `runsc wait` print wait status. PiperOrigin-RevId: 203160639 Change-Id: I8fb2787ba0efb7eacd9d4c934238a26eb5ae79d5 --- runsc/cmd/wait.go | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 8437457c4..b41edc725 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -15,6 +15,8 @@ package cmd import ( + "encoding/json" + "os" "syscall" "context" @@ -75,7 +77,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("error loading container: %v", err) } - waitStatus := args[1].(*syscall.WaitStatus) + var waitStatus syscall.WaitStatus switch { // Wait on the whole container. case wt.rootPID == unsetPID && wt.pid == unsetPID: @@ -83,21 +85,43 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error waiting on container %q: %v", c.ID, err) } - *waitStatus = ws + waitStatus = ws // Wait on a PID in the root PID namespace. 
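// With this change, `runsc wait` writes its result to stdout as a single
// JSON object (the waitResult type defined just below). A standalone sketch
// of that output format; the container ID here is invented:
package main

import (
	"encoding/json"
	"os"
)

type waitResult struct {
	ID         string `json:"id"`
	ExitStatus int    `json:"exitStatus"`
}

func main() {
	// Prints: {"id":"example-container","exitStatus":0}
	json.NewEncoder(os.Stdout).Encode(waitResult{
		ID:         "example-container",
		ExitStatus: 0,
	})
}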
case wt.rootPID != unsetPID: ws, err := c.WaitRootPID(int32(wt.rootPID)) if err != nil { Fatalf("error waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) } - *waitStatus = ws + waitStatus = ws // Wait on a PID in the container's PID namespace. case wt.pid != unsetPID: ws, err := c.WaitPID(int32(wt.pid)) if err != nil { Fatalf("error waiting on PID %d in container %q: %v", wt.pid, c.ID, err) } - *waitStatus = ws + waitStatus = ws + } + result := waitResult{ + ID: id, + ExitStatus: exitStatus(waitStatus), + } + // Write json-encoded wait result directly to stdout. + if err := json.NewEncoder(os.Stdout).Encode(result); err != nil { + Fatalf("error marshaling wait result: %v", err) } return subcommands.ExitSuccess } + +type waitResult struct { + ID string `json:"id"` + ExitStatus int `json:"exitStatus"` +} + +// exitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly. +func exitStatus(status syscall.WaitStatus) int { + if status.Signaled() { + return 128 + int(status.Signal()) + } + return status.ExitStatus() +} -- cgit v1.2.3 From 52ddb8571c466577843d8eb1c5e270dd54f1ade6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 3 Jul 2018 12:00:09 -0700 Subject: Skip overlay on root when its readonly PiperOrigin-RevId: 203161098 Change-Id: Ia1904420cb3ee830899d24a4fe418bba6533be64 --- runsc/boot/fs.go | 2 +- runsc/container/container_test.go | 71 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index f36bcdc2e..51c8d620d 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -207,7 +207,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f return nil, fmt.Errorf("error adding submount overlay: %v", err) } - if conf.Overlay { + if conf.Overlay && !spec.Root.Readonly { log.Debugf("Adding overlay on top of root mount") // Overlay a tmpfs filesystem on top of the root. rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index d2f3cc14a..72b115628 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -998,6 +998,77 @@ func TestMountNewDir(t *testing.T) { } } +func TestReadonlyRoot(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec.Root.Readonly = true + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + conf.Overlay = true + + // Create, start and wait for the container. 
+ s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } +} + +func TestReadonlyMount(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/touch", "/foo/file") + dir, err := ioutil.TempDir("", "ro-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: "/foo", + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + conf.Overlay = true + + // Create, start and wait for the container. + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } +} + // TestAbbreviatedIDs checks that runsc supports using abbreviated container // IDs in place of full IDs. func TestAbbreviatedIDs(t *testing.T) { -- cgit v1.2.3 From 660f1203ff1949a7b7869b801f4aa2133d30b91f Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 3 Jul 2018 12:52:39 -0700 Subject: Fix runsc VDSO mapping 80bdf8a4068de3ac4a73b6b61a0cdcfe3e3571af accidentally moved vdso into an inner scope, never assigning the vdso variable passed to the Kernel and thus skipping VDSO mappings. Fix this and remove the ability for loadVDSO to skip VDSO mappings, since tests that do so are gone. PiperOrigin-RevId: 203169135 Change-Id: Ifd8cadcbaf82f959223c501edcc4d83d05327eba --- pkg/sentry/loader/vdso.go | 6 ------ runsc/boot/loader.go | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 037576e41..2e8693f8e 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -278,12 +278,6 @@ func PrepareVDSO(p platform.Platform) (*VDSO, error) { // // loadVDSO takes a reference on the VDSO and parameter page FrameRegions. func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (usermem.Addr, error) { - if v == nil { - // Should be used only by tests. - ctx.Warningf("No VDSO provided, skipping VDSO mapping") - return 0, nil - } - if v.os != bin.os { ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os) return 0, syserror.ENOEXEC diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 6fcfba5cb..a3cc0e4a4 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -134,7 +134,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. 
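The loader.go fix below replaces `:=` with `=`: the short declaration inside the inner scope created a new `vdso`, so the outer variable handed to the kernel stayed nil and the VDSO was never mapped. A self-contained illustration of that shadowing pattern (all names here are placeholders):

package main

import "fmt"

func prepare() (*string, error) {
	v := "vdso mappings"
	return &v, nil
}

func main() {
	var vdso *string
	var err error

	{
		vdso, err := prepare() // BUG: ':=' declares new vdso/err scoped to this block.
		_, _ = vdso, err
	}
	fmt.Println(vdso, err) // <nil> <nil>: the outer vdso was never assigned.

	{
		vdso, err = prepare() // FIX: '=' assigns to the outer variables.
	}
	fmt.Println(*vdso, err) // vdso mappings <nil>
}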
- vdso, err := loader.PrepareVDSO(k) + vdso, err = loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("error creating vdso: %v", err) } -- cgit v1.2.3 From f107a5b1a0e264d518617c57f0cf310b63e8b59c Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 6 Jul 2018 09:37:32 -0700 Subject: Tests pause and resume functionality on a Python container. PiperOrigin-RevId: 203488336 Change-Id: I55e1b646f1fae73c27a49e064875d55f5605b200 --- runsc/test/image/BUILD | 5 +++- runsc/test/image/python_test.go | 51 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 runsc/test/image/python_test.go (limited to 'runsc') diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index 8fdaa1e5c..e3985ecc4 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -5,7 +5,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_test( name = "image_test", size = "large", - srcs = ["image_test.go"], + srcs = [ + "image_test.go", + "python_test.go", + ], data = [ "latin10k.txt", "mysql.sql", diff --git a/runsc/test/image/python_test.go b/runsc/test/image/python_test.go new file mode 100644 index 000000000..e931bb444 --- /dev/null +++ b/runsc/test/image/python_test.go @@ -0,0 +1,51 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package image + +import ( + "fmt" + "net/http" + "testing" + "time" +) + +func TestPythonHello(t *testing.T) { + d := makeDocker("python-hello-test") + if out, err := d.run("-p", "8080", "google/python-hello"); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) + } + defer d.cleanUp() + + // Find where port 8080 is mapped to. + port, err := d.findPort(8080) + if err != nil { + t.Fatalf("docker.findPort(8080) failed: %v", err) + } + + // Wait until it's up and running. + if err := d.waitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + } + + // Ensure that content is being served. + url := fmt.Sprintf("http://localhost:%d", port) + resp, err := http.Get(url) + if err != nil { + t.Errorf("Error reaching http server: %v", err) + } + if want := http.StatusOK; resp.StatusCode != want { + t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } +} -- cgit v1.2.3 From b763b3992a2c4f16fc218e1920df5525dd75b114 Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Tue, 10 Jul 2018 14:57:20 -0700 Subject: Modified error message for clarity. Previously, error message only showed "" when child and pid were the same (since no error is returned by the Wait4 syscall in this case) which occurs when the process has incorrectly terminated. A new error message was added to improve clarity for such a case. Tests for this function were modified to reflect the improved distinction between process termination and error. 
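A standalone sketch of the distinction described above: with WNOHANG, syscall.Wait4 returns 0 while the child is still alive and returns the child's pid once it has terminated, and only a non-nil error means the wait itself failed. The child command below is just an example:

package main

import (
	"fmt"
	"os/exec"
	"syscall"
)

func main() {
	cmd := exec.Command("sleep", "1")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	pid := cmd.Process.Pid

	var ws syscall.WaitStatus
	var ru syscall.Rusage
	child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru)
	switch {
	case err != nil:
		fmt.Println("error waiting for process:", err) // the wait itself failed
	case child == 0:
		fmt.Println("process", pid, "is still running") // WNOHANG: nothing to reap yet
	case child == pid:
		fmt.Println("process", pid, "has terminated")
	}
	_ = cmd.Wait() // reap the child before exiting
}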
PiperOrigin-RevId: 204018107 Change-Id: Ib38481c9590405e5bafcb6efe27fd49b3948910c --- runsc/specutils/specutils.go | 9 +++++++-- runsc/specutils/specutils_test.go | 7 +++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 34243e623..861e7fd70 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -324,9 +324,14 @@ func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) er // Check if the process is still running. var ws syscall.WaitStatus var ru syscall.Rusage + + // If the process is alive, child is 0 because of the NOHANG option. + // If the process has terminated, child equals the process id. child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru) - if err != nil || child == pid { - return fmt.Errorf("process (%d) is not running, err: %v", pid, err) + if err != nil { + return fmt.Errorf("error waiting for process: %v", err) + } else if child == pid { + return fmt.Errorf("process %d has terminated", pid) } // Process continues to run, backoff and retry. diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 959be3af3..2dc5d90cc 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -76,8 +76,11 @@ func TestWaitForReadyNotRunning(t *testing.T) { err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { return false, nil }) - if !strings.Contains(err.Error(), "not running") { - t.Errorf("ProcessWaitReady got: %v, expected: not running", err) + if err != nil && !strings.Contains(err.Error(), "terminated") { + t.Errorf("ProcessWaitReady got: %v, expected: process terminated", err) + } + if err == nil { + t.Errorf("ProcessWaitReady incorrectly succeeded") } } -- cgit v1.2.3 From 81ae5f3df533d5e5990baaa105392f59e28d5730 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Wed, 11 Jul 2018 09:36:20 -0700 Subject: Created runsc and docker integration tests. Moved some of the docker image functions to testutil.go. Test runsc commands create, start, stop, pause, and resume. PiperOrigin-RevId: 204138452 Change-Id: Id00bc58d2ad230db5e9e905eed942187e68e7c7b --- runsc/test/image/image_test.go | 222 +++++------------------------ runsc/test/image/install.sh | 92 ------------ runsc/test/image/python_test.go | 14 +- runsc/test/install.sh | 92 ++++++++++++ runsc/test/integration/BUILD | 26 ++++ runsc/test/integration/integration.go | 16 +++ runsc/test/integration/integration_test.go | 148 +++++++++++++++++++ runsc/test/testutil/BUILD | 5 +- runsc/test/testutil/docker.go | 185 ++++++++++++++++++++++++ 9 files changed, 514 insertions(+), 286 deletions(-) delete mode 100755 runsc/test/image/install.sh create mode 100755 runsc/test/install.sh create mode 100644 runsc/test/integration/BUILD create mode 100644 runsc/test/integration/integration.go create mode 100644 runsc/test/integration/integration_test.go create mode 100644 runsc/test/testutil/docker.go (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index ed2111107..04c334d92 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -15,7 +15,7 @@ // Package image provides end-to-end image tests for runsc. These tests require // docker and runsc to be installed on the machine. 
To set it up, run: // -// ./runsc/test/image/install.sh [--runtime ] +// ./runsc/test/install.sh [--runtime ] // // The tests expect the runtime name to be provided in the RUNSC_RUNTIME // environment variable (default: runsc-test). @@ -28,14 +28,8 @@ package image import ( "fmt" "io/ioutil" - "log" - "math/rand" "net/http" "os" - "os/exec" - "path" - "regexp" - "strconv" "strings" "testing" "time" @@ -43,144 +37,14 @@ import ( "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) -func init() { - rand.Seed(time.Now().UnixNano()) -} - -func runtime() string { - r := os.Getenv("RUNSC_RUNTIME") - if r == "" { - return "runsc-test" - } - return r -} - -func mountArg(source, target string) string { - return fmt.Sprintf("%s:%s", source, target) -} - -func linkArg(source *docker, target string) string { - return fmt.Sprintf("%s:%s", source.name, target) -} - -// prepareFiles creates temp directory to copy files there. The sandbox doesn't -// have access to files in the test dir. -func prepareFiles(names ...string) (string, error) { - dir, err := ioutil.TempDir("", "image-test") - if err != nil { - return "", fmt.Errorf("ioutil.TempDir failed: %v", err) - } - if err := os.Chmod(dir, 0777); err != nil { - return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err) - } - for _, name := range names { - src := getLocalPath(name) - dst := path.Join(dir, name) - if err := testutil.Copy(src, dst); err != nil { - return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err) - } - } - return dir, nil -} - -func getLocalPath(file string) string { - return path.Join(".", file) -} - -type docker struct { - runtime string - name string -} - -func makeDocker(namePrefix string) docker { - suffix := fmt.Sprintf("-%06d", rand.Int())[:7] - return docker{name: namePrefix + suffix, runtime: runtime()} -} - -// do executes docker command. -func (d *docker) do(args ...string) (string, error) { - fmt.Printf("Running: docker %s\n", args) - cmd := exec.Command("docker", args...) - out, err := cmd.CombinedOutput() - if err != nil { - return "", fmt.Errorf("error executing docker %s: %v", args, err) - } - return string(out), nil -} - -// run calls 'docker run' with the arguments provided. -func (d *docker) run(args ...string) (string, error) { - a := []string{"run", "--runtime", d.runtime, "--name", d.name, "-d"} - a = append(a, args...) - return d.do(a...) -} - -// cleanUp kills and deletes the container. -func (d *docker) cleanUp() error { - if _, err := d.do("kill", d.name); err != nil { - return fmt.Errorf("error killing container %q: %v", d.name, err) - } - if _, err := d.do("rm", d.name); err != nil { - return fmt.Errorf("error deleting container %q: %v", d.name, err) - } - return nil -} - -// findPort returns the host port that is mapped to 'sandboxPort'. This calls -// docker to allocate a free port in the host and prevent conflicts. -func (d *docker) findPort(sandboxPort int) (int, error) { - format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort) - out, err := d.do("inspect", "-f", format, d.name) - if err != nil { - return -1, fmt.Errorf("error retrieving port: %v", err) - } - port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) - if err != nil { - return -1, fmt.Errorf("error parsing port %q: %v", out, err) - } - return port, nil -} - -// waitForOutput calls 'docker logs' to retrieve containers output and searches -// for the given pattern. 
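The docker helpers being moved out of this file and into runsc/test/testutil share one pattern: retry an operation until it succeeds or a deadline passes. A generic sketch of that polling loop (the URL and intervals below are only examples):

package main

import (
	"fmt"
	"net/http"
	"time"
)

// waitFor retries cond every interval until it returns true or timeout elapses.
func waitFor(timeout, interval time.Duration, cond func() bool) error {
	for deadline := time.Now().Add(timeout); time.Now().Before(deadline); {
		if cond() {
			return nil
		}
		time.Sleep(interval)
	}
	return fmt.Errorf("condition not met within %v", timeout)
}

func main() {
	err := waitFor(5*time.Second, 100*time.Millisecond, func() bool {
		resp, err := http.Get("http://localhost:8080/")
		if err != nil {
			return false
		}
		resp.Body.Close()
		return true
	})
	fmt.Println(err)
}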
-func (d *docker) waitForOutput(pattern string, timeout time.Duration) error { - re := regexp.MustCompile(pattern) - var out string - for exp := time.Now().Add(timeout); time.Now().Before(exp); { - var err error - out, err = d.do("logs", d.name) - if err != nil { - return err - } - if re.MatchString(out) { - // Success! - return nil - } - time.Sleep(100 * time.Millisecond) - } - return fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) -} - -func (d *docker) waitForHTTP(port int, timeout time.Duration) error { - for exp := time.Now().Add(timeout); time.Now().Before(exp); { - url := fmt.Sprintf("http://localhost:%d/", port) - if _, err := http.Get(url); err == nil { - // Success! - return nil - } - time.Sleep(100 * time.Millisecond) - } - return fmt.Errorf("timeout waiting for HTTP server on port %d", port) -} - func TestHelloWorld(t *testing.T) { - d := makeDocker("hello-test") - if out, err := d.run("hello-world"); err != nil { + d := testutil.MakeDocker("hello-test") + if out, err := d.Run("hello-world"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } - defer d.cleanUp() + defer d.CleanUp() - if err := d.waitForOutput("Hello from Docker!", 5*time.Second); err != nil { + if err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { t.Fatalf("docker didn't say hello: %v", err) } } @@ -218,27 +82,27 @@ func testHTTPServer(port int) error { } func TestHttpd(t *testing.T) { - d := makeDocker("http-test") + d := testutil.MakeDocker("http-test") - dir, err := prepareFiles("latin10k.txt") + dir, err := testutil.PrepareFiles("latin10k.txt") if err != nil { - t.Fatalf("prepareFiles() failed: %v", err) + t.Fatalf("PrepareFiles() failed: %v", err) } // Start the container. - if out, err := d.run("-p", "80", "-v", mountArg(dir, "/usr/local/apache2/htdocs:ro"), "httpd"); err != nil { + if out, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/local/apache2/htdocs:ro"), "httpd"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } - defer d.cleanUp() + defer d.CleanUp() // Find where port 80 is mapped to. - port, err := d.findPort(80) + port, err := d.FindPort(80) if err != nil { - t.Fatalf("docker.findPort(80) failed: %v", err) + t.Fatalf("docker.FindPort(80) failed: %v", err) } // Wait until it's up and running. - if err := d.waitForHTTP(port, 5*time.Second); err != nil { + if err := d.WaitForHTTP(port, 5*time.Second); err != nil { t.Fatalf("docker.WaitForHTTP() timeout: %v", err) } @@ -248,27 +112,27 @@ func TestHttpd(t *testing.T) { } func TestNginx(t *testing.T) { - d := makeDocker("net-test") + d := testutil.MakeDocker("net-test") - dir, err := prepareFiles("latin10k.txt") + dir, err := testutil.PrepareFiles("latin10k.txt") if err != nil { - t.Fatalf("prepareFiles() failed: %v", err) + t.Fatalf("PrepareFiles() failed: %v", err) } // Start the container. - if out, err := d.run("-p", "80", "-v", mountArg(dir, "/usr/share/nginx/html:ro"), "nginx"); err != nil { + if out, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/share/nginx/html:ro"), "nginx"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } - defer d.cleanUp() + defer d.CleanUp() // Find where port 80 is mapped to. - port, err := d.findPort(80) + port, err := d.FindPort(80) if err != nil { - t.Fatalf("docker.findPort(80) failed: %v", err) + t.Fatalf("docker.FindPort(80) failed: %v", err) } // Wait until it's up and running. 
- if err := d.waitForHTTP(port, 5*time.Second); err != nil { + if err := d.WaitForHTTP(port, 5*time.Second); err != nil { t.Fatalf("docker.WaitForHTTP() timeout: %v", err) } @@ -278,64 +142,48 @@ func TestNginx(t *testing.T) { } func TestMysql(t *testing.T) { - d := makeDocker("mysql-test") + d := testutil.MakeDocker("mysql-test") // Start the container. - if out, err := d.run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { + if out, err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } - defer d.cleanUp() + defer d.CleanUp() // Wait until it's up and running. - if err := d.waitForOutput("port: 3306 MySQL Community Server", 3*time.Minute); err != nil { + if err := d.WaitForOutput("port: 3306 MySQL Community Server", 3*time.Minute); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } - client := makeDocker("mysql-client-test") - dir, err := prepareFiles("mysql.sql") + client := testutil.MakeDocker("mysql-client-test") + dir, err := testutil.PrepareFiles("mysql.sql") if err != nil { - t.Fatalf("prepareFiles() failed: %v", err) + t.Fatalf("PrepareFiles() failed: %v", err) } // Tell mysql client to connect to the server and execute the file in verbose // mode to verify the output. args := []string{ - "--link", linkArg(&d, "mysql"), - "-v", mountArg(dir, "/sql"), + "--link", testutil.LinkArg(&d, "mysql"), + "-v", testutil.MountArg(dir, "/sql"), "mysql", "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql", } - if out, err := client.run(args...); err != nil { + if out, err := client.Run(args...); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } - defer client.cleanUp() + defer client.CleanUp() // Ensure file executed to the end and shutdown mysql. - if err := client.waitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil { + if err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } - if err := d.waitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil { + if err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } } func MainTest(m *testing.M) { - // Check correct docker is installed. - cmd := exec.Command("docker", "version") - out, err := cmd.CombinedOutput() - if err != nil { - log.Fatalf("Error running %q: %v", "docker version", err) - } - re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`) - matches := re.FindStringSubmatch(string(out)) - if len(matches) != 3 { - log.Fatalf("Invalid docker output: %s", out) - } - major, _ := strconv.Atoi(matches[1]) - minor, _ := strconv.Atoi(matches[2]) - if major < 17 || (major == 17 && minor < 9) { - log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor) - } - + testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) } diff --git a/runsc/test/image/install.sh b/runsc/test/image/install.sh deleted file mode 100755 index c110d96f9..000000000 --- a/runsc/test/image/install.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Fail on any error -set -e - -# Defaults -declare runtime=runsc-test -declare uninstall=0 - -function findExe() { - local exe=${1} - - local path=$(find bazel-bin/runsc -type f -executable -name "${exe}" | head -n1) - if [[ "${path}" == "" ]]; then - echo "Location of ${exe} not found in bazel-bin" >&2 - exit 1 - fi - echo "${path}" -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --runtime) - shift - [ "$#" -le 0 ] && echo "No runtime provided" && exit 1 - runtime=$1 - ;; - -u) - uninstall=1 - ;; - *) - echo "Unknown option: ${1}" - echo "" - echo "Usage: ${0} [--runtime ] [-u]" - echo " --runtime sets the runtime name, default: runsc-test" - echo " -u uninstall the runtime" - exit 1 - esac - shift -done - -# Find location of executables. -declare -r dockercfg=$(findExe dockercfg) -[[ "${dockercfg}" == "" ]] && exit 1 - -declare runsc=$(findExe runsc) -[[ "${runsc}" == "" ]] && exit 1 - -if [[ ${uninstall} == 0 ]]; then - rm -rf /tmp/${runtime} - mkdir -p /tmp/${runtime} - cp "${runsc}" /tmp/${runtime}/runsc - runsc=/tmp/${runtime}/runsc - - # Make tmp dir and runsc binary readable and executable to all users, since it - # will run in an empty user namespace. - chmod a+rx "${runsc}" $(dirname "${runsc}") - - # Make log dir executable and writable to all users for the same reason. - declare logdir=/tmp/"${runtime?}/logs" - mkdir -p "${logdir}" - sudo -n chmod a+wx "${logdir}" - - declare -r args="--debug-log-dir "${logdir}" --debug --strace --log-packets" - sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" ${args} - sudo -n "${dockercfg}" runtime-add "${runtime}"-kvm "${runsc}" --platform=kvm ${args} - sudo -n "${dockercfg}" runtime-add "${runtime}"-hostnet "${runsc}" --network=host ${args} - sudo -n "${dockercfg}" runtime-add "${runtime}"-overlay "${runsc}" --overlay ${args} - -else - sudo -n "${dockercfg}" runtime-rm "${runtime}" - sudo -n "${dockercfg}" runtime-rm "${runtime}"-kvm - sudo -n "${dockercfg}" runtime-rm "${runtime}"-hostnet - sudo -n "${dockercfg}" runtime-rm "${runtime}"-overlay -fi - -echo "Restarting docker service..." -sudo -n /etc/init.d/docker restart diff --git a/runsc/test/image/python_test.go b/runsc/test/image/python_test.go index e931bb444..f0dab3989 100644 --- a/runsc/test/image/python_test.go +++ b/runsc/test/image/python_test.go @@ -19,23 +19,25 @@ import ( "net/http" "testing" "time" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) func TestPythonHello(t *testing.T) { - d := makeDocker("python-hello-test") - if out, err := d.run("-p", "8080", "google/python-hello"); err != nil { + d := testutil.MakeDocker("python-hello-test") + if out, err := d.Run("-p", "8080", "google/python-hello"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } - defer d.cleanUp() + defer d.CleanUp() // Find where port 8080 is mapped to. - port, err := d.findPort(8080) + port, err := d.FindPort(8080) if err != nil { - t.Fatalf("docker.findPort(8080) failed: %v", err) + t.Fatalf("docker.FindPort(8080) failed: %v", err) } // Wait until it's up and running. 
- if err := d.waitForHTTP(port, 5*time.Second); err != nil { + if err := d.WaitForHTTP(port, 5*time.Second); err != nil { t.Fatalf("docker.WaitForHTTP() timeout: %v", err) } diff --git a/runsc/test/install.sh b/runsc/test/install.sh new file mode 100755 index 000000000..c110d96f9 --- /dev/null +++ b/runsc/test/install.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Fail on any error +set -e + +# Defaults +declare runtime=runsc-test +declare uninstall=0 + +function findExe() { + local exe=${1} + + local path=$(find bazel-bin/runsc -type f -executable -name "${exe}" | head -n1) + if [[ "${path}" == "" ]]; then + echo "Location of ${exe} not found in bazel-bin" >&2 + exit 1 + fi + echo "${path}" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --runtime) + shift + [ "$#" -le 0 ] && echo "No runtime provided" && exit 1 + runtime=$1 + ;; + -u) + uninstall=1 + ;; + *) + echo "Unknown option: ${1}" + echo "" + echo "Usage: ${0} [--runtime ] [-u]" + echo " --runtime sets the runtime name, default: runsc-test" + echo " -u uninstall the runtime" + exit 1 + esac + shift +done + +# Find location of executables. +declare -r dockercfg=$(findExe dockercfg) +[[ "${dockercfg}" == "" ]] && exit 1 + +declare runsc=$(findExe runsc) +[[ "${runsc}" == "" ]] && exit 1 + +if [[ ${uninstall} == 0 ]]; then + rm -rf /tmp/${runtime} + mkdir -p /tmp/${runtime} + cp "${runsc}" /tmp/${runtime}/runsc + runsc=/tmp/${runtime}/runsc + + # Make tmp dir and runsc binary readable and executable to all users, since it + # will run in an empty user namespace. + chmod a+rx "${runsc}" $(dirname "${runsc}") + + # Make log dir executable and writable to all users for the same reason. + declare logdir=/tmp/"${runtime?}/logs" + mkdir -p "${logdir}" + sudo -n chmod a+wx "${logdir}" + + declare -r args="--debug-log-dir "${logdir}" --debug --strace --log-packets" + sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-kvm "${runsc}" --platform=kvm ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-hostnet "${runsc}" --network=host ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-overlay "${runsc}" --overlay ${args} + +else + sudo -n "${dockercfg}" runtime-rm "${runtime}" + sudo -n "${dockercfg}" runtime-rm "${runtime}"-kvm + sudo -n "${dockercfg}" runtime-rm "${runtime}"-hostnet + sudo -n "${dockercfg}" runtime-rm "${runtime}"-overlay +fi + +echo "Restarting docker service..." 
+sudo -n /etc/init.d/docker restart diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD new file mode 100644 index 000000000..b366fe936 --- /dev/null +++ b/runsc/test/integration/BUILD @@ -0,0 +1,26 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_test( + name = "integration_test", + size = "large", + srcs = [ + "integration_test.go", + ], + embed = [":integration"], + tags = [ + # Requires docker and runsc to be configured before the test runs. + "manual", + "local", + ], + deps = [ + "//runsc/test/testutil", + ], +) + +go_library( + name = "integration", + srcs = ["integration.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/test/integration", +) diff --git a/runsc/test/integration/integration.go b/runsc/test/integration/integration.go new file mode 100644 index 000000000..49c3c893a --- /dev/null +++ b/runsc/test/integration/integration.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package integration is empty. See integration_test.go for description. +package integration diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go new file mode 100644 index 000000000..09d845bfc --- /dev/null +++ b/runsc/test/integration/integration_test.go @@ -0,0 +1,148 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package image provides end-to-end integration tests for runsc. These tests require +// docker and runsc to be installed on the machine. To set it up, run: +// +// ./runsc/test/install.sh [--runtime ] +// +// The tests expect the runtime name to be provided in the RUNSC_RUNTIME +// environment variable (default: runsc-test). +// +// Each test calls docker commands to start up a container, and tests that it is +// behaving properly, with various runsc commands. The container is killed and deleted +// at the end. + +package integration + +import ( + "fmt" + "net" + "net/http" + "os" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// This container is a docker image for the Flask microframework hello world application. +const container = "python-hello-test" + +// httpRequestSucceeds sends a request to a given url and checks that the status is OK. +func httpRequestSucceeds(client http.Client, url string) error { + // Ensure that content is being served. 
+ resp, err := client.Get(url) + if err != nil { + return fmt.Errorf("error reaching http server: %v", err) + } + if want := http.StatusOK; resp.StatusCode != want { + return fmt.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want) + } + return nil +} + +// TestLifeCycle tests a basic Create/Start/Stop docker container life cycle. +func TestLifeCycle(t *testing.T) { + d := testutil.MakeDocker(container) + + // Test docker create. + if out, err := d.Do("create", "--runtime", d.Runtime, "--name", d.Name, "-p", "8080", "google/python-hello"); err != nil { + t.Fatalf("docker create failed: %v\nout: %s", err, out) + } + + // Test docker start. + if out, err := d.Do("start", d.Name); err != nil { + d.CleanUp() + t.Fatalf("docker start failed: %v\nout: %s", err, out) + } + + // Test docker stop. + if out, err := d.Do("stop", d.Name); err != nil { + d.CleanUp() + t.Fatalf("docker stop failed: %v\nout: %s", err, out) + } + + // Test removing the container. + if out, err := d.Do("rm", d.Name); err != nil { + t.Fatalf("docker rm failed: %v\nout: %s", err, out) + } +} + +func TestPauseResume(t *testing.T) { + d := testutil.MakeDocker(container) + if out, err := d.Run("-p", "8080", "google/python-hello"); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) + } + defer d.CleanUp() + + // Find where port 8080 is mapped to. + port, err := d.FindPort(8080) + if err != nil { + t.Fatalf("docker.FindPort(8080) failed: %v", err) + } + + // Wait until it's up and running. + if err := d.WaitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + } + + timeout := time.Duration(2 * time.Second) + client := http.Client{ + Timeout: timeout, + } + + url := fmt.Sprintf("http://localhost:%d", port) + // Check that container is working. + if err := httpRequestSucceeds(client, url); err != nil { + t.Errorf("http request failed: %v", err) + } + + // Pause container. + if out, err := d.Do("pause", d.Name); err != nil { + t.Fatalf("docker pause failed: %v\nout: %s", err, out) + } + + // Check if container is paused. + switch _, err := client.Get(url); v := err.(type) { + case nil: + t.Errorf("http req expected to fail but it succeeded") + case net.Error: + if !v.Timeout() { + t.Errorf("http req got error %v, wanted timeout", v) + } + default: + t.Errorf("http req got unexpected error %v", v) + } + + // Resume container. + if out, err := d.Do("unpause", d.Name); err != nil { + t.Fatalf("docker unpause failed: %v\nout: %s", err, out) + } + + // Wait until it's up and running. + if err := d.WaitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + } + + // Check if container is working again. 
+ if err := httpRequestSucceeds(client, url); err != nil { + t.Errorf("http request failed: %v", err) + } +} + +func MainTest(m *testing.M) { + testutil.EnsureSupportedDockerVersion() + os.Exit(m.Run()) +} diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 2c2555d98..6aec54abe 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -4,7 +4,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "testutil", - srcs = ["testutil.go"], + srcs = [ + "docker.go", + "testutil.go", + ], importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", visibility = [ "//runsc:__subpackages__", diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go new file mode 100644 index 000000000..4eb049591 --- /dev/null +++ b/runsc/test/testutil/docker.go @@ -0,0 +1,185 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "fmt" + "io/ioutil" + "log" + "math/rand" + "net/http" + "os" + "os/exec" + "path" + "regexp" + "strconv" + "strings" + "time" +) + +func init() { + rand.Seed(time.Now().UnixNano()) +} + +func runtime() string { + r := os.Getenv("RUNSC_RUNTIME") + if r == "" { + return "runsc-test" + } + return r +} + +// EnsureSupportedDockerVersion checks if correct docker is installed. +func EnsureSupportedDockerVersion() { + cmd := exec.Command("docker", "version") + out, err := cmd.CombinedOutput() + if err != nil { + log.Fatalf("Error running %q: %v", "docker version", err) + } + re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`) + matches := re.FindStringSubmatch(string(out)) + if len(matches) != 3 { + log.Fatalf("Invalid docker output: %s", out) + } + major, _ := strconv.Atoi(matches[1]) + minor, _ := strconv.Atoi(matches[2]) + if major < 17 || (major == 17 && minor < 9) { + log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor) + } +} + +// MountArg formats the volume argument to mount in the container. +func MountArg(source, target string) string { + return fmt.Sprintf("%s:%s", source, target) +} + +// LinkArg formats the link argument. +func LinkArg(source *Docker, target string) string { + return fmt.Sprintf("%s:%s", source.Name, target) +} + +// PrepareFiles creates temp directory to copy files there. The sandbox doesn't +// have access to files in the test dir. 
+func PrepareFiles(names ...string) (string, error) { + dir, err := ioutil.TempDir("", "image-test") + if err != nil { + return "", fmt.Errorf("ioutil.TempDir failed: %v", err) + } + if err := os.Chmod(dir, 0777); err != nil { + return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err) + } + for _, name := range names { + src := getLocalPath(name) + dst := path.Join(dir, name) + if err := Copy(src, dst); err != nil { + return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err) + } + } + return dir, nil +} + +func getLocalPath(file string) string { + return path.Join(".", file) +} + +// Docker contains the name and the runtime of a docker container. +type Docker struct { + Runtime string + Name string +} + +// MakeDocker sets up the struct for a Docker container. +// Names of containers will be unique. +func MakeDocker(namePrefix string) Docker { + suffix := fmt.Sprintf("-%06d", rand.Int())[:7] + return Docker{Name: namePrefix + suffix, Runtime: runtime()} +} + +// Do executes docker command. +func (d *Docker) Do(args ...string) (string, error) { + fmt.Printf("Running: docker %s\n", args) + cmd := exec.Command("docker", args...) + out, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("error executing docker %s: %v", args, err) + } + return string(out), nil +} + +// Run calls 'docker run' with the arguments provided. +func (d *Docker) Run(args ...string) (string, error) { + a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"} + a = append(a, args...) + return d.Do(a...) +} + +// CleanUp kills and deletes the container. +func (d *Docker) CleanUp() error { + if _, err := d.Do("kill", d.Name); err != nil { + return fmt.Errorf("error killing container %q: %v", d.Name, err) + } + if _, err := d.Do("rm", d.Name); err != nil { + return fmt.Errorf("error deleting container %q: %v", d.Name, err) + } + return nil +} + +// FindPort returns the host port that is mapped to 'sandboxPort'. This calls +// docker to allocate a free port in the host and prevent conflicts. +func (d *Docker) FindPort(sandboxPort int) (int, error) { + format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort) + out, err := d.Do("inspect", "-f", format, d.Name) + if err != nil { + return -1, fmt.Errorf("error retrieving port: %v", err) + } + port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + return -1, fmt.Errorf("error parsing port %q: %v", out, err) + } + return port, nil +} + +// WaitForOutput calls 'docker logs' to retrieve containers output and searches +// for the given pattern. +func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) error { + re := regexp.MustCompile(pattern) + var out string + for exp := time.Now().Add(timeout); time.Now().Before(exp); { + var err error + out, err = d.Do("logs", d.Name) + if err != nil { + return err + } + if re.MatchString(out) { + // Success! + return nil + } + time.Sleep(100 * time.Millisecond) + } + return fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) +} + +// WaitForHTTP tries GET requests on a port until the call succeeds or a timeout. +func (d *Docker) WaitForHTTP(port int, timeout time.Duration) error { + for exp := time.Now().Add(timeout); time.Now().Before(exp); { + url := fmt.Sprintf("http://localhost:%d/", port) + if _, err := http.Get(url); err == nil { + // Success! 
+ return nil + } + time.Sleep(100 * time.Millisecond) + } + return fmt.Errorf("timeout waiting for HTTP server on port %d", port) +} -- cgit v1.2.3 From c15cb8d432034e121497dbdc74d2842d5201552f Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Wed, 11 Jul 2018 15:06:29 -0700 Subject: Automated rollback of changelist 203157739 PiperOrigin-RevId: 204196916 Change-Id: If632750fc6368acb835e22cfcee0ae55c8a04d16 --- pkg/tcpip/link/fdbased/endpoint.go | 51 +++++++++++++++----------------------- pkg/tcpip/stack/nic.go | 3 ++- pkg/tcpip/stack/route.go | 16 ++++-------- pkg/tcpip/stack/stack.go | 2 +- runsc/boot/network.go | 9 ++++--- 5 files changed, 33 insertions(+), 48 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index c6a5f6d5a..413f77dcc 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -55,14 +55,10 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - vv *buffer.VectorisedView - iovecs []syscall.Iovec - views []buffer.View - dispatcher stack.NetworkDispatcher - - // egressLocal indicates whether packets destined to itself should be - // forwarded to the FD endpoint (true) or be sent back to netstack (false). - egressLocal bool + vv *buffer.VectorisedView + iovecs []syscall.Iovec + views []buffer.View + attached bool } // Options specify the details about the fd-based endpoint to be created. @@ -73,7 +69,6 @@ type Options struct { ChecksumOffload bool ClosedFunc func(*tcpip.Error) Address tcpip.LinkAddress - EgressLocal bool } // New creates a new fd-based endpoint. @@ -95,15 +90,14 @@ func New(opts *Options) tcpip.LinkEndpointID { } e := &endpoint{ - fd: opts.FD, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: hdrSize, - views: make([]buffer.View, len(BufConfig)), - iovecs: make([]syscall.Iovec, len(BufConfig)), - egressLocal: opts.EgressLocal, + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + views: make([]buffer.View, len(BufConfig)), + iovecs: make([]syscall.Iovec, len(BufConfig)), } vv := buffer.NewVectorisedView(0, e.views) e.vv = &vv @@ -113,13 +107,13 @@ func New(opts *Options) tcpip.LinkEndpointID { // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { - e.dispatcher = dispatcher - go e.dispatchLoop() // S/R-FIXME + e.attached = true + go e.dispatchLoop(dispatcher) // S/R-FIXME } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *endpoint) IsAttached() bool { - return e.dispatcher != nil + return e.attached } // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized @@ -146,12 +140,6 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress { // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r *stack.Route, hdr *buffer.Prependable, payload buffer.View, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { - if !e.egressLocal && r.LocalAddress != "" && r.LocalAddress == r.RemoteAddress { - hdrView := hdr.View() - vv := buffer.NewVectorisedView(len(hdrView)+len(payload), []buffer.View{hdrView, payload}) - e.dispatcher.DeliverNetworkPacket(e, r.RemoteLinkAddress, protocol, &vv) - return nil - } if e.hdrSize > 0 { // Add ethernet header if needed. 
eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) @@ -164,6 +152,7 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr *buffer.Prependable, payload if len(payload) == 0 { return rawfile.NonBlockingWrite(e.fd, hdr.UsedBytes()) + } return rawfile.NonBlockingWrite2(e.fd, hdr.UsedBytes(), payload) @@ -196,7 +185,7 @@ func (e *endpoint) allocateViews(bufConfig []int) { } // dispatch reads one packet from the file descriptor and dispatches it. -func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { +func (e *endpoint) dispatch(d stack.NetworkDispatcher, largeV buffer.View) (bool, *tcpip.Error) { e.allocateViews(BufConfig) n, err := rawfile.BlockingReadv(e.fd, e.iovecs) @@ -232,7 +221,7 @@ func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { e.vv.SetSize(n) e.vv.TrimFront(e.hdrSize) - e.dispatcher.DeliverNetworkPacket(e, addr, p, e.vv) + d.DeliverNetworkPacket(e, addr, p, e.vv) // Prepare e.views for another packet: release used views. for i := 0; i < used; i++ { @@ -244,10 +233,10 @@ func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. -func (e *endpoint) dispatchLoop() *tcpip.Error { +func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) *tcpip.Error { v := buffer.NewView(header.MaxIPPacketSize) for { - cont, err := e.dispatch(v) + cont, err := e.dispatch(d, v) if err != nil || !cont { if e.closed != nil { e.closed(err) diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index c1480f97b..25c06cba5 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -327,7 +327,8 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remoteLinkAddr tcpip.Lin return } - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) + r := makeRoute(protocol, dst, src, ref) + r.LocalLinkAddress = linkEP.LinkAddress() r.RemoteLinkAddress = remoteLinkAddr ref.ep.HandlePacket(&r, vv) ref.decRef() diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 423f428df..200c39289 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -50,13 +50,12 @@ type Route struct { // makeRoute initializes a new route. It takes ownership of the provided // reference to a network endpoint. -func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint) Route { +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, ref *referencedNetworkEndpoint) Route { return Route{ - NetProto: netProto, - LocalAddress: localAddr, - LocalLinkAddress: localLinkAddr, - RemoteAddress: remoteAddr, - ref: ref, + NetProto: netProto, + LocalAddress: localAddr, + RemoteAddress: remoteAddr, + ref: ref, } } @@ -93,11 +92,6 @@ func (r *Route) Resolve(waker *sleep.Waker) *tcpip.Error { nextAddr := r.NextHop if nextAddr == "" { - // Local link address is already known. 
- if r.RemoteAddress == r.LocalAddress { - r.RemoteLinkAddress = r.LocalLinkAddress - return nil - } nextAddr = r.RemoteAddress } linkAddr, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker) diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 67a3cc95e..b9d0a1762 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -657,7 +657,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n remoteAddr = ref.ep.ID().LocalAddress } - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref) + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, ref) r.NextHop = s.routeTable[i].Gateway return r, nil } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index df45218b9..d2b52c823 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -134,10 +134,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } linkEP := fdbased.New(&fdbased.Options{ - FD: newFD, - MTU: uint32(link.MTU), - EthernetHeader: true, - Address: tcpip.LinkAddress(generateRndMac()), + FD: newFD, + MTU: uint32(link.MTU), + ChecksumOffload: false, + EthernetHeader: true, + Address: tcpip.LinkAddress(generateRndMac()), }) log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) -- cgit v1.2.3 From 67507bd579a305e5d993c7cca71b665f33f341ff Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 12 Jul 2018 13:36:01 -0700 Subject: runsc: Don't close the control server in a defer. Closing the control server will block until all open requests have completed. If a control server method panics, we end up stuck because the defer'd Destroy function will never return. PiperOrigin-RevId: 204354676 Change-Id: I6bb1d84b31242d7c3f20d5334b1c966bd6a61dbf --- runsc/boot/loader.go | 5 ++++- runsc/cmd/boot.go | 5 ++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a3cc0e4a4..7de0a84cc 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -347,9 +347,12 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds } // Destroy cleans up all resources used by the loader. +// +// Note that this will block until all open control server connections have +// been closed. For that reason, this should NOT be called in a defer, because +// a panic in a control server rpc would then hang forever. func (l *Loader) Destroy() { if l.ctrl != nil { - // Shut down control server. l.ctrl.srv.Stop() } l.stopSignalForwarding() diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index b19da315f..70c4616b4 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -142,24 +142,23 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(spec, conf, b.controllerFD, b.restoreFD, b.ioFDs.GetArray(), b.console) - if err != nil { Fatalf("error creating loader: %v", err) } - defer l.Destroy() // Wait for the start signal from runsc. l.WaitForStartSignal() // Run the application and wait for it to finish. 
if err := l.Run(); err != nil { + l.Destroy() Fatalf("error running sandbox: %v", err) } ws := l.WaitExit() log.Infof("application exiting with %+v", ws) *waitStatus = syscall.WaitStatus(ws.Status()) + l.Destroy() return subcommands.ExitSuccess } -- cgit v1.2.3 From 6dce46d4c0c1795e8a00319a0e87f5f63260080b Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 13 Jul 2018 12:33:08 -0700 Subject: Bump the timeout when waiting for python HTTP server. PiperOrigin-RevId: 204511630 Change-Id: Ib841a7144f3833321b0e69b8585b03c4ed55a265 --- runsc/test/image/python_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/image/python_test.go b/runsc/test/image/python_test.go index f0dab3989..e7324e83e 100644 --- a/runsc/test/image/python_test.go +++ b/runsc/test/image/python_test.go @@ -37,7 +37,7 @@ func TestPythonHello(t *testing.T) { } // Wait until it's up and running. - if err := d.WaitForHTTP(port, 5*time.Second); err != nil { + if err := d.WaitForHTTP(port, 10*time.Second); err != nil { t.Fatalf("docker.WaitForHTTP() timeout: %v", err) } -- cgit v1.2.3 From 9059983fdb2cf64a08152d2aab648b6660b9631b Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 13 Jul 2018 13:45:13 -0700 Subject: runsc: Fix map access race in boot.Loader.waitContainer. PiperOrigin-RevId: 204522004 Change-Id: I4819dc025f0a1df03ceaaba7951b1902d44562b3 --- runsc/boot/loader.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 7de0a84cc..706910d8a 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -499,10 +499,12 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // multiple clients to wait on the same container. l.mu.Lock() tgid, ok := l.containerRootTGIDs[cid] - l.mu.Unlock() if !ok { + defer l.mu.Unlock() return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGIDs) } + l.mu.Unlock() + // If the thread either has already exited or exits during waiting, // consider the container exited. defer func() { -- cgit v1.2.3 From e5d8f99c6071c09aa7bca4e79d28b26f95dc7716 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 18 Jul 2018 15:44:34 -0700 Subject: runsc: Fixes to CheckpointRestoreTest. We must delete the output file at the beginning of the test, otherwise the test fails immediately. Also some minor cleanups in readOutputFile. PiperOrigin-RevId: 205150525 Change-Id: I6bae1acd5b315320a2c6e25a59afcfc06267fb17 --- runsc/container/container_test.go | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 72b115628..fc441e353 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -121,12 +121,12 @@ func createWriteableOutputFile(path string) (*os.File, error) { return outputFile, nil } -func readOutputNum(outputFile *os.File, path string, first bool) (int, error) { +func readOutputNum(f *os.File, first bool) (int, error) { var num int time.Sleep(1 * time.Second) - // Check that outputFile exists and contains counting data. - fileInfo, err := os.Stat(path) + // Check that f exists and contains counting data. 
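The waitContainer fix above keeps the mutex held across the map lookup and the error message that prints the map, instead of unlocking before the check. A small sketch of a locked lookup in that style, with illustrative names:

package main

import (
	"fmt"
	"sync"
)

type loaderState struct {
	mu    sync.Mutex
	tgids map[string]int // container ID -> root thread group ID
}

// rootTGID reads the map, and formats any error that prints it, while the
// lock is still held.
func (l *loaderState) rootTGID(cid string) (int, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	tgid, ok := l.tgids[cid]
	if !ok {
		return 0, fmt.Errorf("can't find process for container %q in %v", cid, l.tgids)
	}
	return tgid, nil
}

func main() {
	l := &loaderState{tgids: map[string]int{"c1": 2}}
	fmt.Println(l.rootTGID("c1"))
	fmt.Println(l.rootTGID("missing"))
}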
+ fileInfo, err := f.Stat() if err != nil { return 0, fmt.Errorf("error creating output file: %v", err) } @@ -136,15 +136,15 @@ func readOutputNum(outputFile *os.File, path string, first bool) (int, error) { } // Read the first number in the new file - outputFileContent, err := ioutil.ReadAll(outputFile) + b, err := ioutil.ReadAll(f) if err != nil { return 0, fmt.Errorf("error reading file: %v", err) } - if len(outputFileContent) == 0 { + if len(b) == 0 { return 0, fmt.Errorf("error no content was read") } - nums := strings.Split(string(outputFileContent), "\n") + nums := strings.Split(string(b), "\n") if first { num, err = strconv.Atoi(nums[0]) @@ -487,6 +487,9 @@ func TestExec(t *testing.T) { // be the next consecutive number after the last number from the checkpointed container. func TestCheckpointRestore(t *testing.T) { outputPath := filepath.Join(os.TempDir(), "output") + // Make sure it does not already exist. + os.Remove(outputPath) + outputFile, err := createWriteableOutputFile(outputPath) if err != nil { t.Fatalf("error creating output file: %v", err) @@ -538,7 +541,7 @@ func TestCheckpointRestore(t *testing.T) { } defer os.RemoveAll(imagePath) - lastNum, err := readOutputNum(outputFile, outputPath, false) + lastNum, err := readOutputNum(outputFile, false) if err != nil { t.Fatalf("error with outputFile: %v", err) } @@ -563,7 +566,7 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("error starting container: %v", err) } - firstNum, err := readOutputNum(outputFile2, outputPath, true) + firstNum, err := readOutputNum(outputFile2, true) if err != nil { t.Fatalf("error with outputFile: %v", err) } @@ -594,7 +597,7 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("error starting container: %v", err) } - firstNum2, err := readOutputNum(outputFile3, outputPath, true) + firstNum2, err := readOutputNum(outputFile3, true) if err != nil { t.Fatalf("error with outputFile: %v", err) } -- cgit v1.2.3 From c05660373e8bda36ddf5181220c76f4327f2abc6 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Wed, 18 Jul 2018 16:57:29 -0700 Subject: Moved restore code out of create and made to be called after create. Docker expects containers to be created before they are restored. However, gVisor restoring requires specificactions regarding the kernel and the file system. These actions were originally in booting the sandbox. Now setting up the file system is deferred until a call to a call to runsc start. In the restore case, the kernel is destroyed and a new kernel is created in the same process, as we need the same process for Docker. These changes required careful execution of concurrent processes which required the use of a channel. Full docker integration still needs the ability to restore into the same container. 
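A minimal sketch of the kind of channel coordination this change leans on: one RPC blocks until the loader exists, and the code that creates the loader signals the channel to release the waiter. In this sketch the creator closes the channel; the types and method names stand in for the real controller plumbing around loaderCreatedChan:

package main

import (
	"fmt"
	"time"
)

// manager is a stand-in for the container manager; loaderCreated plays the
// role of loaderCreatedChan.
type manager struct {
	loaderCreated chan struct{}
}

// WaitForLoader blocks until the loader has been created.
func (m *manager) WaitForLoader() {
	<-m.loaderCreated
	fmt.Println("loader ready; start/restore can proceed")
}

// NotifyLoaderCreated releases every pending WaitForLoader call.
func (m *manager) NotifyLoaderCreated() {
	close(m.loaderCreated)
}

func main() {
	m := &manager{loaderCreated: make(chan struct{})}
	go func() {
		time.Sleep(50 * time.Millisecond) // simulate creating the loader
		m.NotifyLoaderCreated()
	}()
	m.WaitForLoader()
}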
PiperOrigin-RevId: 205161441 Change-Id: Ie1d2304ead7e06855319d5dc310678f701bd099f --- pkg/urpc/urpc.go | 2 +- runsc/boot/controller.go | 131 ++++++++++++++++++---- runsc/boot/events.go | 4 +- runsc/boot/fs.go | 45 ++++++++ runsc/boot/loader.go | 222 ++++++++++++++++---------------------- runsc/boot/loader_test.go | 3 +- runsc/cmd/boot.go | 9 +- runsc/cmd/checkpoint.go | 4 +- runsc/cmd/create.go | 2 +- runsc/cmd/restore.go | 11 +- runsc/container/container.go | 22 +++- runsc/container/container_test.go | 42 ++++---- runsc/sandbox/sandbox.go | 71 ++++++++---- runsc/sandbox/sandbox_test.go | 2 +- 14 files changed, 359 insertions(+), 211 deletions(-) (limited to 'runsc') diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 0f2b5ccce..af620b704 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -570,7 +570,7 @@ func (c *Client) Call(method string, arg interface{}, result interface{}) error callR := callResult{Result: result} newFs, err := unmarshal(c.Socket, &callR) if err != nil { - return err + return fmt.Errorf("urpc method %q failed: %v", method, err) } // Set the file payload. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ff75a382e..c6e934e66 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -23,9 +23,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/state" + "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/urpc" ) const ( @@ -47,9 +51,15 @@ const ( // processes running in a container. ContainerProcesses = "containerManager.Processes" + // ContainerRestore restores a container from a statefile. + ContainerRestore = "containerManager.Restore" + // ContainerResume unpauses the paused container. ContainerResume = "containerManager.Resume" + // ContainerWaitForLoader blocks until the container's loader has been created. + ContainerWaitForLoader = "containerManager.WaitForLoader" + // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" @@ -85,7 +95,7 @@ func ControlSocketAddr(id string) string { // controller holds the control server, and is used for communication into the // sandbox. type controller struct { - // srv is the contorl server. + // srv is the control server. srv *server.Server // manager holds the containerManager methods. @@ -100,10 +110,9 @@ func newController(fd int, k *kernel.Kernel, w *watchdog.Watchdog) (*controller, } manager := &containerManager{ - startChan: make(chan struct{}), - startResultChan: make(chan error), - k: k, - watchdog: w, + startChan: make(chan struct{}), + startResultChan: make(chan error), + loaderCreatedChan: make(chan struct{}), } srv.Register(manager) @@ -137,15 +146,13 @@ type containerManager struct { // channel. A nil value indicates success. startResultChan chan error - // k is the emulated linux kernel on which the sandboxed - // containers run. - k *kernel.Kernel - - // watchdog is the kernel watchdog. - watchdog *watchdog.Watchdog - // l is the loader that creates containers and sandboxes. l *Loader + + // loaderCreatedChan is used to signal when the loader has been created. 
+ // After a loader is created, a notify method is called that writes to + // this channel. + loaderCreatedChan chan struct{} } // StartRoot will start the root container process. @@ -160,7 +167,7 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { // Processes retrieves information about processes running in the sandbox. func (cm *containerManager) Processes(_, out *[]*control.Process) error { log.Debugf("containerManager.Processes") - return control.Processes(cm.k, out) + return control.Processes(cm.l.k, out) } // StartArgs contains arguments to the Start method. @@ -194,7 +201,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return errors.New("start argument missing container ID") } - tgid, err := cm.l.startContainer(args, cm.k) + tgid, err := cm.l.startContainer(args, cm.l.k) if err != nil { return err } @@ -206,7 +213,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // Execute runs a command on a created or running sandbox. func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { log.Debugf("containerManager.Execute") - proc := control.Proc{Kernel: cm.k} + proc := control.Proc{Kernel: cm.l.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) } @@ -217,21 +224,105 @@ func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) err func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { log.Debugf("containerManager.Checkpoint") state := control.State{ - Kernel: cm.k, - Watchdog: cm.watchdog, + Kernel: cm.l.k, + Watchdog: cm.l.watchdog, } return state.Save(o, nil) } // Pause suspends a container. func (cm *containerManager) Pause(_, _ *struct{}) error { - cm.k.Pause() + cm.l.k.Pause() return nil } +// WaitForLoader blocks until the container's loader has been created. +func (cm *containerManager) WaitForLoader(_, _ *struct{}) error { + log.Debugf("containerManager.WaitForLoader") + <-cm.loaderCreatedChan + return nil +} + +// RestoreOpts contains options related to restoring a container's file system. +type RestoreOpts struct { + // FilePayload contains the state file to be restored. + urpc.FilePayload + + // SandboxID contains the ID of the sandbox. + SandboxID string +} + +// Restore loads a container from a statefile. +// The container's current kernel is destroyed, a restore environment is created, +// and the kernel is recreated with the restore state file. The container then sends the +// signal to start. +func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { + log.Debugf("containerManager.Restore") + if len(o.FilePayload.Files) != 1 { + return fmt.Errorf("exactly one file must be provided") + } + defer o.FilePayload.Files[0].Close() + + // Destroy the old kernel and create a new kernel. + cm.l.k.Pause() + cm.l.k.Destroy() + + p, err := createPlatform(cm.l.conf) + if err != nil { + return fmt.Errorf("error creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + cm.l.k = k + + // Set up the restore environment. + fds := &fdDispenser{fds: cm.l.ioFDs} + renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) + if err != nil { + return fmt.Errorf("error creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) + + // Prepare to load from the state file. 
+ networkStack := newEmptyNetworkStack(cm.l.conf, k) + info, err := o.FilePayload.Files[0].Stat() + if err != nil { + return err + } + if info.Size() == 0 { + return fmt.Errorf("error file was empty") + } + + // Load the state. + loadOpts := state.LoadOpts{ + Source: o.FilePayload.Files[0], + } + if err := loadOpts.Load(k, p, networkStack); err != nil { + return err + } + + // Set timekeeper. + k.Timekeeper().SetClocks(time.NewCalibratedClocks()) + + // Since we have a new kernel we also must make a new watchdog. + watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) + + // Change the loader fields to reflect the changes made when restoring. + cm.l.k = k + cm.l.watchdog = watchdog + cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.setRootContainerID(o.SandboxID) + cm.l.restore = true + + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + return <-cm.startResultChan +} + // Resume unpauses a container. func (cm *containerManager) Resume(_, _ *struct{}) error { - cm.k.Unpause() + cm.l.k.Unpause() return nil } @@ -272,7 +363,7 @@ func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { // process in theat container. Currently we just signal PID 1 in the // sandbox. si := arch.SignalInfo{Signo: args.Signo} - t := cm.k.TaskSet().Root.TaskWithID(1) + t := cm.l.k.TaskSet().Root.TaskWithID(1) if t == nil { return fmt.Errorf("cannot signal: no task with id 1") } diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 0eb75c14c..832339cf4 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -62,8 +62,8 @@ type Memory struct { // Event gets the events from the container. func (cm *containerManager) Event(_ *struct{}, out *Event) error { stats := &Stats{} - stats.populateMemory(cm.k) - stats.populatePIDs(cm.k) + stats.populateMemory(cm.l.k) + stats.populatePIDs(cm.l.k) *out = Event{Type: "stats", Data: stats} return nil } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 51c8d620d..e596c739f 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -27,6 +27,9 @@ import ( _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -563,3 +566,45 @@ func subtargets(root string, mnts []specs.Mount) []string { } return targets } + +// setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. +// procArgs are passed by reference and the FDMap field is modified. +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel) error { + ctx := procArgs.NewContext(k) + + // Create the FD map, which will set stdin, stdout, and stderr. If + // console is true, then ioctl calls will be passed through to the host + // fd. + fdm, err := createFDMap(ctx, k, ls, console) + if err != nil { + return fmt.Errorf("error importing fds: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. We + // won't need ours either way. + procArgs.FDMap = fdm + + // If this is the root container, we also need to setup the root mount + // namespace. 
+ if k.RootMountNamespace() == nil { + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + + // Create the virtual filesystem. + mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) + if err != nil { + return fmt.Errorf("error creating mounts: %v", err) + } + + k.SetRootMountNamespace(mns) + } + + return nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 706910d8a..66394cdf8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,7 +19,6 @@ import ( "errors" "fmt" "math/rand" - "os" "runtime" "sync" "sync/atomic" @@ -29,7 +28,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -38,7 +36,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" - "gvisor.googlesource.com/gvisor/pkg/sentry/state" slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" @@ -77,6 +74,12 @@ type Loader struct { watchdog *watchdog.Watchdog + // ioFDs are the FDs that attach the sandbox to the gofers. + ioFDs []int + + // spec is the base configuration for the root container. + spec *specs.Spec + // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -111,16 +114,7 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. -func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []int, console bool) (*Loader, error) { - var ( - tk *kernel.Timekeeper - creds *auth.Credentials - vdso *loader.VDSO - utsns *kernel.UTSNamespace - ipcns *kernel.IPCNamespace - restoreFile *os.File - procArgs kernel.CreateProcessArgs - ) +func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) { // Create kernel and platform. p, err := createPlatform(conf) if err != nil { @@ -130,60 +124,47 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in Platform: p, } - if restoreFD == -1 { - // Create VDSO. - // - // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err = loader.PrepareVDSO(k) - if err != nil { - return nil, fmt.Errorf("error creating vdso: %v", err) - } + // Create VDSO. + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) + if err != nil { + return nil, fmt.Errorf("error creating vdso: %v", err) + } - // Create timekeeper. - tk, err = kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) - if err != nil { - return nil, fmt.Errorf("error creating timekeeper: %v", err) - } - tk.SetClocks(time.NewCalibratedClocks()) + // Create timekeeper. 
+ tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("error creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) - // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) - if err != nil { - return nil, fmt.Errorf("error creating capabilities: %v", err) - } + // Create capabilities. + caps, err := specutils.Capabilities(spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } - // Convert the spec's additional GIDs to KGIDs. - extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) - for _, GID := range spec.Process.User.AdditionalGids { - extraKGIDs = append(extraKGIDs, auth.KGID(GID)) - } + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } - // Create credentials. - creds = auth.NewUserCredentials( - auth.KUID(spec.Process.User.UID), - auth.KGID(spec.Process.User.GID), - extraKGIDs, - caps, - auth.NewRootUserNamespace()) + // Create credentials. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) - // Create user namespace. - // TODO: Not clear what domain name should be here. It is - // not configurable from runtime spec. - utsns = kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) + // Create user namespace. + // TODO: Not clear what domain name should be here. It is + // not configurable from runtime spec. + utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - ipcns = kernel.NewIPCNamespace(creds.UserNamespace) - } else { - // Create and set RestoreEnvironment - fds := &fdDispenser{fds: ioFDs} - renv, err := createRestoreEnvironment(spec, conf, fds) - if err != nil { - return nil, fmt.Errorf("error creating RestoreEnvironment: %v", err) - } - fs.SetRestoreEnvironment(*renv) - - restoreFile = os.NewFile(uintptr(restoreFD), "restore_file") - defer restoreFile.Close() - } + ipcns := kernel.NewIPCNamespace(creds.UserNamespace) if err := enableStrace(conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) @@ -195,33 +176,20 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Run(). networkStack := newEmptyNetworkStack(conf, k) - if restoreFile == nil { - // Initiate the Kernel object, which is required by the Context passed - // to createVFS in order to mount (among other things) procfs. - if err = k.Init(kernel.InitKernelArgs{ - FeatureSet: cpuid.HostFeatureSet(), - Timekeeper: tk, - RootUserNamespace: creds.UserNamespace, - NetworkStack: networkStack, - // TODO: use number of logical processors from cgroups. - ApplicationCores: uint(runtime.NumCPU()), - Vdso: vdso, - RootUTSNamespace: utsns, - RootIPCNamespace: ipcns, - }); err != nil { - return nil, fmt.Errorf("error initializing kernel: %v", err) - } - } else { - // Load the state. - loadOpts := state.LoadOpts{ - Source: restoreFile, - } - if err := loadOpts.Load(k, p, networkStack); err != nil { - return nil, err - } - - // Set timekeeper. - k.Timekeeper().SetClocks(time.NewCalibratedClocks()) + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. 
+ if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + NetworkStack: networkStack, + // TODO: use number of logical processors from cgroups. + ApplicationCores: uint(runtime.NumCPU()), + Vdso: vdso, + RootUTSNamespace: utsns, + RootIPCNamespace: ipcns, + }); err != nil { + return nil, fmt.Errorf("error initializing kernel: %v", err) } // Turn on packet logging if enabled. @@ -258,11 +226,9 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Ensure that signals received are forwarded to the emulated kernel. stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - if restoreFile == nil { - procArgs, err = newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) - if err != nil { - return nil, fmt.Errorf("failed to create root process: %v", err) - } + procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + if err != nil { + return nil, fmt.Errorf("failed to create root process: %v", err) } l := &Loader{ @@ -271,9 +237,10 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in conf: conf, console: console, watchdog: watchdog, + ioFDs: ioFDs, + spec: spec, stopSignalForwarding: stopSignalForwarding, rootProcArgs: procArgs, - restore: restoreFile != nil, } ctrl.manager.l = l return l, nil @@ -307,41 +274,6 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds UTSNamespace: utsns, IPCNamespace: ipcns, } - ctx := procArgs.NewContext(k) - - // Create the FD map, which will set stdin, stdout, and stderr. If - // console is true, then ioctl calls will be passed through to the host - // fd. - fdm, err := createFDMap(ctx, k, ls, console) - if err != nil { - return kernel.CreateProcessArgs{}, fmt.Errorf("error importing fds: %v", err) - } - - // CreateProcess takes a reference on FDMap if successful. We - // won't need ours either way. - procArgs.FDMap = fdm - - // If this is the root container, we also need to setup the root mount - // namespace. - if k.RootMountNamespace() == nil { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - - // Create the virtual filesystem. - mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) - if err != nil { - return kernel.CreateProcessArgs{}, fmt.Errorf("error creating mounts: %v", err) - } - - k.SetRootMountNamespace(mns) - } return procArgs, nil } @@ -411,7 +343,20 @@ func (l *Loader) run() error { } // If we are restoring, we do not want to create a process. + // l.restore is set by the container manager when a restore call is made. if !l.restore { + err := setFileSystemForProcess( + &l.rootProcArgs, + l.spec, + l.conf, + l.ioFDs, + l.console, + l.rootProcArgs.Credentials, + l.rootProcArgs.Limits, + l.k) + if err != nil { + return err + } // Create the root container init task. 
if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("failed to create init process: %v", err) @@ -421,6 +366,7 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } + log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() } @@ -468,6 +414,18 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } + err = setFileSystemForProcess( + &procArgs, + args.Spec, + args.Conf, + nil, + false, + creds, + procArgs.Limits, + k) + if err != nil { + return 0, fmt.Errorf("failed to create new process: %v", err) + } tg, err := l.k.CreateProcess(procArgs) if err != nil { @@ -553,6 +511,12 @@ func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan } +// NotifyLoaderCreated sends a signal to the container manager that this +// loader has been created. +func (l *Loader) NotifyLoaderCreated() { + l.ctrl.manager.loaderCreatedChan <- struct{}{} +} + // WaitExit waits for the root container to exit, and returns its exit status. func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 30ec236e4..7ea2e1ee5 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -61,7 +61,8 @@ func createLoader() (*Loader, error) { FileAccess: FileAccessDirect, DisableSeccomp: true, } - return New(testSpec(), conf, fd, -1, nil, false) + spec := testSpec() + return New(spec, conf, fd, nil, false) } // TestRun runs a simple application in a sandbox and checks that it succeeds. diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 70c4616b4..4e08dafc8 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -49,9 +49,6 @@ type Boot struct { // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool - - // restoreFD is the file descriptor to the state file to be restored. - restoreFD int } // Name implements subcommands.Command.Name. @@ -76,7 +73,6 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") - f.IntVar(&b.restoreFD, "restore-fd", -1, "FD of the state file to be restored") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -142,11 +138,14 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(spec, conf, b.controllerFD, b.restoreFD, b.ioFDs.GetArray(), b.console) + l, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) if err != nil { Fatalf("error creating loader: %v", err) } + // Notify other processes the loader has been created. + l.NotifyLoaderCreated() + // Wait for the start signal from runsc. 
l.WaitForStartSignal() diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 94efc3517..05014ba3d 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -133,12 +133,12 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("error destroying container: %v", err) } - cont, err = container.Create(id, spec, conf, bundleDir, "", "", fullImagePath) + cont, err = container.Create(id, spec, conf, bundleDir, "", "") if err != nil { Fatalf("error restoring container: %v", err) } - if err := cont.Start(conf); err != nil { + if err := cont.Restore(spec, conf, fullImagePath); err != nil { Fatalf("error starting container: %v", err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 5a887c73c..94a889077 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -87,7 +87,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} // Create the container. A new sandbox will be created for the // container unless the metadata specifies that it should be run in an // existing container. - if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, ""); err != nil { + if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile); err != nil { Fatalf("error creating container: %v", err) } return subcommands.ExitSuccess diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 69cdb35c1..6dc044672 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -94,16 +94,15 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ restoreFile := filepath.Join(r.imagePath, checkpointFileName) - cont, err := container.Create(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, restoreFile) + c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error restoring container: %v", err) + Fatalf("error loading container: %v", err) } - - if err := cont.Start(conf); err != nil { - Fatalf("error starting container: %v", err) + if err := c.Restore(spec, conf, restoreFile); err != nil { + Fatalf("error restoring container: %v", err) } - ws, err := cont.Wait() + ws, err := c.Wait() if err != nil { Fatalf("error running container: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index c4e5bf9f6..574075b00 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -190,7 +190,7 @@ func List(rootDir string) ([]string, error) { // Create creates the container in a new Sandbox process, unless the metadata // indicates that an existing Sandbox should be used. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, restoreFile string) (*Container, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (*Container, error) { log.Debugf("Create container %q in root dir: %s", id, conf.RootDir) if err := validateID(id); err != nil { return nil, err @@ -221,7 +221,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo log.Debugf("Creating new sandbox for container %q", id) // Start a new sandbox for this container. Any errors after this point // must destroy the container. 
- s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, restoreFile) + s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) if err != nil { c.Destroy() return nil, err @@ -309,10 +309,26 @@ func (c *Container) Start(conf *boot.Config) error { return c.save() } +// Restore takes a container and replaces its kernel and file system +// to restore a container from its state file. +func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error { + log.Debugf("Restore container %q", c.ID) + + if c.Status != Created { + return fmt.Errorf("cannot restore container in state %s", c.Status) + } + + if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil { + return err + } + c.Status = Running + return c.save() +} + // Run is a helper that calls Create + Start + Wait. func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (syscall.WaitStatus, error) { log.Debugf("Run container %q in root dir: %s", id, conf.RootDir) - c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, "") + c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile) if err != nil { return 0, fmt.Errorf("error creating container: %v", err) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index fc441e353..62a681ac2 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -168,7 +168,7 @@ func run(spec *specs.Spec) error { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { return fmt.Errorf("error creating container: %v", err) } @@ -213,7 +213,7 @@ func TestLifecycle(t *testing.T) { } // Create the container. id := testutil.UniqueContainerID() - if _, err := container.Create(id, spec, conf, bundleDir, "", "", ""); err != nil { + if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { t.Fatalf("error creating container: %v", err) } @@ -411,7 +411,7 @@ func TestExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -514,7 +514,7 @@ func TestCheckpointRestore(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -557,13 +557,14 @@ func TestCheckpointRestore(t *testing.T) { defer outputFile2.Close() // Restore into a new container. 
- cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", imagePath) + cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } defer cont2.Destroy() - if err := cont2.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) + + if err := cont2.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) } firstNum, err := readOutputNum(outputFile2, true) @@ -588,13 +589,14 @@ func TestCheckpointRestore(t *testing.T) { defer outputFile3.Close() // Restore into a new container. - cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", imagePath) + cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } defer cont3.Destroy() - if err := cont3.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) + + if err := cont3.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) } firstNum2, err := readOutputNum(outputFile3, true) @@ -604,7 +606,7 @@ func TestCheckpointRestore(t *testing.T) { // Check that lastNum is one less than firstNum and that the container picks up from where it left off. if lastNum+1 != firstNum2 { - t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) } } @@ -626,7 +628,7 @@ func TestPauseResume(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -728,7 +730,7 @@ func TestPauseResumeStatus(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -795,7 +797,7 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -894,7 +896,7 @@ func TestConsoleSocket(t *testing.T) { // Create the container and pass the socket name. id := testutil.UniqueContainerID() - s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "", "") + s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1014,7 +1016,7 @@ func TestReadonlyRoot(t *testing.T) { conf.Overlay = true // Create, start and wait for the container. 
- s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1055,7 +1057,7 @@ func TestReadonlyMount(t *testing.T) { conf.Overlay = true // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1095,7 +1097,7 @@ func TestAbbreviatedIDs(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(cid, spec, conf, bundleDir, "", "", "") + cont, err := container.Create(cid, spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1165,7 +1167,7 @@ func TestMultiContainerSanity(t *testing.T) { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(bundleDir) - cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "", "") + cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1241,7 +1243,7 @@ func TestMultiContainerWait(t *testing.T) { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(bundleDir) - cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "", "") + cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 9200fbee9..1f2cd6018 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -56,7 +56,7 @@ type Sandbox struct { // Create creates the sandbox process. // // If restoreFile is not empty, the sandbox will be restored from file. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, restoreFile string) (*Sandbox, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, error) { s := &Sandbox{ID: id} binPath, err := specutils.BinPath() @@ -71,7 +71,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } // Create the sandbox process. - if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles, restoreFile); err != nil { + if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles); err != nil { return nil, err } @@ -127,6 +127,42 @@ func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { return nil } +// Restore sends the restore call for a container in the sandbox. +func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f string) error { + log.Debugf("Restore sandbox %q", s.ID) + + rf, err := os.Open(f) + if err != nil { + return fmt.Errorf("os.Open(%q) failed: %v", f, err) + } + defer rf.Close() + + opt := boot.RestoreOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{rf}, + }, + SandboxID: s.ID, + } + + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() + + // Configure the network. + if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { + return fmt.Errorf("error setting up network: %v", err) + } + + // Restore the container and start the root container. 
+ if err := conn.Call(boot.ContainerRestore, &opt, nil); err != nil { + return fmt.Errorf("error restoring container %q: %v", cid, err) + } + + return nil +} + // Processes retrieves the list of processes and associated metadata for a // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { @@ -254,7 +290,7 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File, restoreFile string) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -276,27 +312,12 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund "--bundle", bundleDir, "--controller-fd="+strconv.Itoa(nextFD), fmt.Sprintf("--console=%t", consoleEnabled)) + nextFD++ controllerFile := os.NewFile(uintptr(fd), "control_server_socket") defer controllerFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) - // If a restore filename was given, open the file and append its FD to Args - // and the file to ExtraFiles. - if restoreFile != "" { - // Create the image file and open for reading. - rF, err := os.Open(restoreFile) - if err != nil { - return fmt.Errorf("os.Open(%q) failed: %v", restoreFile, err) - } - defer rF.Close() - - nextFD++ - cmd.Args = append(cmd.Args, "--restore-fd="+strconv.Itoa(nextFD)) - cmd.ExtraFiles = append(cmd.ExtraFiles, rF) - } - nextFD++ - // If there is a gofer, sends all socket ends to the sandbox. for _, f := range ioFiles { defer f.Close() @@ -402,7 +423,8 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } // waitForCreated waits for the sandbox subprocess control server to be -// running, at which point the sandbox is in Created state. +// running and for the loader to have been created, at which point the sandbox +// is in Created state. func (s *Sandbox) waitForCreated(timeout time.Duration) error { log.Debugf("Waiting for sandbox %q creation", s.ID) @@ -418,6 +440,15 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { if err := specutils.WaitForReady(s.Pid, timeout, ready); err != nil { return fmt.Errorf("unexpected error waiting for sandbox %q, err: %v", s.ID, err) } + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ContainerWaitForLoader, nil, nil); err != nil { + return fmt.Errorf("err waiting on loader on sandbox %q, err: %v", s.ID, err) + } return nil } diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index 9db90ef07..fee2de283 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -39,7 +39,7 @@ func TestGoferExits(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. 
- s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "") if err != nil { t.Fatalf("error creating container: %v", err) } -- cgit v1.2.3 From f62d6dd4537b155f462d69aefb6414785791fcba Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 19 Jul 2018 18:09:04 -0700 Subject: runsc: copy gateway from the pod network interface. PiperOrigin-RevId: 205334841 Change-Id: Ia60d486f9aae70182fdc4af50cf7c915986126d7 --- runsc/sandbox/network.go | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 62dcdd9e9..d6685fd66 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -350,6 +350,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { routes = append(routes, boot.Route{ Destination: r.Dst.IP.Mask(r.Dst.Mask), Mask: r.Dst.Mask, + Gateway: r.Gw, }) } return routes, def, nil -- cgit v1.2.3 From f543ada15005e6e2d31a63148a74fbdc43d070de Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 20 Jul 2018 16:17:00 -0700 Subject: Removed a now incorrect reference to restoreFile. PiperOrigin-RevId: 205470108 Change-Id: I226878a887fe1133561005357a9e3b09428b06b6 --- runsc/sandbox/sandbox.go | 2 -- 1 file changed, 2 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 1f2cd6018..196949f11 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -54,8 +54,6 @@ type Sandbox struct { } // Create creates the sandbox process. -// -// If restoreFile is not empty, the sandbox will be restored from file. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, error) { s := &Sandbox{ID: id} -- cgit v1.2.3 From d7a34790a0cc3cfdef9d9e54f17c4bc0a6819900 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 23 Jul 2018 13:30:29 -0700 Subject: Add KVM and overlay dimensions to container_test PiperOrigin-RevId: 205714667 Change-Id: I317a2ca98ac3bdad97c4790fcc61b004757d99ef --- pkg/sentry/platform/kvm/kvm.go | 2 +- runsc/container/BUILD | 6 +- runsc/container/container_test.go | 1475 ++++++++++++++++++---------------- runsc/sandbox/sandbox_test.go | 3 +- runsc/test/testutil/BUILD | 1 + runsc/test/testutil/testutil.go | 42 +- runsc/test/testutil/testutil_race.go | 21 + 7 files changed, 831 insertions(+), 719 deletions(-) create mode 100644 runsc/test/testutil/testutil_race.go (limited to 'runsc') diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 3ed057881..2dc3239a5 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -66,7 +66,7 @@ func New() (*KVM, error) { ring0.Init(cpuid.HostFeatureSet()) }) if globalErr != nil { - return nil, err + return nil, globalErr } // Create a new VM fd. 
diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 679d7e097..7ec68f573 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -26,17 +26,21 @@ go_library( go_test( name = "container_test", - size = "small", + size = "medium", srcs = ["container_test.go"], data = [ "//runsc", ], + tags = [ + "requires-kvm", + ], deps = [ "//pkg/abi/linux", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", "//pkg/unet", + "//runsc/boot", "//runsc/container", "//runsc/specutils", "//runsc/test/testutil", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 62a681ac2..34febe038 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -36,6 +36,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" @@ -159,8 +160,8 @@ func readOutputNum(f *os.File, first bool) (int, error) { // run starts the sandbox and waits for it to exit, checking that the // application succeeded. -func run(spec *specs.Spec) error { - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) +func run(spec *specs.Spec, conf *boot.Config) error { + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { return fmt.Errorf("error setting up container: %v", err) } @@ -186,173 +187,207 @@ func run(spec *specs.Spec) error { return nil } -// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. -// It verifies after each step that the container can be loaded from disk, and -// has the correct status. -func TestLifecycle(t *testing.T) { - // The container will just sleep for a long time. We will kill it before - // it finishes sleeping. - spec := testutil.NewSpecWithArgs("sleep", "100") +type configOptions int - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) +const ( + overlay configOptions = 1 << iota + kvm +) +const all = overlay | kvm - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } - // Create the container. - id := testutil.UniqueContainerID() - if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { - t.Fatalf("error creating container: %v", err) - } +// configs generates different configurations to run tests. +func configs(opts configOptions) []*boot.Config { + cs := []*boot.Config{testutil.TestConfig()} - // Load the container from disk and check the status. - s, err := container.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := s.Status, container.Created; got != want { - t.Errorf("container status got %v, want %v", got, want) + if opts&overlay != 0 { + c := testutil.TestConfig() + c.Overlay = true + cs = append(cs, c) } - // List should return the container id. - ids, err := container.List(rootDir) - if err != nil { - t.Fatalf("error listing containers: %v", err) - } - if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { - t.Errorf("container list got %v, want %v", got, want) + // TODO: KVM doesn't work with --race. 
+ if !testutil.RaceEnabled && opts&kvm != 0 { + c := testutil.TestConfig() + c.Platform = boot.PlatformKVM + cs = append(cs, c) } - // Start the container. - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - // Load the container from disk and check the status. - s, err = container.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := s.Status, container.Running; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + return cs +} - // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL); err != nil { - t.Error(err) - } +// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. +// It verifies after each step that the container can be loaded from disk, and +// has the correct status. +func TestLifecycle(t *testing.T) { + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) + // The container will just sleep for a long time. We will kill it before + // it finishes sleeping. + spec := testutil.NewSpecWithArgs("sleep", "100") - // Wait on the container. - var wg sync.WaitGroup - wg.Add(1) - ch := make(chan struct{}) - go func() { - ch <- struct{}{} - ws, err := s.Wait() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { - t.Fatalf("error waiting on container: %v", err) + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, } - if got, want := ws.Signal(), syscall.SIGTERM; got != want { - t.Fatalf("got signal %v, want %v", got, want) + // Create the container. + id := testutil.UniqueContainerID() + if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { + t.Fatalf("error creating container: %v", err) } - wg.Done() - }() - // Wait a bit to ensure that we've started waiting on the container - // before we signal. - <-ch - time.Sleep(100 * time.Millisecond) - // Send the container a SIGTERM which will cause it to stop. - if err := s.Signal(syscall.SIGTERM); err != nil { - t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) - } - // Wait for it to die. - wg.Wait() + // Load the container from disk and check the status. + s, err := container.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Created; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - // The sandbox process should have exited by now, but it is a zombie. - // In normal runsc usage, it will be parented to init, and init will - // reap the sandbox. However, in this case the test runner is the - // parent and will not reap the sandbox process, so we must do it - // ourselves. - p, _ := os.FindProcess(s.Sandbox.Pid) - p.Wait() - g, _ := os.FindProcess(s.Sandbox.GoferPid) - g.Wait() - - // Load the container from disk and check the status. - s, err = container.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := s.Status, container.Stopped; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // List should return the container id. 
+ ids, err := container.List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { + t.Errorf("container list got %v, want %v", got, want) + } - // Destroy the container. - if err := s.Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } + // Start the container. + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + // Load the container from disk and check the status. + s, err = container.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - // List should not return the container id. - ids, err = container.List(rootDir) - if err != nil { - t.Fatalf("error listing containers: %v", err) - } - if len(ids) != 0 { - t.Errorf("expected container list to be empty, but got %v", ids) - } + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } - // Loading the container by id should fail. - if _, err = container.Load(rootDir, id); err == nil { - t.Errorf("expected loading destroyed container to fail, but it did not") - } -} + // Wait on the container. + var wg sync.WaitGroup + wg.Add(1) + ch := make(chan struct{}) + go func() { + ch <- struct{}{} + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if got, want := ws.Signal(), syscall.SIGTERM; got != want { + t.Fatalf("got signal %v, want %v", got, want) + } + wg.Done() + }() -// Test the we can execute the application with different path formats. -func TestExePath(t *testing.T) { - for _, test := range []struct { - path string - success bool - }{ - {path: "true", success: true}, - {path: "bin/true", success: true}, - {path: "/bin/true", success: true}, - {path: "thisfiledoesntexit", success: false}, - {path: "bin/thisfiledoesntexit", success: false}, - {path: "/bin/thisfiledoesntexit", success: false}, - } { - spec := testutil.NewSpecWithArgs(test.path) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + // Wait a bit to ensure that we've started waiting on the container + // before we signal. + <-ch + time.Sleep(100 * time.Millisecond) + // Send the container a SIGTERM which will cause it to stop. + if err := s.Signal(syscall.SIGTERM); err != nil { + t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) + } + // Wait for it to die. + wg.Wait() + + // The sandbox process should have exited by now, but it is a zombie. + // In normal runsc usage, it will be parented to init, and init will + // reap the sandbox. However, in this case the test runner is the + // parent and will not reap the sandbox process, so we must do it + // ourselves. + p, _ := os.FindProcess(s.Sandbox.Pid) + p.Wait() + g, _ := os.FindProcess(s.Sandbox.GoferPid) + g.Wait() + + // Load the container from disk and check the status. + s, err = container.Load(rootDir, id) if err != nil { - t.Fatalf("exec: %s, error setting up container: %v", test.path, err) + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Stopped; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Destroy the container. 
+ if err := s.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) } - ws, err := container.Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + // List should not return the container id. + ids, err = container.List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected container list to be empty, but got %v", ids) + } - os.RemoveAll(rootDir) - os.RemoveAll(bundleDir) + // Loading the container by id should fail. + if _, err = container.Load(rootDir, id); err == nil { + t.Errorf("expected loading destroyed container to fail, but it did not") + } + } +} - if test.success { +// Test the we can execute the application with different path formats. +func TestExePath(t *testing.T) { + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + } { + spec := testutil.NewSpecWithArgs(test.path) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { - t.Errorf("exec: %s, error running container: %v", test.path, err) + t.Fatalf("exec: %s, error setting up container: %v", test.path, err) } - if ws.ExitStatus() != 0 { - t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) - } - } else { - if err == nil { - t.Errorf("exec: %s, got: no error, want: error", test.path) + + ws, err := container.Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + + os.RemoveAll(rootDir) + os.RemoveAll(bundleDir) + + if test.success { + if err != nil { + t.Errorf("exec: %s, error running container: %v", test.path, err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: %s, got: no error, want: error", test.path) + } } } } @@ -362,8 +397,8 @@ func TestExePath(t *testing.T) { func TestAppExitStatus(t *testing.T) { // First container will succeed. succSpec := testutil.NewSpecWithArgs("true") - - rootDir, bundleDir, conf, err := testutil.SetupContainer(succSpec) + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -382,7 +417,7 @@ func TestAppExitStatus(t *testing.T) { wantStatus := 123 errSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) - rootDir2, bundleDir2, conf, err := testutil.SetupContainer(errSpec) + rootDir2, bundleDir2, err := testutil.SetupContainer(errSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -400,82 +435,86 @@ func TestAppExitStatus(t *testing.T) { // TestExec verifies that a container can exec a new program. func TestExec(t *testing.T) { - const uid = 343 - spec := testutil.NewSpecWithArgs("sleep", "100") + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "100") - // Create and start the container. 
- s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + // Create and start the container. + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL[:1]); err != nil { - t.Error(err) - } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - execArgs := control.ExecArgs{ - Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - } + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Error(err) + } - // Verify that "sleep 100" and "sleep 5" are running after exec. - // First, start running exec (whick blocks). - status := make(chan error, 1) - go func() { - exitStatus, err := s.Execute(&execArgs) - if err != nil { - status <- err - } else if exitStatus != 0 { - status <- fmt.Errorf("failed with exit status: %v", exitStatus) - } else { - status <- nil + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, } - }() - if err := waitForProcessList(s, expectedPL); err != nil { - t.Fatal(err) - } + // Verify that "sleep 100" and "sleep 5" are running after exec. + // First, start running exec (whick blocks). + status := make(chan error, 1) + go func() { + exitStatus, err := s.Execute(&execArgs) + if err != nil { + status <- err + } else if exitStatus != 0 { + status <- fmt.Errorf("failed with exit status: %v", exitStatus) + } else { + status <- nil + } + }() - // Ensure that exec finished without error. - select { - case <-time.After(10 * time.Second): - t.Fatalf("container timed out waiting for exec to finish.") - case st := <-status: - if st != nil { - t.Errorf("container failed to exec %v: %v", execArgs, err) + if err := waitForProcessList(s, expectedPL); err != nil { + t.Fatal(err) + } + + // Ensure that exec finished without error. + select { + case <-time.After(10 * time.Second): + t.Fatalf("container timed out waiting for exec to finish.") + case st := <-status: + if st != nil { + t.Errorf("container failed to exec %v: %v", execArgs, err) + } } } } @@ -486,129 +525,136 @@ func TestExec(t *testing.T) { // new containers and the first number printed from these containers is checked. Both should // be the next consecutive number after the last number from the checkpointed container. 
func TestCheckpointRestore(t *testing.T) { - outputPath := filepath.Join(os.TempDir(), "output") - // Make sure it does not already exist. - os.Remove(outputPath) + // Skip overlay because test requires writing to host file. + for _, conf := range configs(kvm) { + t.Logf("Running test with conf: %+v", conf) - outputFile, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile.Close() + dir, err := ioutil.TempDir("", "checkpoint-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + if err := os.Chmod(dir, 0777); err != nil { + t.Fatalf("error chmoding file: %q, %v", dir, err) + } - outputFileSandbox := strings.Replace(outputPath, os.TempDir(), "/tmp2", -1) + outputPath := filepath.Join(dir, "output") + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() - script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %s; sleep 1; done", outputFileSandbox) - spec := testutil.NewSpecWithArgs("bash", "-c", script) - spec.Mounts = append(spec.Mounts, specs.Mount{ - Type: "bind", - Destination: "/tmp2", - Source: os.TempDir(), - }) + script := "for ((i=0; ;i++)); do echo $i >> /tmp2/output; sleep 1; done" + spec := testutil.NewSpecWithArgs("bash", "-c", script) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: "/tmp2", + Source: dir, + }) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Set the image path, which is where the checkpoint image will be saved. - imagePath := filepath.Join(os.TempDir(), "test-image-file") + // Set the image path, which is where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") - // Create the image file and open for writing. - file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) - if err != nil { - t.Fatalf("error opening new file at imagePath: %v", err) - } - defer file.Close() + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() - time.Sleep(1 * time.Second) + time.Sleep(1 * time.Second) - // Checkpoint running container; save state into new file. 
- if err := cont.Checkpoint(file); err != nil { - t.Fatalf("error checkpointing container to empty file: %v", err) - } - defer os.RemoveAll(imagePath) + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + defer os.RemoveAll(imagePath) - lastNum, err := readOutputNum(outputFile, false) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + lastNum, err := readOutputNum(outputFile, false) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Delete and recreate file before restoring. - if err := os.Remove(outputPath); err != nil { - t.Fatalf("error removing file") - } - outputFile2, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile2.Close() + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() - // Restore into a new container. - cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont2.Destroy() + // Restore into a new container. + cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont2.Destroy() - if err := cont2.Restore(spec, conf, imagePath); err != nil { - t.Fatalf("error restoring container: %v", err) - } + if err := cont2.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } - firstNum, err := readOutputNum(outputFile2, true) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + firstNum, err := readOutputNum(outputFile2, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. - if lastNum+1 != firstNum { - t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) - } + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + if lastNum+1 != firstNum { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) + } - // Restore into another container! - // Delete and recreate file before restoring. - if err := os.Remove(outputPath); err != nil { - t.Fatalf("error removing file") - } - outputFile3, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile3.Close() + // Restore into another container! + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile3, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile3.Close() - // Restore into a new container. - cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont3.Destroy() + // Restore into a new container. 
+ cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont3.Destroy() - if err := cont3.Restore(spec, conf, imagePath); err != nil { - t.Fatalf("error restoring container: %v", err) - } + if err := cont3.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } - firstNum2, err := readOutputNum(outputFile3, true) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + firstNum2, err := readOutputNum(outputFile3, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. - if lastNum+1 != firstNum2 { - t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + if lastNum+1 != firstNum2 { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) + } } - } // TestPauseResume tests that we can successfully pause and resume a container. @@ -617,102 +663,105 @@ func TestCheckpointRestore(t *testing.T) { // It will then unpause and confirm that both processes are running. Then it will // wait until one sleep completes and check to make sure the other is running. func TestPauseResume(t *testing.T) { - const uid = 343 - spec := testutil.NewSpecWithArgs("sleep", "20") - - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "20") - // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - execArgs := control.ExecArgs{ - Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - // First, start running exec (whick blocks). 
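waitForProcessList and getAndCheckProcLists are used throughout these tests, but their definitions are not part of this change. A rough sketch of what such a helper could look like inside the existing test package, assuming the container exposes its process list through a Processes method returning []*control.Process (that accessor and procListsEqual are assumptions, not the actual API):

// waitForProcessList polls until the container's process list matches the
// expected one, or gives up after a fixed deadline.
func waitForProcessList(cont *container.Container, want []*control.Process) error {
	deadline := time.Now().Add(10 * time.Second)
	for {
		// Assumed accessor; the real helper may obtain the list differently.
		got, err := cont.Processes()
		if err == nil && procListsEqual(got, want) {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("timed out waiting for process list %+v: %v", want, err)
		}
		time.Sleep(100 * time.Millisecond)
	}
}

// procListsEqual compares only the fields these tests assert on.
func procListsEqual(got, want []*control.Process) bool {
	if len(got) != len(want) {
		return false
	}
	for i := range got {
		if got[i].UID != want[i].UID || got[i].PID != want[i].PID || got[i].Cmd != want[i].Cmd {
			return false
		}
	}
	return true
}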
- go cont.Execute(&execArgs) + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + } - // Verify that "sleep 5" is running. - if err := waitForProcessList(cont, expectedPL); err != nil { - t.Fatal(err) - } + // First, start running exec (whick blocks). + go cont.Execute(&execArgs) - // Pause the running container. - if err := cont.Pause(); err != nil { - t.Errorf("error pausing container: %v", err) - } - if got, want := cont.Status, container.Paused; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // Verify that "sleep 5" is running. + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatal(err) + } - time.Sleep(10 * time.Second) + // Pause the running container. + if err := cont.Pause(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, container.Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - // Verify that the two processes still exist. Sleep 5 is paused so - // it should still be in the process list after 10 seconds. - if err := getAndCheckProcLists(cont, expectedPL); err != nil { - t.Fatal(err) - } + time.Sleep(10 * time.Second) - // Resume the running container. - if err := cont.Resume(); err != nil { - t.Errorf("error pausing container: %v", err) - } - if got, want := cont.Status, container.Running; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // Verify that the two processes still exist. Sleep 5 is paused so + // it should still be in the process list after 10 seconds. + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } - if err := getAndCheckProcLists(cont, expectedPL); err != nil { - t.Fatal(err) - } + // Resume the running container. + if err := cont.Resume(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - expectedPL2 := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } + + expectedPL2 := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - // Verify there is only one process left since we waited 10 at most seconds for - // sleep 5 to end. - if err := waitForProcessList(cont, expectedPL2); err != nil { - t.Fatal(err) + // Verify there is only one process left since we waited 10 at most seconds for + // sleep 5 to end. + if err := waitForProcessList(cont, expectedPL2); err != nil { + t.Fatal(err) + } } } @@ -721,8 +770,8 @@ func TestPauseResume(t *testing.T) { // occurs given the correct state. func TestPauseResumeStatus(t *testing.T) { spec := testutil.NewSpecWithArgs("sleep", "20") - - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -780,297 +829,321 @@ func TestPauseResumeStatus(t *testing.T) { func TestCapabilities(t *testing.T) { const uid = 343 const gid = 2401 - spec := testutil.NewSpecWithArgs("sleep", "100") - // We generate files in the host temporary directory. 
- spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: os.TempDir(), - Source: os.TempDir(), - Type: "bind", - }) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + spec := testutil.NewSpecWithArgs("sleep", "100") - // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // We generate files in the host temporary directory. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: os.TempDir(), + Source: os.TempDir(), + Type: "bind", + }) - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "exe", - }, - } - if err := waitForProcessList(s, expectedPL[:1]); err != nil { - t.Fatalf("Failed to wait for sleep to start, err: %v", err) - } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // Create an executable that can't be run with the specified UID:GID. - // This shouldn't be callable within the container until we add the - // CAP_DAC_OVERRIDE capability to skip the access check. - exePath := filepath.Join(rootDir, "exe") - if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { - t.Fatalf("couldn't create executable: %v", err) - } - defer os.Remove(exePath) + // Create and start the container. + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Need to traverse the intermediate directory. - os.Chmod(rootDir, 0755) + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + }, + } + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Fatalf("Failed to wait for sleep to start, err: %v", err) + } - execArgs := control.ExecArgs{ - Filename: exePath, - Argv: []string{exePath}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - KGID: gid, - Capabilities: &auth.TaskCapabilities{}, - } + // Create an executable that can't be run with the specified UID:GID. + // This shouldn't be callable within the container until we add the + // CAP_DAC_OVERRIDE capability to skip the access check. + exePath := filepath.Join(rootDir, "exe") + if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { + t.Fatalf("couldn't create executable: %v", err) + } + defer os.Remove(exePath) + + // Need to traverse the intermediate directory. 
+ os.Chmod(rootDir, 0755) + + execArgs := control.ExecArgs{ + Filename: exePath, + Argv: []string{exePath}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + KGID: gid, + Capabilities: &auth.TaskCapabilities{}, + } - // "exe" should fail because we don't have the necessary permissions. - if _, err := s.Execute(&execArgs); err == nil { - t.Fatalf("container executed without error, but an error was expected") - } + // "exe" should fail because we don't have the necessary permissions. + if _, err := s.Execute(&execArgs); err == nil { + t.Fatalf("container executed without error, but an error was expected") + } - // Now we run with the capability enabled and should succeed. - execArgs.Capabilities = &auth.TaskCapabilities{ - EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), - } - // "exe" should not fail this time. - if _, err := s.Execute(&execArgs); err != nil { - t.Fatalf("container failed to exec %v: %v", execArgs, err) + // Now we run with the capability enabled and should succeed. + execArgs.Capabilities = &auth.TaskCapabilities{ + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + } + // "exe" should not fail this time. + if _, err := s.Execute(&execArgs); err != nil { + t.Fatalf("container failed to exec %v: %v", execArgs, err) + } } } // Test that an tty FD is sent over the console socket if one is provided. func TestConsoleSocket(t *testing.T) { - spec := testutil.NewSpecWithArgs("true") - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) + spec := testutil.NewSpecWithArgs("true") + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // Create a named socket and start listening. We use a relative path - // to avoid overflowing the unix path length limit (108 chars). - socketPath := filepath.Join(bundleDir, "socket") - cwd, err := os.Getwd() - if err != nil { - t.Fatalf("error getting cwd: %v", err) - } - socketRelPath, err := filepath.Rel(cwd, socketPath) - if err != nil { - t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) - } - if len(socketRelPath) > len(socketPath) { - socketRelPath = socketPath - } - srv, err := unet.BindAndListen(socketRelPath, false) - if err != nil { - t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) - } - defer os.Remove(socketPath) + // Create a named socket and start listening. We use a relative path + // to avoid overflowing the unix path length limit (108 chars). + socketPath := filepath.Join(bundleDir, "socket") + cwd, err := os.Getwd() + if err != nil { + t.Fatalf("error getting cwd: %v", err) + } + socketRelPath, err := filepath.Rel(cwd, socketPath) + if err != nil { + t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + } + if len(socketRelPath) > len(socketPath) { + socketRelPath = socketPath + } + srv, err := unet.BindAndListen(socketRelPath, false) + if err != nil { + t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) + } + defer os.Remove(socketPath) - // Create the container and pass the socket name. 
- id := testutil.UniqueContainerID() - s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } + // Create the container and pass the socket name. + id := testutil.UniqueContainerID() + s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } - // Open the othe end of the socket. - sock, err := srv.Accept() - if err != nil { - t.Fatalf("error accepting socket connection: %v", err) - } + // Open the othe end of the socket. + sock, err := srv.Accept() + if err != nil { + t.Fatalf("error accepting socket connection: %v", err) + } - // Allow 3 fds to be received. We only expect 1. - r := sock.Reader(true /* blocking */) - r.EnableFDs(1) + // Allow 3 fds to be received. We only expect 1. + r := sock.Reader(true /* blocking */) + r.EnableFDs(1) - // The socket is closed right after sending the FD, so EOF is - // an allowed error. - b := [][]byte{{}} - if _, err := r.ReadVec(b); err != nil && err != io.EOF { - t.Fatalf("error reading from socket connection: %v", err) - } + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + t.Fatalf("error reading from socket connection: %v", err) + } - // We should have gotten a control message. - fds, err := r.ExtractFDs() - if err != nil { - t.Fatalf("error extracting fds from socket connection: %v", err) - } - if len(fds) != 1 { - t.Fatalf("got %d fds from socket, wanted 1", len(fds)) - } + // We should have gotten a control message. + fds, err := r.ExtractFDs() + if err != nil { + t.Fatalf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + t.Fatalf("got %d fds from socket, wanted 1", len(fds)) + } - // Verify that the fd is a terminal. - if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { - t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) - } + // Verify that the fd is a terminal. + if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) + } - // Shut it down. - if err := s.Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } + // Shut it down. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } - // Close socket. - if err := srv.Close(); err != nil { - t.Fatalf("error destroying container: %v", err) + // Close socket. + if err := srv.Close(); err != nil { + t.Fatalf("error destroying container: %v", err) + } } } // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/true") - spec.Process.User.UID = 343 - spec.Process.User.GID = 2401 + for _, conf := range configs(kvm) { + t.Logf("Running test with conf: %+v", conf) - // User that container runs as can't list '$TMP/blocked' and would fail to - // mount it. 
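The console socket test above only exercises the receiving end of the handshake: it accepts a connection and extracts a single file descriptor from the control message. For context, the sending side of that kind of handshake can be sketched with the standard library alone; this illustrates SCM_RIGHTS descriptor passing in general, not the code the sandbox itself uses, and the socket path is an example.

package main

import (
	"net"
	"os"
	"syscall"
)

// sendFD connects to a listening Unix domain socket and passes fd to the
// peer as SCM_RIGHTS ancillary data.
func sendFD(socketPath string, fd int) error {
	conn, err := net.DialUnix("unix", nil, &net.UnixAddr{Name: socketPath, Net: "unix"})
	if err != nil {
		return err
	}
	defer conn.Close()

	// One byte of ordinary data plus the descriptor encoded as a control message.
	rights := syscall.UnixRights(fd)
	_, _, err = conn.WriteMsgUnix([]byte{0}, rights, nil)
	return err
}

func main() {
	// Example: pass stdin's descriptor (a terminal when run interactively).
	if err := sendFD("/tmp/console.sock", int(os.Stdin.Fd())); err != nil {
		panic(err)
	}
}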
- dir, err := ioutil.TempDir("", "blocked") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - if err := os.Chmod(dir, 0700); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", dir, err) - } - dir = path.Join(dir, "test") - if err := os.Mkdir(dir, 0755); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", dir, err) - } + spec := testutil.NewSpecWithArgs("/bin/true") + spec.Process.User.UID = 343 + spec.Process.User.GID = 2401 + + // User that container runs as can't list '$TMP/blocked' and would fail to + // mount it. + dir, err := ioutil.TempDir("", "blocked") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + if err := os.Chmod(dir, 0700); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } + dir = path.Join(dir, "test") + if err := os.Mkdir(dir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } - // We generate files in the host temporary directory. - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: dir, - Type: "bind", - }) + // We generate files in the host temporary directory. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + }) - if err := run(spec); err != nil { - t.Fatalf("error running sadbox: %v", err) + if err := run(spec, conf); err != nil { + t.Fatalf("error running sadbox: %v", err) + } } } // TestMountNewDir checks that runsc will create destination directory if it // doesn't exit. func TestMountNewDir(t *testing.T) { - srcDir := path.Join(os.TempDir(), "src", "newdir", "anotherdir") - if err := os.MkdirAll(srcDir, 0755); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) - } + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) - // Attempt to remove dir to ensure it doesn't exist. - mountDir := path.Join(os.TempDir(), "newdir") - if err := os.RemoveAll(mountDir); err != nil { - t.Fatalf("os.RemoveAll(%q) failed: %v", mountDir, err) - } - mountDir = path.Join(mountDir, "anotherdir") + srcDir := path.Join(os.TempDir(), "src", "newdir", "anotherdir") + if err := os.MkdirAll(srcDir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) + } + + // Attempt to remove dir to ensure it doesn't exist. 
+ mountDir := path.Join(os.TempDir(), "newdir") + if err := os.RemoveAll(mountDir); err != nil { + t.Fatalf("os.RemoveAll(%q) failed: %v", mountDir, err) + } + mountDir = path.Join(mountDir, "anotherdir") - spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: mountDir, - Source: srcDir, - Type: "bind", - }) + spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: mountDir, + Source: srcDir, + Type: "bind", + }) - if err := run(spec); err != nil { - t.Fatalf("error running sadbox: %v", err) + if err := run(spec, conf); err != nil { + t.Fatalf("error running sadbox: %v", err) + } } } func TestReadonlyRoot(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") - spec.Root.Readonly = true - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + + spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec.Root.Readonly = true + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - conf.Overlay = true + conf.Overlay = true - // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - ws, err := s.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) + // Create, start and wait for the container. 
+ s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } } } func TestReadonlyMount(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo/file") - dir, err := ioutil.TempDir("", "ro-mount") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: "/foo", - Source: dir, - Type: "bind", - Options: []string{"ro"}, - }) - spec.Root.Readonly = false + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + spec := testutil.NewSpecWithArgs("/bin/touch", "/foo/file") + dir, err := ioutil.TempDir("", "ro-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: "/foo", + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - conf.Overlay = true + conf.Overlay = true - // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - ws, err := s.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) + // Create, start and wait for the container. + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } } } @@ -1089,7 +1162,8 @@ func TestAbbreviatedIDs(t *testing.T) { } for _, cid := range cids { spec := testutil.NewSpecWithArgs("sleep", "100") - bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -1134,70 +1208,74 @@ func TestAbbreviatedIDs(t *testing.T) { // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. 
func TestMultiContainerSanity(t *testing.T) { - containerIDs := []string{ - testutil.UniqueContainerID(), - testutil.UniqueContainerID(), - } - containerAnnotations := []map[string]string{ - // The first container creates a sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, - }, - // The second container creates a container within the first - // container's sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, - specutils.ContainerdSandboxIDAnnotation: containerIDs[0], - }, - } - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) - // Setup the containers. - containers := make([]*container.Container, 0, len(containerIDs)) - for i, annotations := range containerAnnotations { - spec := testutil.NewSpecWithArgs("sleep", "100") - spec.Annotations = annotations - bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) + containerIDs := []string{ + testutil.UniqueContainerID(), + testutil.UniqueContainerID(), } - defer os.RemoveAll(bundleDir) - cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + containerAnnotations := []map[string]string{ + // The first container creates a sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + }, + // The second container creates a container within the first + // container's sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: containerIDs[0], + }, + } + + rootDir, err := testutil.SetupRootDir() if err != nil { - t.Fatalf("error creating container: %v", err) + t.Fatalf("error creating root dir: %v", err) } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) + defer os.RemoveAll(rootDir) + + // Setup the containers. + containers := make([]*container.Container, 0, len(containerIDs)) + for i, annotations := range containerAnnotations { + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Annotations = annotations + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) } - containers = append(containers, cont) - } - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: 0, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: 0, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - // Check via ps that multiple processes are running. - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) + // Check via ps that multiple processes are running. 
+ if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } } } @@ -1238,7 +1316,8 @@ func TestMultiContainerWait(t *testing.T) { for i, annotations := range containerAnnotations { spec := testutil.NewSpecWithArgs(args[i][0], args[i][1]) spec.Annotations = annotations - bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index fee2de283..40337bc53 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -31,7 +31,8 @@ func init() { func TestGoferExits(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 6aec54abe..3ebcc1362 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "docker.go", "testutil.go", + "testutil_race.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", visibility = [ diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 9d70d29f2..c7cef9c75 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -29,6 +29,9 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) +// RaceEnabled is set to true if it was built with '--race' option. +var RaceEnabled = false + // ConfigureExePath configures the executable for runsc in the test environment. func ConfigureExePath() error { @@ -66,6 +69,18 @@ func ConfigureExePath() error { return nil } +// TestConfig return the default configuration to use in tests. +func TestConfig() *boot.Config { + return &boot.Config{ + Debug: true, + LogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + Strace: true, + MultiContainer: true, + } +} + // NewSpecWithArgs creates a simple spec with the given args suitable for use // in tests. func NewSpecWithArgs(args ...string) *specs.Spec { @@ -96,38 +111,29 @@ func SetupRootDir() (string, error) { // SetupContainer creates a bundle and root dir for the container, generates a // test config, and writes the spec to config.json in the bundle dir. -func SetupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { +func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) { rootDir, err = SetupRootDir() if err != nil { - return "", "", nil, err + return "", "", err } - bundleDir, conf, err = SetupContainerInRoot(rootDir, spec) - return rootDir, bundleDir, conf, err + bundleDir, err = SetupContainerInRoot(rootDir, spec, conf) + return rootDir, bundleDir, err } // SetupContainerInRoot creates a bundle for the container, generates a test // config, and writes the spec to config.json in the bundle dir. 
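The testutil change above inverts the flow: instead of SetupContainer generating a config for the caller, the caller now builds one (usually via TestConfig), adjusts it, and passes it in, with setup only filling in RootDir. A short usage sketch inside the existing test package; TestWithOverlay is an illustrative name, not a test added by this change.

func TestWithOverlay(t *testing.T) {
	// Build the config first so the test can tweak it before setup.
	conf := testutil.TestConfig()
	conf.Overlay = true // exercise the overlay file system variant

	spec := testutil.NewSpecWithArgs("sleep", "100")
	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
	if err != nil {
		t.Fatalf("error setting up container: %v", err)
	}
	defer os.RemoveAll(rootDir)
	defer os.RemoveAll(bundleDir)

	// Create and start the container with the same conf, as in the tests above.
}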
-func SetupContainerInRoot(rootDir string, spec *specs.Spec) (bundleDir string, conf *boot.Config, err error) { +func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) (bundleDir string, err error) { bundleDir, err = ioutil.TempDir("", "bundle") if err != nil { - return "", nil, fmt.Errorf("error creating bundle dir: %v", err) + return "", fmt.Errorf("error creating bundle dir: %v", err) } if err = writeSpec(bundleDir, spec); err != nil { - return "", nil, fmt.Errorf("error writing spec: %v", err) - } - - conf = &boot.Config{ - Debug: true, - LogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - RootDir: rootDir, - Strace: true, - MultiContainer: true, + return "", fmt.Errorf("error writing spec: %v", err) } - return bundleDir, conf, nil + conf.RootDir = rootDir + return bundleDir, nil } // writeSpec writes the spec to disk in the given directory. diff --git a/runsc/test/testutil/testutil_race.go b/runsc/test/testutil/testutil_race.go new file mode 100644 index 000000000..59cfdaa7b --- /dev/null +++ b/runsc/test/testutil/testutil_race.go @@ -0,0 +1,21 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package testutil + +func init() { + RaceEnabled = true +} -- cgit v1.2.3 From b5113574feb79b2266d603aa760a9df468725d87 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Mon, 23 Jul 2018 13:54:33 -0700 Subject: Created a docker integration test for a tomcat image. PiperOrigin-RevId: 205718733 Change-Id: I200b23af064d256f157baf9da5005ab16cc55928 --- runsc/test/image/BUILD | 1 + runsc/test/image/tomcat_test.go | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 runsc/test/image/tomcat_test.go (limited to 'runsc') diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index e3985ecc4..fda6f2d9c 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -8,6 +8,7 @@ go_test( srcs = [ "image_test.go", "python_test.go", + "tomcat_test.go", ], data = [ "latin10k.txt", diff --git a/runsc/test/image/tomcat_test.go b/runsc/test/image/tomcat_test.go new file mode 100644 index 000000000..578385ca7 --- /dev/null +++ b/runsc/test/image/tomcat_test.go @@ -0,0 +1,53 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package image + +import ( + "fmt" + "net/http" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +func TestTomcat(t *testing.T) { + d := testutil.MakeDocker("tomcat-test") + if out, err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { + t.Fatalf("docker run failed: %v\nout: %s", err, out) + } + defer d.CleanUp() + + // Find where port 8080 is mapped to. + port, err := d.FindPort(8080) + if err != nil { + t.Fatalf("docker.FindPort(8080) failed: %v", err) + } + + // Wait until it's up and running. + if err := d.WaitForHTTP(port, 10*time.Second); err != nil { + t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + } + + // Ensure that content is being served. + url := fmt.Sprintf("http://localhost:%d", port) + resp, err := http.Get(url) + if err != nil { + t.Errorf("Error reaching http server: %v", err) + } + if want := http.StatusOK; resp.StatusCode != want { + t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } +} -- cgit v1.2.3 From 1129b35c92034d04ed22cf69e270ed9c034069d7 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 25 Jul 2018 09:10:32 -0700 Subject: runsc: Fix "exec" command when called without --pid-file. When "exec" command is called without the "--detach" flag, we spawn a second "exec" command and wait for that one to start. We use the pid file passed in --pid-file to detect when this second command has started running. However if "exec" is called with no --pid-file flag, this system breaks down, as we don't have a pid file to wait for. This CL ensures that the second instance of the "exec" command always writes a pid-file, so the wait is successful. PiperOrigin-RevId: 206002403 Change-Id: If9f2be31eb6e831734b1b833f25054ec71ab94a6 --- runsc/cmd/exec.go | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index cbce07c8e..4ee370656 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -20,6 +20,7 @@ import ( "io/ioutil" "os" "os/exec" + "path/filepath" "strconv" "strings" "syscall" @@ -156,11 +157,28 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat Fatalf("error getting bin path: %v", err) } var args []string + + // The command needs to write a pid file so that execAndWait can tell + // when it has started. If no pid-file was provided, we should use a + // filename in a temp directory. + pidFile := ex.pidFile + if pidFile == "" { + tmpDir, err := ioutil.TempDir("", "exec-pid-") + if err != nil { + Fatalf("error creating TempDir: %v", err) + } + defer os.RemoveAll(tmpDir) + pidFile = filepath.Join(tmpDir, "pid") + args = append(args, "--pid-file="+pidFile) + } + + // Add the rest of the args, excluding the "detach" flag. for _, a := range os.Args[1:] { if !strings.Contains(a, "detach") { args = append(args, a) } } + cmd := exec.Command(binPath, args...) cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout @@ -175,7 +193,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // '--process' file is deleted as soon as this process returns and the child // may fail to read it. ready := func() (bool, error) { - _, err := os.Stat(ex.pidFile) + _, err := os.Stat(pidFile) if err == nil { // File appeared, we're done! 
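The tomcat test above is the template for HTTP-serving image tests: run the image with a published port, discover the host mapping, wait for the server, then fetch a page and check the status code. A hedged sketch of how the same helpers would be reused for another image, inside the same image test package with its existing imports; the nginx image and test name are illustrative, not part of this change.

func TestNginx(t *testing.T) {
	d := testutil.MakeDocker("nginx-test")
	if out, err := d.Run("-p", "80", "nginx"); err != nil {
		t.Fatalf("docker run failed: %v\nout: %s", err, out)
	}
	defer d.CleanUp()

	// Find where port 80 is mapped to on the host.
	port, err := d.FindPort(80)
	if err != nil {
		t.Fatalf("docker.FindPort(80) failed: %v", err)
	}

	// Wait until the server answers, then check the response code.
	if err := d.WaitForHTTP(port, 10*time.Second); err != nil {
		t.Fatalf("docker.WaitForHTTP() timeout: %v", err)
	}
	resp, err := http.Get(fmt.Sprintf("http://localhost:%d", port))
	if err != nil {
		t.Fatalf("error reaching http server: %v", err)
	}
	if want := http.StatusOK; resp.StatusCode != want {
		t.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want)
	}
}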
return true, nil -- cgit v1.2.3 From e5adf42f66a3090f6124bceb5487238bf7526302 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 25 Jul 2018 17:36:52 -0700 Subject: Replace sleeps with waits in tests - part I PiperOrigin-RevId: 206084473 Change-Id: I44e1b64b9cdd2964357799dca27cc0cbc19ce07d --- runsc/container/container_test.go | 34 ++++++++++++++++++++++------------ runsc/test/testutil/BUILD | 1 + runsc/test/testutil/testutil.go | 10 ++++++++++ 3 files changed, 33 insertions(+), 12 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 34febe038..50f038450 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -122,18 +122,24 @@ func createWriteableOutputFile(path string) (*os.File, error) { return outputFile, nil } -func readOutputNum(f *os.File, first bool) (int, error) { - var num int - time.Sleep(1 * time.Second) - - // Check that f exists and contains counting data. - fileInfo, err := f.Stat() - if err != nil { - return 0, fmt.Errorf("error creating output file: %v", err) +func waitForFile(f *os.File) error { + op := func() error { + fi, err := f.Stat() + if err != nil { + return err + } + if fi.Size() == 0 { + return fmt.Errorf("file %q is empty", f.Name()) + } + return nil } + return testutil.Poll(op, 5*time.Second) +} - if fileInfo.Size() == 0 { - return 0, fmt.Errorf("failed to write to file, file still appears empty") +func readOutputNum(f *os.File, first bool) (int, error) { + // Wait until file has contents. + if err := waitForFile(f); err != nil { + return 0, err } // Read the first number in the new file @@ -147,6 +153,7 @@ func readOutputNum(f *os.File, first bool) (int, error) { nums := strings.Split(string(b), "\n") + var num int if first { num, err = strconv.Atoi(nums[0]) } else { @@ -579,7 +586,10 @@ func TestCheckpointRestore(t *testing.T) { } defer file.Close() - time.Sleep(1 * time.Second) + // Wait until application has ran. + if err := waitForFile(outputFile); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } // Checkpoint running container; save state into new file. if err := cont.Checkpoint(file); err != nil { @@ -727,7 +737,7 @@ func TestPauseResume(t *testing.T) { t.Errorf("container status got %v, want %v", got, want) } - time.Sleep(10 * time.Second) + time.Sleep(6 * time.Second) // Verify that the two processes still exist. Sleep 5 is paused so // it should still be in the process list after 10 seconds. diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 3ebcc1362..03ab3c4ac 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -16,6 +16,7 @@ go_library( deps = [ "//runsc/boot", "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index c7cef9c75..721478353 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -16,6 +16,7 @@ package testutil import ( + "context" "encoding/json" "fmt" "io" @@ -24,6 +25,7 @@ import ( "path/filepath" "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -172,3 +174,11 @@ func Copy(src, dst string) error { _, err = io.Copy(out, in) return err } + +// Poll is a shorthand function to poll for something with given timeout. 
+func Poll(cb func() error, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + return backoff.Retry(cb, b) +} -- cgit v1.2.3 From b8f96a9d0b9868060025e7a89e99e1b30d17fa8b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 27 Jul 2018 10:08:59 -0700 Subject: Replace sleeps with waits in tests - part II PiperOrigin-RevId: 206333130 Change-Id: Ic85874dbd53c5de2164a7bb75769d52d43666c2a --- runsc/container/container_test.go | 43 ++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 50f038450..7953f3380 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -673,11 +673,26 @@ func TestCheckpointRestore(t *testing.T) { // It will then unpause and confirm that both processes are running. Then it will // wait until one sleep completes and check to make sure the other is running. func TestPauseResume(t *testing.T) { - for _, conf := range configs(all) { + for _, conf := range configs(kvm) { t.Logf("Running test with conf: %+v", conf) const uid = 343 spec := testutil.NewSpecWithArgs("sleep", "20") + dir, err := ioutil.TempDir("", "pause-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + lock, err := ioutil.TempFile(dir, "lock") + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer lock.Close() + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: "/tmp2", + Source: dir, + }) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -709,19 +724,20 @@ func TestPauseResume(t *testing.T) { PID: 2, PPID: 0, C: 0, - Cmd: "sleep", + Cmd: "bash", }, } + script := fmt.Sprintf("while [[ -f /tmp2/%s ]]; do sleep 0.1; done", filepath.Base(lock.Name())) execArgs := control.ExecArgs{ - Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, + Filename: "/bin/bash", + Argv: []string{"bash", "-c", script}, Envv: []string{"PATH=" + os.Getenv("PATH")}, WorkingDirectory: "/", KUID: uid, } - // First, start running exec (whick blocks). + // First, start running exec (which blocks). go cont.Execute(&execArgs) // Verify that "sleep 5" is running. @@ -737,10 +753,14 @@ func TestPauseResume(t *testing.T) { t.Errorf("container status got %v, want %v", got, want) } - time.Sleep(6 * time.Second) + if err := os.Remove(lock.Name()); err != nil { + t.Fatalf("os.Remove(lock) failed: %v", err) + } + // Script loops and sleeps for 100ms. Give a bit a time for it to exit in + // case pause didn't work. + time.Sleep(200 * time.Millisecond) - // Verify that the two processes still exist. Sleep 5 is paused so - // it should still be in the process list after 10 seconds. + // Verify that the two processes still exist. if err := getAndCheckProcLists(cont, expectedPL); err != nil { t.Fatal(err) } @@ -753,10 +773,6 @@ func TestPauseResume(t *testing.T) { t.Errorf("container status got %v, want %v", got, want) } - if err := getAndCheckProcLists(cont, expectedPL); err != nil { - t.Fatal(err) - } - expectedPL2 := []*control.Process{ { UID: 0, @@ -767,8 +783,7 @@ func TestPauseResume(t *testing.T) { }, } - // Verify there is only one process left since we waited 10 at most seconds for - // sleep 5 to end. 
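The pause/resume change above replaces a fixed sleep with a sentinel ("lock") file: the container script loops while the file exists, and the test deletes it when it wants the process to exit. The same coordination idea, sketched as a standalone program outside the sandbox, assuming bash is available on the host:

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
)

func main() {
	// Sentinel file: the child loops while it exists; removing it signals exit.
	dir, err := ioutil.TempDir("", "lock-demo")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	lock := filepath.Join(dir, "lock")
	if err := ioutil.WriteFile(lock, nil, 0644); err != nil {
		panic(err)
	}

	script := fmt.Sprintf("while [[ -f %s ]]; do sleep 0.1; done", lock)
	cmd := exec.Command("bash", "-c", script)
	if err := cmd.Start(); err != nil {
		panic(err)
	}

	// ... assertions that would otherwise need a fixed sleep go here ...

	// Deleting the sentinel lets the child finish promptly and deterministically.
	if err := os.Remove(lock); err != nil {
		panic(err)
	}
	if err := cmd.Wait(); err != nil {
		panic(err)
	}
	fmt.Println("child exited after the lock file was removed")
}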
+ // Verify that deleting the file triggered the process to exit. if err := waitForProcessList(cont, expectedPL2); err != nil { t.Fatal(err) } -- cgit v1.2.3 From 3188859742e802a2e7d1d5d0ab22a6e2b426dfb8 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 27 Jul 2018 17:56:41 -0700 Subject: Make runsc visibility public. (Why not?) PiperOrigin-RevId: 206401282 Change-Id: Iadcb7fb8472de7aef7c4bf5182e9a1d339e4d259 --- runsc/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index 2f0bbaf2b..a033c7caf 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -9,7 +9,7 @@ go_binary( ], pure = "on", visibility = [ - "//runsc:__subpackages__", + "//visibility:public", ], x_defs = {"main.gitRevision": "{GIT_REVISION}"}, deps = [ -- cgit v1.2.3 From 543c997978525ac7de3a24ff73203ddbb2cef6dc Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Mon, 30 Jul 2018 17:16:49 -0700 Subject: Cleans up files created if there is a failure. PiperOrigin-RevId: 206674267 Change-Id: Ifc4eb19e0882e8bed566e9c553af910925fe6ae2 --- runsc/fsgofer/fsgofer.go | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index f685738c3..52cdc91a2 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -392,11 +392,17 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid } if err := fchown(fd, uid, gid); err != nil { syscall.Close(fd) + if e := syscall.Unlinkat(l.controlFD(), name); e != nil { + log.Warningf("error unlinking file %q after failed chown: %v", name, e) + } return nil, nil, p9.QID{}, 0, extractErrno(err) } stat, err := stat(fd) if err != nil { syscall.Close(fd) + if e := syscall.Unlinkat(l.controlFD(), name); e != nil { + log.Warningf("error unlinking file %q after failed stat: %v", name, e) + } return nil, nil, p9.QID{}, 0, extractErrno(err) } -- cgit v1.2.3 From 6cad96f38a6de187d2aa3640c492bdfbdbdc589b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 31 Jul 2018 11:37:51 -0700 Subject: Drop dup2 filter It is unused. 
PiperOrigin-RevId: 206798328 Change-Id: I2d7d27c0e4a0ef51264b900f14f1b3fdad17f2c4 --- runsc/boot/filter/config.go | 1 - 1 file changed, 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index fdc3e02c6..0ce49b3b2 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -30,7 +30,6 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_CLONE: {}, syscall.SYS_CLOSE: {}, syscall.SYS_DUP: {}, - syscall.SYS_DUP2: {}, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: {}, -- cgit v1.2.3 From 413bfb39a940455cb116c7d0ca715b2ced78a11c Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 31 Jul 2018 15:06:36 -0700 Subject: Use backoff package for retry logic PiperOrigin-RevId: 206834838 Change-Id: I9a44c6fa5f4766a01f86e90810f025cefecdf2d4 --- runsc/specutils/BUILD | 1 + runsc/specutils/specutils.go | 30 ++++++++++++++---------------- runsc/specutils/specutils_test.go | 4 ++-- 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'runsc') diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 34c952bdf..a22ab789a 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -13,6 +13,7 @@ go_library( "//pkg/abi/linux", "//pkg/log", "//pkg/sentry/kernel/auth", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 861e7fd70..27441cbde 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -26,6 +26,7 @@ import ( "syscall" "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" @@ -313,33 +314,30 @@ func SandboxID(spec *specs.Spec) (string, bool) { // the 'ready' function returns true. It continues to wait if 'ready' returns // false. It returns error on timeout, if the process stops or if 'ready' fails. func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { - backoff := 1 * time.Millisecond - for start := time.Now(); time.Now().Sub(start) < timeout; { + b := backoff.NewExponentialBackOff() + b.InitialInterval = 1 * time.Millisecond + b.MaxInterval = 1 * time.Second + b.MaxElapsedTime = timeout + + op := func() error { if ok, err := ready(); err != nil { - return err + return backoff.Permanent(err) } else if ok { return nil } // Check if the process is still running. - var ws syscall.WaitStatus - var ru syscall.Rusage - // If the process is alive, child is 0 because of the NOHANG option. // If the process has terminated, child equals the process id. + var ws syscall.WaitStatus + var ru syscall.Rusage child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru) if err != nil { - return fmt.Errorf("error waiting for process: %v", err) + return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err)) } else if child == pid { - return fmt.Errorf("process %d has terminated", pid) - } - - // Process continues to run, backoff and retry. 
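The WaitForReady rewrite above leans on two pieces of the backoff package: an ExponentialBackOff capped by MaxElapsedTime, and backoff.Permanent to abort retries for failures that waiting cannot fix (a ready() error or a terminated process). A minimal standalone sketch of that split between retryable and permanent errors:

package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/cenkalti/backoff"
)

func main() {
	b := backoff.NewExponentialBackOff()
	b.InitialInterval = 1 * time.Millisecond
	b.MaxInterval = 1 * time.Second
	b.MaxElapsedTime = 5 * time.Second

	attempt := 0
	op := func() error {
		attempt++
		// A failure that retrying cannot fix would be wrapped in
		// backoff.Permanent so Retry returns it immediately.
		if attempt < 3 {
			return errors.New("not ready yet") // retryable; Retry backs off and calls op again
		}
		return nil
	}

	if err := backoff.Retry(op, b); err != nil {
		fmt.Println("gave up:", err)
		return
	}
	fmt.Printf("ready after %d attempts\n", attempt)
}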
- time.Sleep(backoff) - backoff *= 2 - if backoff > 1*time.Second { - backoff = 1 * time.Second + return backoff.Permanent(fmt.Errorf("process %d has terminated", pid)) } + return fmt.Errorf("process %d not running yet", pid) } - return fmt.Errorf("timed out waiting for process (%d)", pid) + return backoff.Retry(op, b) } diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 2dc5d90cc..2c4e3e729 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -94,8 +94,8 @@ func TestWaitForReadyTimeout(t *testing.T) { err := WaitForReady(cmd.Process.Pid, 50*time.Millisecond, func() (bool, error) { return false, nil }) - if !strings.Contains(err.Error(), "timed out") { - t.Errorf("ProcessWaitReady got: %v, expected: timed out", err) + if !strings.Contains(err.Error(), "not running yet") { + t.Errorf("ProcessWaitReady got: %v, expected: not running yet", err) } cmd.Process.Kill() } -- cgit v1.2.3 From 3cd7824410302da00d1c8c8323db8959a124814a Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 1 Aug 2018 20:21:00 -0700 Subject: Move stack clock to options struct PiperOrigin-RevId: 207039273 Change-Id: Ib8f55a6dc302052ab4a10ccd70b07f0d73b373df --- pkg/dhcp/dhcp_test.go | 2 +- pkg/tcpip/adapters/gonet/gonet_test.go | 2 +- pkg/tcpip/network/arp/arp_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/tcpip/stack/stack.go | 15 ++++++++++++++- pkg/tcpip/stack/stack_test.go | 22 +++++++++++----------- pkg/tcpip/stack/transport_test.go | 8 ++++---- pkg/tcpip/transport/tcp/tcp_test.go | 6 +++--- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/udp/udp_test.go | 2 +- runsc/boot/loader.go | 2 +- 12 files changed, 40 insertions(+), 27 deletions(-) (limited to 'runsc') diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index 565b64045..731ed61a5 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -46,7 +46,7 @@ func TestDHCP(t *testing.T) { } }() - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{udp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName}, []string{udp.ProtocolName}, stack.Options{}) const nicid tcpip.NICID = 1 if err := s.CreateNIC(nicid, id); err != nil { diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 4c0855854..86a82f21d 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -57,7 +57,7 @@ func TestTimeouts(t *testing.T) { func newLoopbackStack() (*stack.Stack, *tcpip.Error) { // Create the stack and add a NIC. 
- s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName, udp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName, udp.ProtocolName}, stack.Options{}) if err := s.CreateNIC(NICID, loopback.New()); err != nil { return nil, err diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index c35299f3f..8fc79dc94 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -43,7 +43,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, arp.ProtocolName}, []string{ping.ProtocolName4}) + s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, []string{ping.ProtocolName4}, stack.Options{}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 1915f7ef9..3030e84a7 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -123,7 +123,7 @@ func main() { // Create the stack with ipv4 and tcp protocols, then add a tun-based // NIC and ipv4 address. - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}, stack.Options{}) mtu, err := rawfile.GetMTU(tunName) if err != nil { diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index e01adf635..9cced35eb 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -109,7 +109,7 @@ func main() { // Create the stack with ip and tcp protocols, then add a tun-based // NIC and address. - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}, []string{tcp.ProtocolName}, stack.Options{}) mtu, err := rawfile.GetMTU(tunName) if err != nil { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index b9d0a1762..9cdc7b6d8 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -285,6 +285,14 @@ type Stack struct { clock tcpip.Clock } +// Options contains optional Stack configuration. +type Options struct { + // Clock is an optional clock source used for timestampping packets. + // + // If no Clock is specified, the clock source will be time.Now. + Clock tcpip.Clock +} + // New allocates a new networking stack with only the requested networking and // transport protocols configured with default options. // @@ -292,7 +300,12 @@ type Stack struct { // SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the // stack. Please refer to individual protocol implementations as to what options // are supported. 
-func New(clock tcpip.Clock, network []string, transport []string) *Stack { +func New(network []string, transport []string, opts Options) *Stack { + clock := opts.Clock + if clock == nil { + clock = &tcpip.StdClock{} + } + s := &Stack{ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 04806865d..57de5b93a 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -186,7 +186,7 @@ func TestNetworkReceive(t *testing.T) { // Create a stack with the fake network protocol, one nic, and two // addresses attached to it: 1 & 2. id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -280,7 +280,7 @@ func TestNetworkSend(t *testing.T) { // address: 1. The route table sends all packets through the only // existing nic. id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("NewNIC failed: %v", err) } @@ -302,7 +302,7 @@ func TestNetworkSendMultiRoute(t *testing.T) { // Create a stack with the fake network protocol, two nics, and two // addresses per nic, the first nic has odd address, the second one has // even addresses. - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id1, linkEP1 := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id1); err != nil { @@ -381,7 +381,7 @@ func TestRoutes(t *testing.T) { // Create a stack with the fake network protocol, two nics, and two // addresses per nic, the first nic has odd address, the second one has // even addresses. 
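With the signature above, callers no longer pass a clock positionally; they hand stack.New an Options struct, and tcpip.StdClock is used when Options.Clock is left nil. A hedged sketch of the two call shapes, using protocol names and import paths as they appear elsewhere in this tree; the remaining call-site conversions below all follow the first form.

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
)

func newStacks(clock tcpip.Clock) {
	// Typical callers (tests, samples) accept the default clock.
	_ = stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName, udp.ProtocolName}, stack.Options{})

	// Callers that need an injected clock, such as the sentry loader, set
	// it explicitly in Options.
	_ = stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}, stack.Options{Clock: clock})
}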
- s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id1, _ := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id1); err != nil { @@ -445,7 +445,7 @@ func TestRoutes(t *testing.T) { } func TestAddressRemoval(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -489,7 +489,7 @@ func TestAddressRemoval(t *testing.T) { } func TestDelayedRemovalDueToRoute(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -557,7 +557,7 @@ func TestDelayedRemovalDueToRoute(t *testing.T) { } func TestPromiscuousMode(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -617,7 +617,7 @@ func TestAddressSpoofing(t *testing.T) { srcAddr := tcpip.Address("\x01") dstAddr := tcpip.Address("\x02") - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id, _ := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -658,7 +658,7 @@ func TestAddressSpoofing(t *testing.T) { // Set the subnet, then check that packet is delivered. func TestSubnetAcceptsMatchingPacket(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -692,7 +692,7 @@ func TestSubnetAcceptsMatchingPacket(t *testing.T) { // Set destination outside the subnet, then check it doesn't get delivered. func TestSubnetRejectsNonmatchingPacket(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) + s := stack.New([]string{"fakeNet"}, nil, stack.Options{}) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -724,7 +724,7 @@ func TestSubnetRejectsNonmatchingPacket(t *testing.T) { } func TestNetworkOptions(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{}) + s := stack.New([]string{"fakeNet"}, []string{}, stack.Options{}) // Try an unsupported network protocol. 
if err := s.SetNetworkProtocolOption(tcpip.NetworkProtocolNumber(99999), fakeNetGoodOption(false)); err != tcpip.ErrUnknownProtocol { diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index bd0802ccb..98d2f9d99 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -220,7 +220,7 @@ func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error { func TestTransportReceive(t *testing.T) { id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}, stack.Options{}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -280,7 +280,7 @@ func TestTransportReceive(t *testing.T) { func TestTransportControlReceive(t *testing.T) { id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}, stack.Options{}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -346,7 +346,7 @@ func TestTransportControlReceive(t *testing.T) { func TestTransportSend(t *testing.T) { id, _ := channel.New(10, defaultMTU, "") - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}, stack.Options{}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -383,7 +383,7 @@ func TestTransportSend(t *testing.T) { } func TestTransportOptions(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}, stack.Options{}) // Try an unsupported transport protocol. if err := s.SetTransportProtocolOption(tcpip.TransportProtocolNumber(99999), fakeTransportGoodOption(false)); err != tcpip.ErrUnknownProtocol { diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 1b8463541..45ebca5b1 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2579,7 +2579,7 @@ func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { } func TestDefaultBufferSizes(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}, stack.Options{}) // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) @@ -2625,7 +2625,7 @@ func TestDefaultBufferSizes(t *testing.T) { } func TestMinMaxBufferSizes(t *testing.T) { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}, stack.Options{}) // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) @@ -2675,7 +2675,7 @@ func TestSelfConnect(t *testing.T) { // it checks that if an endpoint binds to say 127.0.0.1:1000 then // connects to 127.0.0.1:1000, then it will be connected to itself, and // is able to send and receive data through the same endpoint. 
- s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}, stack.Options{}) id := loopback.New() if testing.Verbose() { diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 9deae09e3..e44979527 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -139,7 +139,7 @@ type Context struct { // New allocates and initializes a test context containing a new // stack and a link-layer endpoint. func New(t *testing.T, mtu uint32) *Context { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName}, stack.Options{}) // Allow minimum send/receive buffer sizes to be 1 during tests. if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{1, tcp.DefaultBufferSize, tcp.DefaultBufferSize * 10}); err != nil { diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 3d5956145..7203d7705 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -66,7 +66,7 @@ type headers struct { } func newDualTestContext(t *testing.T, mtu uint32) *testContext { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{udp.ProtocolName}) + s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{udp.ProtocolName}, stack.Options{}) id, linkEP := channel.New(256, mtu, "") if testing.Verbose() { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 66394cdf8..2f976cd52 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -534,7 +534,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) inet.Stack { // NetworkNone sets up loopback using netstack. netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} - return &epsocket.Stack{stack.New(clock, netProtos, protoNames)} + return &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})} default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) -- cgit v1.2.3 From 4c1167de4ee2aa7b71729ff8b1c742b4183168d1 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 2 Aug 2018 12:40:29 -0700 Subject: Isolate image pulling time from container startup mysql image test is timing out sporadically and it's hard to tell where the slow down in coming from. 
PiperOrigin-RevId: 207147237 Change-Id: I05a4d2c116292695d63cf861f3b89cd1c54b6106 --- runsc/test/image/image_test.go | 9 +++++++++ runsc/test/image/python_test.go | 3 +++ runsc/test/image/tomcat_test.go | 3 +++ runsc/test/testutil/docker.go | 39 +++++++++++++++++++++++---------------- 4 files changed, 38 insertions(+), 16 deletions(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 04c334d92..248934484 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -82,6 +82,9 @@ func testHTTPServer(port int) error { } func TestHttpd(t *testing.T) { + if out, err := testutil.Pull("httpd"); err != nil { + t.Fatalf("docker pull failed: %v\nout: %s", err, out) + } d := testutil.MakeDocker("http-test") dir, err := testutil.PrepareFiles("latin10k.txt") @@ -112,6 +115,9 @@ func TestHttpd(t *testing.T) { } func TestNginx(t *testing.T) { + if out, err := testutil.Pull("nginx"); err != nil { + t.Fatalf("docker pull failed: %v\nout: %s", err, out) + } d := testutil.MakeDocker("net-test") dir, err := testutil.PrepareFiles("latin10k.txt") @@ -142,6 +148,9 @@ func TestNginx(t *testing.T) { } func TestMysql(t *testing.T) { + if out, err := testutil.Pull("mysql"); err != nil { + t.Fatalf("docker pull failed: %v\nout: %s", err, out) + } d := testutil.MakeDocker("mysql-test") // Start the container. diff --git a/runsc/test/image/python_test.go b/runsc/test/image/python_test.go index e7324e83e..b77a6ec87 100644 --- a/runsc/test/image/python_test.go +++ b/runsc/test/image/python_test.go @@ -24,6 +24,9 @@ import ( ) func TestPythonHello(t *testing.T) { + if out, err := testutil.Pull("google/python-hello"); err != nil { + t.Fatalf("docker pull failed: %v\nout: %s", err, out) + } d := testutil.MakeDocker("python-hello-test") if out, err := d.Run("-p", "8080", "google/python-hello"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) diff --git a/runsc/test/image/tomcat_test.go b/runsc/test/image/tomcat_test.go index 578385ca7..dd47ab6da 100644 --- a/runsc/test/image/tomcat_test.go +++ b/runsc/test/image/tomcat_test.go @@ -24,6 +24,9 @@ import ( ) func TestTomcat(t *testing.T) { + if out, err := testutil.Pull("tomcat:8.0"); err != nil { + t.Fatalf("docker pull failed: %v\nout: %s", err, out) + } d := testutil.MakeDocker("tomcat-test") if out, err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 4eb049591..ec5ff850b 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -94,6 +94,24 @@ func getLocalPath(file string) string { return path.Join(".", file) } +// do executes docker command. +func do(args ...string) (string, error) { + fmt.Printf("Running: docker %s\n", args) + cmd := exec.Command("docker", args...) + out, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("error executing docker %s: %v", args, err) + } + return string(out), nil +} + +// Pull pulls a docker image. This is used in tests to isolate the +// time to pull the image off the network from the time to actually +// start the container, to avoid timeouts over slow networks. +func Pull(image string) (string, error) { + return do("pull", image) +} + // Docker contains the name and the runtime of a docker container. 
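The do and Pull helpers above split the one-time image fetch from container startup, which is the point of this change: a slow docker pull no longer counts against the start-up timeout. A sketch of the test skeleton this enables, assuming the helper signatures as they stand at this point in the series (image name and port are placeholders); the Docker type the helpers hang off is defined next.

package image

import (
	"testing"
	"time"

	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)

func TestExampleServer(t *testing.T) {
	// Fetch the image first so network latency is paid before any
	// container start-up timeout begins.
	if out, err := testutil.Pull("httpd"); err != nil {
		t.Fatalf("docker pull failed: %v\nout: %s", err, out)
	}

	d := testutil.MakeDocker("example-test")
	if out, err := d.Run("-p", "80", "httpd"); err != nil {
		t.Fatalf("docker run failed: %v\nout: %s", err, out)
	}
	defer d.CleanUp()

	port, err := d.FindPort(80)
	if err != nil {
		t.Fatalf("docker.FindPort(80) failed: %v", err)
	}
	if err := d.WaitForHTTP(port, 5*time.Second); err != nil {
		t.Fatalf("WaitForHTTP() timeout: %v", err)
	}
}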
type Docker struct { Runtime string @@ -107,30 +125,19 @@ func MakeDocker(namePrefix string) Docker { return Docker{Name: namePrefix + suffix, Runtime: runtime()} } -// Do executes docker command. -func (d *Docker) Do(args ...string) (string, error) { - fmt.Printf("Running: docker %s\n", args) - cmd := exec.Command("docker", args...) - out, err := cmd.CombinedOutput() - if err != nil { - return "", fmt.Errorf("error executing docker %s: %v", args, err) - } - return string(out), nil -} - // Run calls 'docker run' with the arguments provided. func (d *Docker) Run(args ...string) (string, error) { a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"} a = append(a, args...) - return d.Do(a...) + return do(a...) } // CleanUp kills and deletes the container. func (d *Docker) CleanUp() error { - if _, err := d.Do("kill", d.Name); err != nil { + if _, err := do("kill", d.Name); err != nil { return fmt.Errorf("error killing container %q: %v", d.Name, err) } - if _, err := d.Do("rm", d.Name); err != nil { + if _, err := do("rm", d.Name); err != nil { return fmt.Errorf("error deleting container %q: %v", d.Name, err) } return nil @@ -140,7 +147,7 @@ func (d *Docker) CleanUp() error { // docker to allocate a free port in the host and prevent conflicts. func (d *Docker) FindPort(sandboxPort int) (int, error) { format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort) - out, err := d.Do("inspect", "-f", format, d.Name) + out, err := do("inspect", "-f", format, d.Name) if err != nil { return -1, fmt.Errorf("error retrieving port: %v", err) } @@ -158,7 +165,7 @@ func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) error { var out string for exp := time.Now().Add(timeout); time.Now().Before(exp); { var err error - out, err = d.Do("logs", d.Name) + out, err = do("logs", d.Name) if err != nil { return err } -- cgit v1.2.3 From bc9a1fca23870b21e16e024220e0c87e236c6cf5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 6 Aug 2018 11:47:07 -0700 Subject: Tiny reordering to network code PiperOrigin-RevId: 207581723 Change-Id: I6e4eb1227b5ed302de5e6c891040b670955f1eea --- runsc/sandbox/network.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index d6685fd66..d0ce6228b 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -221,12 +221,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { continue } - // Get the link for the interface. - ifaceLink, err := netlink.LinkByName(iface.Name) - if err != nil { - return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err) - } - // Create the socket. const protocol = 0x0300 // htons(ETH_P_ALL) fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) @@ -238,7 +232,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { // Bind to the appropriate device. ll := syscall.SockaddrLinklayer{ Protocol: protocol, - Ifindex: ifaceLink.Attrs().Index, + Ifindex: iface.Index, Hatype: 0, // No ARP type. Pkttype: syscall.PACKET_OTHERHOST, } @@ -266,6 +260,12 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { Routes: routes, } + // Get the link for the interface. 
+ ifaceLink, err := netlink.LinkByName(iface.Name) + if err != nil { + return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err) + } + // Collect the addresses for the interface, enable forwarding, // and remove them from the host. for _, addr := range ip4addrs { -- cgit v1.2.3 From 9752174a7f211328c0ff59f8ed6c51325a6fc23d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 6 Aug 2018 18:07:15 -0700 Subject: Disable KVM dimension because it's making the test flaky PiperOrigin-RevId: 207642348 Change-Id: Iacec9f097ab93b91c0c8eea61b1347e864f57a8b --- runsc/container/container_test.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 7953f3380..9e38f5f77 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -212,12 +212,13 @@ func configs(opts configOptions) []*boot.Config { cs = append(cs, c) } - // TODO: KVM doesn't work with --race. - if !testutil.RaceEnabled && opts&kvm != 0 { - c := testutil.TestConfig() - c.Platform = boot.PlatformKVM - cs = append(cs, c) - } + // TODO: KVM tests are flaky. Disable until fixed. + // // TODO: KVM doesn't work with --race. + // if !testutil.RaceEnabled && opts&kvm != 0 { + // c := testutil.TestConfig() + // c.Platform = boot.PlatformKVM + // cs = append(cs, c) + // } return cs } -- cgit v1.2.3 From cb23232c37c092b60d7e3ee91cb8dd8bed855028 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 7 Aug 2018 13:47:16 -0700 Subject: Fix build break in test integration_test runs manually and breakage wasn't detected. Added test to kokoro to ensure breakages are detected in the future. PiperOrigin-RevId: 207772835 Change-Id: Iada81b579b558477d4db3516b38366ef6a2e933d --- kokoro/run_tests.sh | 8 ++- runsc/test/image/image_test.go | 40 +++++++------- runsc/test/image/python_test.go | 12 ++-- runsc/test/image/tomcat_test.go | 12 ++-- runsc/test/integration/integration_test.go | 88 ++++++++++++++++-------------- runsc/test/testutil/docker.go | 80 ++++++++++++++++++++------- runsc/test/testutil/testutil.go | 10 ++++ 7 files changed, 155 insertions(+), 95 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 0069aa0c4..665d63390 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -44,10 +44,14 @@ bazel test --test_output=errors //... exit_code=${?} if [[ ${exit_code} -eq 0 ]]; then + # These names are used to exclude tests not supported in certain + # configuration, e.g. save/restore not supported with hostnet. declare -a variations=("" "-kvm" "-hostnet" "-overlay") for v in "${variations[@]}"; do - # image_test is tagged manual - bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}${v} //runsc/test/image:image_test + # Run runsc tests with docker that are tagged manual. 
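For context on the sandbox/network.go reordering above: binding the AF_PACKET socket only needs the interface index, which the net.Interface value already carries, so the netlink lookup can move down next to the code that actually consumes the link. A reduced sketch of the bind step under that assumption, with the protocol constant and sockaddr fields as in the surrounding code; the kokoro script change continues below.

package example

import (
	"net"
	"syscall"
)

// bindRawSocket opens a raw AF_PACKET socket and binds it to the interface
// using only the index carried by net.Interface.
func bindRawSocket(iface net.Interface) (int, error) {
	const protocol = 0x0300 // htons(ETH_P_ALL)
	fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
	if err != nil {
		return -1, err
	}
	ll := syscall.SockaddrLinklayer{
		Protocol: protocol,
		Ifindex:  iface.Index,
		Pkttype:  syscall.PACKET_OTHERHOST,
	}
	if err := syscall.Bind(fd, &ll); err != nil {
		syscall.Close(fd)
		return -1, err
	}
	return fd, nil
}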
+ bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}${v} \ + //runsc/test/image:image_test \ + //runsc/test/integration:integration_test exit_code=${?} if [[ ${exit_code} -ne 0 ]]; then break diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 248934484..962c31b24 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -39,8 +39,8 @@ import ( func TestHelloWorld(t *testing.T) { d := testutil.MakeDocker("hello-test") - if out, err := d.Run("hello-world"); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if _, err := d.Run("hello-world"); err != nil { + t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -82,8 +82,8 @@ func testHTTPServer(port int) error { } func TestHttpd(t *testing.T) { - if out, err := testutil.Pull("httpd"); err != nil { - t.Fatalf("docker pull failed: %v\nout: %s", err, out) + if err := testutil.Pull("httpd"); err != nil { + t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("http-test") @@ -93,8 +93,8 @@ func TestHttpd(t *testing.T) { } // Start the container. - if out, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/local/apache2/htdocs:ro"), "httpd"); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if _, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/local/apache2/htdocs:ro"), "httpd"); err != nil { + t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -105,8 +105,8 @@ func TestHttpd(t *testing.T) { } // Wait until it's up and running. - if err := d.WaitForHTTP(port, 5*time.Second); err != nil { - t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + if err := testutil.WaitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) } if err := testHTTPServer(port); err != nil { @@ -115,8 +115,8 @@ func TestHttpd(t *testing.T) { } func TestNginx(t *testing.T) { - if out, err := testutil.Pull("nginx"); err != nil { - t.Fatalf("docker pull failed: %v\nout: %s", err, out) + if err := testutil.Pull("nginx"); err != nil { + t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("net-test") @@ -126,8 +126,8 @@ func TestNginx(t *testing.T) { } // Start the container. - if out, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/share/nginx/html:ro"), "nginx"); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if _, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/share/nginx/html:ro"), "nginx"); err != nil { + t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -138,8 +138,8 @@ func TestNginx(t *testing.T) { } // Wait until it's up and running. - if err := d.WaitForHTTP(port, 5*time.Second); err != nil { - t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + if err := testutil.WaitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) } if err := testHTTPServer(port); err != nil { @@ -148,14 +148,14 @@ func TestNginx(t *testing.T) { } func TestMysql(t *testing.T) { - if out, err := testutil.Pull("mysql"); err != nil { - t.Fatalf("docker pull failed: %v\nout: %s", err, out) + if err := testutil.Pull("mysql"); err != nil { + t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("mysql-test") // Start the container. 
- if out, err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if _, err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { + t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -178,8 +178,8 @@ func TestMysql(t *testing.T) { "mysql", "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql", } - if out, err := client.Run(args...); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if _, err := client.Run(args...); err != nil { + t.Fatalf("docker run failed: %v", err) } defer client.CleanUp() diff --git a/runsc/test/image/python_test.go b/runsc/test/image/python_test.go index b77a6ec87..a8d28e080 100644 --- a/runsc/test/image/python_test.go +++ b/runsc/test/image/python_test.go @@ -24,12 +24,12 @@ import ( ) func TestPythonHello(t *testing.T) { - if out, err := testutil.Pull("google/python-hello"); err != nil { - t.Fatalf("docker pull failed: %v\nout: %s", err, out) + if err := testutil.Pull("google/python-hello"); err != nil { + t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("python-hello-test") - if out, err := d.Run("-p", "8080", "google/python-hello"); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if _, err := d.Run("-p", "8080", "google/python-hello"); err != nil { + t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -40,8 +40,8 @@ func TestPythonHello(t *testing.T) { } // Wait until it's up and running. - if err := d.WaitForHTTP(port, 10*time.Second); err != nil { - t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) } // Ensure that content is being served. diff --git a/runsc/test/image/tomcat_test.go b/runsc/test/image/tomcat_test.go index dd47ab6da..97cf95834 100644 --- a/runsc/test/image/tomcat_test.go +++ b/runsc/test/image/tomcat_test.go @@ -24,12 +24,12 @@ import ( ) func TestTomcat(t *testing.T) { - if out, err := testutil.Pull("tomcat:8.0"); err != nil { - t.Fatalf("docker pull failed: %v\nout: %s", err, out) + if err := testutil.Pull("tomcat:8.0"); err != nil { + t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("tomcat-test") - if out, err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if _, err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { + t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -40,8 +40,8 @@ func TestTomcat(t *testing.T) { } // Wait until it's up and running. - if err := d.WaitForHTTP(port, 10*time.Second); err != nil { - t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) } // Ensure that content is being served. diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 09d845bfc..67b58523d 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -37,11 +37,9 @@ import ( "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) -// This container is a docker image for the Flask microframework hello world application. -const container = "python-hello-test" - // httpRequestSucceeds sends a request to a given url and checks that the status is OK. 
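The image tests above now call testutil.WaitForHTTP rather than a method on Docker; the actual definition lands in the testutil.go hunk further down this patch, built on the existing Poll helper. A simplified, self-contained sketch of that pair, assuming the same backoff library; the real Poll threads a context deadline instead of a retry count, and the constant 100ms interval matches it. The integration test changes continue below.

package testutil

import (
	"fmt"
	"net/http"
	"time"

	"github.com/cenkalti/backoff"
)

// Poll retries cb at a constant 100ms interval until it succeeds or the
// retry budget derived from timeout runs out (simplified from the real
// helper, which uses a context deadline).
func Poll(cb func() error, timeout time.Duration) error {
	interval := 100 * time.Millisecond
	b := backoff.WithMaxRetries(backoff.NewConstantBackOff(interval), uint64(timeout/interval))
	return backoff.Retry(cb, b)
}

// WaitForHTTP keeps issuing GETs against localhost:port until one succeeds.
func WaitForHTTP(port int, timeout time.Duration) error {
	cb := func() error {
		_, err := http.Get(fmt.Sprintf("http://localhost:%d/", port))
		return err
	}
	return Poll(cb, timeout)
}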
-func httpRequestSucceeds(client http.Client, url string) error { +func httpRequestSucceeds(client http.Client, server string, port int) error { + url := fmt.Sprintf("http://%s:%d", server, port) // Ensure that content is being served. resp, err := client.Get(url) if err != nil { @@ -55,33 +53,50 @@ func httpRequestSucceeds(client http.Client, url string) error { // TestLifeCycle tests a basic Create/Start/Stop docker container life cycle. func TestLifeCycle(t *testing.T) { - d := testutil.MakeDocker(container) - - // Test docker create. - if out, err := d.Do("create", "--runtime", d.Runtime, "--name", d.Name, "-p", "8080", "google/python-hello"); err != nil { - t.Fatalf("docker create failed: %v\nout: %s", err, out) + if err := testutil.Pull("nginx"); err != nil { + t.Fatalf("docker pull failed: %v", err) } - - // Test docker start. - if out, err := d.Do("start", d.Name); err != nil { + d := testutil.MakeDocker("lifecycle-test") + if err := d.Create("-p", "80", "nginx"); err != nil { + t.Fatalf("docker create failed: %v", err) + } + if err := d.Start(); err != nil { d.CleanUp() - t.Fatalf("docker start failed: %v\nout: %s", err, out) + t.Fatalf("docker start failed: %v", err) } - // Test docker stop. - if out, err := d.Do("stop", d.Name); err != nil { - d.CleanUp() - t.Fatalf("docker stop failed: %v\nout: %s", err, out) + // Test that container is working + port, err := d.FindPort(80) + if err != nil { + t.Fatalf("docker.FindPort(80) failed: %v", err) + } + if err := testutil.WaitForHTTP(port, 5*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) + } + client := http.Client{Timeout: time.Duration(2 * time.Second)} + if err := httpRequestSucceeds(client, "localhost", port); err != nil { + t.Errorf("http request failed: %v", err) } - // Test removing the container. - if out, err := d.Do("rm", d.Name); err != nil { - t.Fatalf("docker rm failed: %v\nout: %s", err, out) + if err := d.Stop(); err != nil { + d.CleanUp() + t.Fatalf("docker stop failed: %v", err) + } + if err := d.Remove(); err != nil { + t.Fatalf("docker rm failed: %v", err) } } func TestPauseResume(t *testing.T) { - d := testutil.MakeDocker(container) + if !testutil.IsPauseResumeSupported() { + t.Log("Pause/resume is not supported, skipping test.") + return + } + + if err := testutil.Pull("google/python-hello"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("pause-resume-test") if out, err := d.Run("-p", "8080", "google/python-hello"); err != nil { t.Fatalf("docker run failed: %v\nout: %s", err, out) } @@ -94,28 +109,22 @@ func TestPauseResume(t *testing.T) { } // Wait until it's up and running. - if err := d.WaitForHTTP(port, 5*time.Second); err != nil { - t.Fatalf("docker.WaitForHTTP() timeout: %v", err) - } - - timeout := time.Duration(2 * time.Second) - client := http.Client{ - Timeout: timeout, + if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) } - url := fmt.Sprintf("http://localhost:%d", port) // Check that container is working. - if err := httpRequestSucceeds(client, url); err != nil { + client := http.Client{Timeout: time.Duration(2 * time.Second)} + if err := httpRequestSucceeds(client, "localhost", port); err != nil { t.Errorf("http request failed: %v", err) } - // Pause container. 
- if out, err := d.Do("pause", d.Name); err != nil { - t.Fatalf("docker pause failed: %v\nout: %s", err, out) + if err := d.Pause(); err != nil { + t.Fatalf("docker pause failed: %v", err) } // Check if container is paused. - switch _, err := client.Get(url); v := err.(type) { + switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) { case nil: t.Errorf("http req expected to fail but it succeeded") case net.Error: @@ -126,18 +135,17 @@ func TestPauseResume(t *testing.T) { t.Errorf("http req got unexpected error %v", v) } - // Resume container. - if out, err := d.Do("unpause", d.Name); err != nil { - t.Fatalf("docker unpause failed: %v\nout: %s", err, out) + if err := d.Unpause(); err != nil { + t.Fatalf("docker unpause failed: %v", err) } // Wait until it's up and running. - if err := d.WaitForHTTP(port, 5*time.Second); err != nil { - t.Fatalf("docker.WaitForHTTP() timeout: %v", err) + if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. - if err := httpRequestSucceeds(client, url); err != nil { + if err := httpRequestSucceeds(client, "localhost", port); err != nil { t.Errorf("http request failed: %v", err) } } diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index ec5ff850b..6825ed9ec 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -19,7 +19,6 @@ import ( "io/ioutil" "log" "math/rand" - "net/http" "os" "os/exec" "path" @@ -41,6 +40,12 @@ func runtime() string { return r } +// IsPauseResumeSupported returns true if Pause/Resume is supported by runtime. +func IsPauseResumeSupported() bool { + // Native host network stack can't be saved. + return !strings.Contains(runtime(), "hostnet") +} + // EnsureSupportedDockerVersion checks if correct docker is installed. func EnsureSupportedDockerVersion() { cmd := exec.Command("docker", "version") @@ -100,7 +105,7 @@ func do(args ...string) (string, error) { cmd := exec.Command("docker", args...) out, err := cmd.CombinedOutput() if err != nil { - return "", fmt.Errorf("error executing docker %s: %v", args, err) + return "", fmt.Errorf("error executing docker %s: %v\nout: %s", args, err, out) } return string(out), nil } @@ -108,8 +113,9 @@ func do(args ...string) (string, error) { // Pull pulls a docker image. This is used in tests to isolate the // time to pull the image off the network from the time to actually // start the container, to avoid timeouts over slow networks. -func Pull(image string) (string, error) { - return do("pull", image) +func Pull(image string) error { + _, err := do("pull", image) + return err } // Docker contains the name and the runtime of a docker container. @@ -125,6 +131,30 @@ func MakeDocker(namePrefix string) Docker { return Docker{Name: namePrefix + suffix, Runtime: runtime()} } +// Create calls 'docker create' with the arguments provided. +func (d *Docker) Create(args ...string) error { + a := []string{"create", "--runtime", d.Runtime, "--name", d.Name} + a = append(a, args...) + _, err := do(a...) + return err +} + +// Start calls 'docker start'. +func (d *Docker) Start() error { + if _, err := do("start", d.Name); err != nil { + return fmt.Errorf("error starting container %q: %v", d.Name, err) + } + return nil +} + +// Stop calls 'docker stop'. 
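Create and Start above, together with Stop and Remove defined just below, replace the old catch-all d.Do(...) calls with one helper per docker verb, so tests read as a lifecycle rather than a list of raw arguments. A usage sketch assuming those helpers; package name, image, and port are placeholders.

package integration

import (
	"testing"

	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)

func TestLifecycleSketch(t *testing.T) {
	if err := testutil.Pull("nginx"); err != nil {
		t.Fatalf("docker pull failed: %v", err)
	}
	d := testutil.MakeDocker("lifecycle-sketch")
	if err := d.Create("-p", "80", "nginx"); err != nil {
		t.Fatalf("docker create failed: %v", err)
	}
	if err := d.Start(); err != nil {
		d.CleanUp()
		t.Fatalf("docker start failed: %v", err)
	}
	if err := d.Stop(); err != nil {
		d.CleanUp()
		t.Fatalf("docker stop failed: %v", err)
	}
	if err := d.Remove(); err != nil {
		t.Fatalf("docker rm failed: %v", err)
	}
}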
+func (d *Docker) Stop() error { + if _, err := do("stop", d.Name); err != nil { + return fmt.Errorf("error stopping container %q: %v", d.Name, err) + } + return nil +} + // Run calls 'docker run' with the arguments provided. func (d *Docker) Run(args ...string) (string, error) { a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"} @@ -132,17 +162,38 @@ func (d *Docker) Run(args ...string) (string, error) { return do(a...) } -// CleanUp kills and deletes the container. -func (d *Docker) CleanUp() error { - if _, err := do("kill", d.Name); err != nil { - return fmt.Errorf("error killing container %q: %v", d.Name, err) +// Pause calls 'docker pause'. +func (d *Docker) Pause() error { + if _, err := do("pause", d.Name); err != nil { + return fmt.Errorf("error pausing container %q: %v", d.Name, err) } + return nil +} + +// Unpause calls 'docker pause'. +func (d *Docker) Unpause() error { + if _, err := do("unpause", d.Name); err != nil { + return fmt.Errorf("error unpausing container %q: %v", d.Name, err) + } + return nil +} + +// Remove calls 'docker rm'. +func (d *Docker) Remove() error { if _, err := do("rm", d.Name); err != nil { return fmt.Errorf("error deleting container %q: %v", d.Name, err) } return nil } +// CleanUp kills and deletes the container. +func (d *Docker) CleanUp() error { + if _, err := do("kill", d.Name); err != nil { + return fmt.Errorf("error killing container %q: %v", d.Name, err) + } + return d.Remove() +} + // FindPort returns the host port that is mapped to 'sandboxPort'. This calls // docker to allocate a free port in the host and prevent conflicts. func (d *Docker) FindPort(sandboxPort int) (int, error) { @@ -177,16 +228,3 @@ func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) error { } return fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) } - -// WaitForHTTP tries GET requests on a port until the call succeeds or a timeout. -func (d *Docker) WaitForHTTP(port int, timeout time.Duration) error { - for exp := time.Now().Add(timeout); time.Now().Before(exp); { - url := fmt.Sprintf("http://localhost:%d/", port) - if _, err := http.Get(url); err == nil { - // Success! - return nil - } - time.Sleep(100 * time.Millisecond) - } - return fmt.Errorf("timeout waiting for HTTP server on port %d", port) -} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 721478353..4e7ab3760 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -21,6 +21,7 @@ import ( "fmt" "io" "io/ioutil" + "net/http" "os" "path/filepath" "time" @@ -182,3 +183,12 @@ func Poll(cb func() error, timeout time.Duration) error { b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) return backoff.Retry(cb, b) } + +// WaitForHTTP tries GET requests on a port until the call succeeds or timeout. +func WaitForHTTP(port int, timeout time.Duration) error { + cb := func() error { + _, err := http.Get(fmt.Sprintf("http://localhost:%d/", port)) + return err + } + return Poll(cb, timeout) +} -- cgit v1.2.3 From 0d350aac7f70487bc28bae0d0f457155a4e19081 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Aug 2018 10:24:53 -0700 Subject: Enable SACK in runsc SACK is disabled by default and needs to be manually enabled. It not only improves performance, but also fixes hangs downloading files from certain websites. 
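The change below turns SACK on with a single transport-protocol option right after the netstack is created. A reduced sketch of just that step, assuming the option name and protocol lists used by the loader; the ping protocol and the epsocket wrapper are omitted here.

package boot

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
)

func newSACKStack(clock tcpip.Clock) (*stack.Stack, error) {
	netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
	transProtos := []string{tcp.ProtocolName, udp.ProtocolName}
	s := stack.New(netProtos, transProtos, stack.Options{Clock: clock})
	// SACK is off by default in netstack and must be enabled explicitly.
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
		return nil, fmt.Errorf("failed to enable SACK: %v", err)
	}
	return s, nil
}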
PiperOrigin-RevId: 207906742 Change-Id: I4fb7277b67bfdf83ac8195f1b9c38265a0d51e8b --- pkg/sentry/socket/hostinet/stack.go | 6 ++++-- runsc/boot/controller.go | 5 ++++- runsc/boot/loader.go | 15 +++++++++++---- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 44c3b9a3f..f64809d39 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -84,11 +84,13 @@ func (s *Stack) Configure() error { log.Warningf("Failed to read TCP send buffer size, using default values") } - s.tcpSACKEnabled = false + // SACK is important for performance and even compatibility, assume it's + // enabled if we can't find the actual value. + s.tcpSACKEnabled = true if sack, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_sack"); err == nil { s.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" } else { - log.Warningf("Failed to read if TCP SACK if enabled, setting to false") + log.Warningf("Failed to read if TCP SACK if enabled, setting to true") } return nil diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index c6e934e66..fc6ea326a 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -285,7 +285,10 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. - networkStack := newEmptyNetworkStack(cm.l.conf, k) + networkStack, err := newEmptyNetworkStack(cm.l.conf, k) + if err != nil { + return fmt.Errorf("failed to create network: %v", err) + } info, err := o.FilePayload.Files[0].Stat() if err != nil { return err diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 2f976cd52..f6c7bf223 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -174,7 +174,10 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack := newEmptyNetworkStack(conf, k) + networkStack, err := newEmptyNetworkStack(conf, k) + if err != nil { + return nil, fmt.Errorf("failed to create network: %v", err) + } // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. @@ -525,16 +528,20 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) inet.Stack { +func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { switch conf.Network { case NetworkHost: - return hostinet.NewStack() + return hostinet.NewStack(), nil case NetworkNone, NetworkSandbox: // NetworkNone sets up loopback using netstack. 
netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} - return &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})} + s := &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})} + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %v", err) + } + return s, nil default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) -- cgit v1.2.3 From ea1e39a314d3a248d8b682a9f63e686530597d61 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Aug 2018 22:02:09 -0700 Subject: Resend packets back to netstack if destined to itself Add option to redirect packet back to netstack if it's destined to itself. This fixes the problem where connecting to the local NIC address would not work, e.g.: echo bar | nc -l -p 8080 & echo foo | nc 192.168.0.2 8080 PiperOrigin-RevId: 207995083 Change-Id: I17adc2a04df48bfea711011a5df206326a1fb8ef --- pkg/tcpip/link/fdbased/endpoint.go | 52 ++++++++++++++--------- pkg/tcpip/stack/nic.go | 3 +- pkg/tcpip/stack/route.go | 16 +++++--- pkg/tcpip/stack/stack.go | 2 +- runsc/boot/network.go | 10 ++--- runsc/test/integration/integration_test.go | 66 ++++++++++++++++++++++-------- runsc/test/testutil/docker.go | 7 ++++ 7 files changed, 107 insertions(+), 49 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 4e20cfbf8..152d8f0b2 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -55,10 +55,15 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - vv *buffer.VectorisedView - iovecs []syscall.Iovec - views []buffer.View - attached bool + vv *buffer.VectorisedView + iovecs []syscall.Iovec + views []buffer.View + dispatcher stack.NetworkDispatcher + + // handleLocal indicates whether packets destined to itself should be + // handled by the netstack internally (true) or be forwarded to the FD + // endpoint (false). + handleLocal bool } // Options specify the details about the fd-based endpoint to be created. @@ -71,6 +76,7 @@ type Options struct { Address tcpip.LinkAddress SaveRestore bool DisconnectOk bool + HandleLocal bool } // New creates a new fd-based endpoint. @@ -100,14 +106,15 @@ func New(opts *Options) tcpip.LinkEndpointID { } e := &endpoint{ - fd: opts.FD, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: hdrSize, - views: make([]buffer.View, len(BufConfig)), - iovecs: make([]syscall.Iovec, len(BufConfig)), + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + views: make([]buffer.View, len(BufConfig)), + iovecs: make([]syscall.Iovec, len(BufConfig)), + handleLocal: opts.HandleLocal, } vv := buffer.NewVectorisedView(0, e.views) e.vv = &vv @@ -117,16 +124,16 @@ func New(opts *Options) tcpip.LinkEndpointID { // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { - e.attached = true + e.dispatcher = dispatcher // Link endpoints are not savable. When transportation endpoints are // saved, they stop sending outgoing packets and all incoming packets // are rejected. - go e.dispatchLoop(dispatcher) // S/R-SAFE: See above. 
+ go e.dispatchLoop() // S/R-SAFE: See above. } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *endpoint) IsAttached() bool { - return e.attached + return e.dispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized @@ -153,6 +160,12 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress { // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r *stack.Route, hdr *buffer.Prependable, payload buffer.View, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { + if e.handleLocal && r.LocalAddress != "" && r.LocalAddress == r.RemoteAddress { + hdrView := hdr.View() + vv := buffer.NewVectorisedView(len(hdrView)+len(payload), []buffer.View{hdrView, payload}) + e.dispatcher.DeliverNetworkPacket(e, r.RemoteLinkAddress, protocol, &vv) + return nil + } if e.hdrSize > 0 { // Add ethernet header if needed. eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) @@ -165,7 +178,6 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr *buffer.Prependable, payload if len(payload) == 0 { return rawfile.NonBlockingWrite(e.fd, hdr.UsedBytes()) - } return rawfile.NonBlockingWrite2(e.fd, hdr.UsedBytes(), payload) @@ -198,7 +210,7 @@ func (e *endpoint) allocateViews(bufConfig []int) { } // dispatch reads one packet from the file descriptor and dispatches it. -func (e *endpoint) dispatch(d stack.NetworkDispatcher, largeV buffer.View) (bool, *tcpip.Error) { +func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { e.allocateViews(BufConfig) n, err := rawfile.BlockingReadv(e.fd, e.iovecs) @@ -234,7 +246,7 @@ func (e *endpoint) dispatch(d stack.NetworkDispatcher, largeV buffer.View) (bool e.vv.SetSize(n) e.vv.TrimFront(e.hdrSize) - d.DeliverNetworkPacket(e, addr, p, e.vv) + e.dispatcher.DeliverNetworkPacket(e, addr, p, e.vv) // Prepare e.views for another packet: release used views. for i := 0; i < used; i++ { @@ -246,10 +258,10 @@ func (e *endpoint) dispatch(d stack.NetworkDispatcher, largeV buffer.View) (bool // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. -func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) *tcpip.Error { +func (e *endpoint) dispatchLoop() *tcpip.Error { v := buffer.NewView(header.MaxIPPacketSize) for { - cont, err := e.dispatch(d, v) + cont, err := e.dispatch(v) if err != nil || !cont { if e.closed != nil { e.closed(err) diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 25c06cba5..c1480f97b 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -327,8 +327,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remoteLinkAddr tcpip.Lin return } - r := makeRoute(protocol, dst, src, ref) - r.LocalLinkAddress = linkEP.LinkAddress() + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) r.RemoteLinkAddress = remoteLinkAddr ref.ep.HandlePacket(&r, vv) ref.decRef() diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 200c39289..423f428df 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -50,12 +50,13 @@ type Route struct { // makeRoute initializes a new route. It takes ownership of the provided // reference to a network endpoint. 
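The fdbased changes above add a HandleLocal mode: when a packet's destination is the endpoint's own address, WritePacket hands it straight back to the network dispatcher instead of writing it to the host FD, which is what lets a container reach its own IP. The runsc boot code further below opts in when it creates the link; a reduced sketch of that call, with the field values standing in for the real configuration. The route changes that carry the local link address continue next.

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
)

// newHairpinLink builds an fd-based link endpoint that loops packets
// addressed to its own IP back into netstack via HandleLocal.
func newHairpinLink(fd int, mtu uint32, mac []byte) tcpip.LinkEndpointID {
	return fdbased.New(&fdbased.Options{
		FD:             fd,
		MTU:            mtu,
		EthernetHeader: true,
		HandleLocal:    true,
		Address:        tcpip.LinkAddress(mac),
	})
}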
-func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, ref *referencedNetworkEndpoint) Route { +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint) Route { return Route{ - NetProto: netProto, - LocalAddress: localAddr, - RemoteAddress: remoteAddr, - ref: ref, + NetProto: netProto, + LocalAddress: localAddr, + LocalLinkAddress: localLinkAddr, + RemoteAddress: remoteAddr, + ref: ref, } } @@ -92,6 +93,11 @@ func (r *Route) Resolve(waker *sleep.Waker) *tcpip.Error { nextAddr := r.NextHop if nextAddr == "" { + // Local link address is already known. + if r.RemoteAddress == r.LocalAddress { + r.RemoteLinkAddress = r.LocalLinkAddress + return nil + } nextAddr = r.RemoteAddress } linkAddr, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker) diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index fa7aeb051..6c4aa7cc5 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -687,7 +687,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n remoteAddr = ref.ep.ID().LocalAddress } - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, ref) + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref) r.NextHop = s.routeTable[i].Gateway return r, nil } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index d2b52c823..d702ae74e 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -134,11 +134,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } linkEP := fdbased.New(&fdbased.Options{ - FD: newFD, - MTU: uint32(link.MTU), - ChecksumOffload: false, - EthernetHeader: true, - Address: tcpip.LinkAddress(generateRndMac()), + FD: newFD, + MTU: uint32(link.MTU), + EthernetHeader: true, + HandleLocal: true, + Address: tcpip.LinkAddress(generateRndMac()), }) log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 67b58523d..c286e48d2 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -31,6 +31,7 @@ import ( "net" "net/http" "os" + "strings" "testing" "time" @@ -54,36 +55,36 @@ func httpRequestSucceeds(client http.Client, server string, port int) error { // TestLifeCycle tests a basic Create/Start/Stop docker container life cycle. 
func TestLifeCycle(t *testing.T) { if err := testutil.Pull("nginx"); err != nil { - t.Fatalf("docker pull failed: %v", err) + t.Fatal("docker pull failed:", err) } d := testutil.MakeDocker("lifecycle-test") if err := d.Create("-p", "80", "nginx"); err != nil { - t.Fatalf("docker create failed: %v", err) + t.Fatal("docker create failed:", err) } if err := d.Start(); err != nil { d.CleanUp() - t.Fatalf("docker start failed: %v", err) + t.Fatal("docker start failed:", err) } // Test that container is working port, err := d.FindPort(80) if err != nil { - t.Fatalf("docker.FindPort(80) failed: %v", err) + t.Fatal("docker.FindPort(80) failed: ", err) } if err := testutil.WaitForHTTP(port, 5*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) + t.Fatal("WaitForHTTP() timeout:", err) } client := http.Client{Timeout: time.Duration(2 * time.Second)} if err := httpRequestSucceeds(client, "localhost", port); err != nil { - t.Errorf("http request failed: %v", err) + t.Error("http request failed:", err) } if err := d.Stop(); err != nil { d.CleanUp() - t.Fatalf("docker stop failed: %v", err) + t.Fatal("docker stop failed:", err) } if err := d.Remove(); err != nil { - t.Fatalf("docker rm failed: %v", err) + t.Fatal("docker rm failed:", err) } } @@ -94,7 +95,7 @@ func TestPauseResume(t *testing.T) { } if err := testutil.Pull("google/python-hello"); err != nil { - t.Fatalf("docker pull failed: %v", err) + t.Fatal("docker pull failed:", err) } d := testutil.MakeDocker("pause-resume-test") if out, err := d.Run("-p", "8080", "google/python-hello"); err != nil { @@ -105,22 +106,22 @@ func TestPauseResume(t *testing.T) { // Find where port 8080 is mapped to. port, err := d.FindPort(8080) if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) + t.Fatal("docker.FindPort(8080) failed:", err) } // Wait until it's up and running. if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) + t.Fatal("WaitForHTTP() timeout:", err) } // Check that container is working. client := http.Client{Timeout: time.Duration(2 * time.Second)} if err := httpRequestSucceeds(client, "localhost", port); err != nil { - t.Errorf("http request failed: %v", err) + t.Error("http request failed:", err) } if err := d.Pause(); err != nil { - t.Fatalf("docker pause failed: %v", err) + t.Fatal("docker pause failed:", err) } // Check if container is paused. @@ -136,17 +137,50 @@ func TestPauseResume(t *testing.T) { } if err := d.Unpause(); err != nil { - t.Fatalf("docker unpause failed: %v", err) + t.Fatal("docker unpause failed:", err) } // Wait until it's up and running. if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) + t.Fatal("WaitForHTTP() timeout:", err) } // Check if container is working again. if err := httpRequestSucceeds(client, "localhost", port); err != nil { - t.Errorf("http request failed: %v", err) + t.Error("http request failed:", err) + } +} + +// Create client and server that talk to each other using the local IP. +func TestConnectToSelf(t *testing.T) { + d := testutil.MakeDocker("connect-to-self-test") + + // Creates server that replies "server" and exists. Sleeps at the end because + // 'docker exec' gets killed if the init process exists before it can finish. + if _, err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() + + // Finds IP address for eth0. 
+ ip, err := d.Exec("/bin/sh", "-c", "ifconfig eth0 | grep -E -o \".*inet [^ ]+\" | cut -d: -f2") + if err != nil { + t.Fatal("docker exec failed:", err) + } + ip = strings.TrimRight(ip, "\n") + + // Runs client that sends "client" to the server and exits. + reply, err := d.Exec("/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip)) + if err != nil { + t.Fatal("docker exec failed:", err) + } + + // Ensure both client and server got the message from each other. + if want := "server\n"; reply != want { + t.Errorf("Error on server, want: %q, got: %q", want, reply) + } + if err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil { + t.Fatal("docker.WaitForOutput(client) timeout:", err) } } diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 6825ed9ec..b7d60e712 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -162,6 +162,13 @@ func (d *Docker) Run(args ...string) (string, error) { return do(a...) } +// Exec calls 'docker exec' with the arguments provided. +func (d *Docker) Exec(args ...string) (string, error) { + a := []string{"exec", d.Name} + a = append(a, args...) + return do(a...) +} + // Pause calls 'docker pause'. func (d *Docker) Pause() error { if _, err := do("pause", d.Name); err != nil { -- cgit v1.2.3 From 4e171f7590284c1f4cedf90c92204873961b2e97 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Aug 2018 22:38:41 -0700 Subject: Basic support for ip link/addr and ifconfig Closes #94 PiperOrigin-RevId: 207997580 Change-Id: I19b426f1586b5ec12f8b0cd5884d5b401d334924 --- pkg/abi/linux/netlink_route.go | 5 +++ pkg/sentry/inet/inet.go | 3 ++ pkg/sentry/socket/epsocket/epsocket.go | 52 ++++++++++------------- pkg/sentry/socket/epsocket/stack.go | 12 ++++-- pkg/sentry/socket/netlink/route/protocol.go | 15 ++++++- pkg/sentry/socket/netlink/socket.go | 64 ++++++++++++++++++++++++++--- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/stack/nic.go | 7 ++++ pkg/tcpip/stack/registration.go | 1 + pkg/tcpip/stack/stack.go | 36 ++++++++-------- runsc/boot/network.go | 5 ++- 11 files changed, 138 insertions(+), 64 deletions(-) (limited to 'runsc') diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 0d88bc5c5..a5d778748 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -184,3 +184,8 @@ const ( IFA_MULTICAST = 7 IFA_FLAGS = 8 ) + +// Device types, from uapi/linux/if_arp.h. +const ( + ARPHRD_LOOPBACK = 772 +) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index e54a61196..30ca4e0c0 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -67,6 +67,9 @@ type Interface struct { // Addr is the hardware device address. Addr []byte + + // MTU is the maximum transmission unit. + MTU uint32 } // InterfaceAddr contains information about a network interface address. 
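The Exec helper added just above runs a command inside an already-running container, which is how the self-connect test discovers the container's own address and drives the client side. A small sketch of that usage; the /bin/sh command here is illustrative only, the test above parses ifconfig output instead. The epsocket and netlink changes for the ip/ifconfig support follow.

package integration

import (
	"strings"
	"testing"

	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)

// containerIP asks the running container for its own IP address via Exec.
func containerIP(t *testing.T, d testutil.Docker) string {
	out, err := d.Exec("/bin/sh", "-c", "hostname -i")
	if err != nil {
		t.Fatalf("docker exec failed: %v", err)
	}
	return strings.TrimSpace(out)
}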
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index f969a1d7c..b32eda96f 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -48,7 +48,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - nstack "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -452,7 +452,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( // sockets backed by a commonEndpoint. func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType unix.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { case linux.SO_TYPE: if outLen < sizeOfInt32 { @@ -634,7 +634,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // sockets backed by a commonEndpoint. func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { case linux.SO_SNDBUF: if len(optVal) < sizeOfInt32 { @@ -1191,7 +1191,9 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe if err != nil { return err } - usermem.ByteOrder.PutUint16(ifr.Data[:2], f) + // Drop the flags that don't fit in the size that we need to return. This + // matches Linux behavior. + usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) case syscall.SIOCGIFADDR: // Copy the IPv4 address out. @@ -1304,7 +1306,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { // interfaceStatusFlags returns status flags for an interface in the stack. // Flag values and meanings are described in greater detail in netdevice(7) in // the SIOCGIFFLAGS section. -func interfaceStatusFlags(stack inet.Stack, name string) (uint16, *syserr.Error) { +func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { // epsocket should only ever be passed an epsocket.Stack. epstack, ok := stack.(*Stack) if !ok { @@ -1312,37 +1314,27 @@ func interfaceStatusFlags(stack inet.Stack, name string) (uint16, *syserr.Error) } // Find the NIC corresponding to this interface. - var ( - nicid tcpip.NICID - info nstack.NICInfo - found bool - ) - ns := epstack.Stack - for nicid, info = range ns.NICInfo() { + for _, info := range epstack.Stack.NICInfo() { if info.Name == name { - found = true - break + return nicStateFlagsToLinux(info.Flags), nil } } - if !found { - return 0, syserr.ErrNoDevice - } + return 0, syserr.ErrNoDevice +} - // Set flags based on NIC state. 
- nicFlags, err := ns.NICFlags(nicid) - if err != nil { - return 0, syserr.TranslateNetstackError(err) +func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { + var rv uint32 + if f.Up { + rv |= linux.IFF_UP | linux.IFF_LOWER_UP } - - var retFlags uint16 - if nicFlags.Up { - retFlags |= linux.IFF_UP + if f.Running { + rv |= linux.IFF_RUNNING } - if nicFlags.Running { - retFlags |= linux.IFF_RUNNING + if f.Promiscuous { + rv |= linux.IFF_PROMISC } - if nicFlags.Promiscuous { - retFlags |= linux.IFF_PROMISC + if f.Loopback { + rv |= linux.IFF_LOOPBACK } - return retFlags, nil + return rv } diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 12b4b4767..e4ed52fc8 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -41,10 +41,16 @@ func (s *Stack) SupportsIPv6() bool { func (s *Stack) Interfaces() map[int32]inet.Interface { is := make(map[int32]inet.Interface) for id, ni := range s.Stack.NICInfo() { + var devType uint16 + if ni.Flags.Loopback { + devType = linux.ARPHRD_LOOPBACK + } is[int32(id)] = inet.Interface{ - Name: ni.Name, - Addr: []byte(ni.LinkAddress), - // TODO: Other fields. + Name: ni.Name, + Addr: []byte(ni.LinkAddress), + Flags: uint32(nicStateFlagsToLinux(ni.Flags)), + DeviceType: devType, + MTU: ni.MTU, } } return is diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 55a76e916..70322b9ed 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -16,6 +16,8 @@ package route import ( + "bytes" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" @@ -97,9 +99,18 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader }) m.PutAttrString(linux.IFLA_IFNAME, i.Name) + m.PutAttr(linux.IFLA_MTU, i.MTU) + + mac := make([]byte, 6) + brd := mac + if len(i.Addr) > 0 { + mac = i.Addr + brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) + } + m.PutAttr(linux.IFLA_ADDRESS, mac) + m.PutAttr(linux.IFLA_BROADCAST, brd) - // TODO: There are many more attributes, such as - // MAC address. + // TODO: There are many more attributes. } return nil diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index e15d1546c..f3b2c7256 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -16,6 +16,7 @@ package netlink import ( + "math" "sync" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -39,8 +40,18 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// defaultSendBufferSize is the default size for the send buffer. -const defaultSendBufferSize = 16 * 1024 +const sizeOfInt32 int = 4 + +const ( + // minBufferSize is the smallest size of a send buffer. + minSendBufferSize = 4 << 10 // 4096 bytes. + + // defaultSendBufferSize is the default size for the send buffer. + defaultSendBufferSize = 16 * 1024 + + // maxBufferSize is the largest size a send buffer can grow to. + maxSendBufferSize = 4 << 20 // 4MB +) // netlinkSocketDevice is the netlink socket virtual device. var netlinkSocketDevice = device.NewAnonDevice() @@ -86,7 +97,7 @@ type Socket struct { // sendBufferSize is the send buffer "size". We don't actually have a // fixed buffer but only consume this many bytes. 
- sendBufferSize uint64 + sendBufferSize uint32 } var _ socket.Socket = (*Socket)(nil) @@ -273,13 +284,54 @@ func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements socket.Socket.GetSockOpt. func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { - // TODO: no sockopts supported. + switch level { + case linux.SOL_SOCKET: + switch name { + case linux.SO_SNDBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + return int32(s.sendBufferSize), nil + + case linux.SO_RCVBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + // We don't have limit on receiving size. + return math.MaxInt32, nil + } + } + // TODO: other sockopts are not supported. return nil, syserr.ErrProtocolNotAvailable } // SetSockOpt implements socket.Socket.SetSockOpt. func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { - // TODO: no sockopts supported. + switch level { + case linux.SOL_SOCKET: + switch name { + case linux.SO_SNDBUF: + if len(opt) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + size := usermem.ByteOrder.Uint32(opt) + if size < minSendBufferSize { + size = minSendBufferSize + } else if size > maxSendBufferSize { + size = maxSendBufferSize + } + s.sendBufferSize = size + return nil + case linux.SO_RCVBUF: + if len(opt) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + // We don't have limit on receiving size. So just accept anything as + // valid for compatibility. + return nil + } + } + // TODO: other sockopts are not supported. return syserr.ErrProtocolNotAvailable } @@ -489,7 +541,7 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, // For simplicity, and consistency with Linux, we copy in the entire // message up front. - if uint64(src.NumBytes()) > s.sendBufferSize { + if src.NumBytes() > int64(s.sendBufferSize) { return 0, syserr.ErrMessageTooLong } diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index b4dc4833c..015275721 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -56,7 +56,7 @@ func (*endpoint) MTU() uint32 { // Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises // itself as supporting checksum offload, but in reality it's just omitted. func (*endpoint) Capabilities() stack.LinkEndpointCapabilities { - return stack.CapabilityChecksumOffload | stack.CapabilitySaveRestore + return stack.CapabilityChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that the diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index c1480f97b..592006a32 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -67,6 +67,13 @@ func (n *NIC) setPromiscuousMode(enable bool) { n.mu.Unlock() } +func (n *NIC) isPromiscuousMode() bool { + n.mu.RLock() + rv := n.promiscuous + n.mu.RUnlock() + return rv +} + // setSpoofing enables or disables address spoofing. 
func (n *NIC) setSpoofing(enable bool) { n.mu.Lock() diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 01a29689d..bbe887144 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -205,6 +205,7 @@ const ( CapabilityResolutionRequired CapabilitySaveRestore CapabilityDisconnectOk + CapabilityLoopback ) // LinkEndpoint is the interface implemented by data link layer protocols (e.g., diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 6c4aa7cc5..e2b9dc2c0 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -563,6 +563,12 @@ type NICInfo struct { Name string LinkAddress tcpip.LinkAddress ProtocolAddresses []tcpip.ProtocolAddress + + // Flags indicate the state of the NIC. + Flags NICStateFlags + + // MTU is the maximum transmission unit. + MTU uint32 } // NICInfo returns a map of NICIDs to their associated information. @@ -572,10 +578,18 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { nics := make(map[tcpip.NICID]NICInfo) for id, nic := range s.nics { + flags := NICStateFlags{ + Up: true, // Netstack interfaces are always up. + Running: nic.linkEP.IsAttached(), + Promiscuous: nic.isPromiscuousMode(), + Loopback: nic.linkEP.Capabilities()&CapabilityLoopback != 0, + } nics[id] = NICInfo{ Name: nic.name, LinkAddress: nic.linkEP.LinkAddress(), ProtocolAddresses: nic.Addresses(), + Flags: flags, + MTU: nic.linkEP.MTU(), } } return nics @@ -591,27 +605,9 @@ type NICStateFlags struct { // Promiscuous indicates whether the interface is in promiscuous mode. Promiscuous bool -} - -// NICFlags returns flags about the state of the NIC. It returns an error if -// the NIC corresponding to id cannot be found. -func (s *Stack) NICFlags(id tcpip.NICID) (NICStateFlags, *tcpip.Error) { - s.mu.RLock() - defer s.mu.RUnlock() - nic := s.nics[id] - if nic == nil { - return NICStateFlags{}, tcpip.ErrUnknownNICID - } - - ret := NICStateFlags{ - // Netstack interfaces are always up. - Up: true, - - Running: nic.linkEP.IsAttached(), - Promiscuous: nic.promiscuous, - } - return ret, nil + // Loopback indicates whether the interface is a loopback. + Loopback bool } // AddAddress adds a new network-layer address to the specified NIC. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index d702ae74e..0e43c91be 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -133,15 +133,16 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) } + mac := tcpip.LinkAddress(generateRndMac()) linkEP := fdbased.New(&fdbased.Options{ FD: newFD, MTU: uint32(link.MTU), EthernetHeader: true, HandleLocal: true, - Address: tcpip.LinkAddress(generateRndMac()), + Address: mac, }) - log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err } -- cgit v1.2.3 From 0ac912f99e44e8e89985dd83ec946deadbfd8797 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 9 Aug 2018 17:04:18 -0700 Subject: Fix runsc integration_test when using --network=host inethost doesn't support netlink and 'ifconfig' call to retrieve IP address fails. Look up IP address in /etc/hosts instead. 
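A hypothetical stand-alone version of that workaround, matching the container's own hostname against /etc/hosts instead of shelling out to ifconfig, is sketched below; it is not taken from the patch, assumes only the Go standard library, and relies on the conventional "<ip> <hostname>" layout that Docker writes into /etc/hosts.

```go
package main

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"strings"
)

func main() {
	hostname, err := os.Hostname()
	if err != nil {
		log.Fatal(err)
	}
	f, err := os.Open("/etc/hosts")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Each useful line has the form "<ip> <name> [aliases...]".
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		fields := strings.Fields(sc.Text())
		if len(fields) < 2 || strings.HasPrefix(fields[0], "#") {
			continue
		}
		for _, name := range fields[1:] {
			if name == hostname {
				fmt.Println(fields[0])
				return
			}
		}
	}
	log.Fatalf("no /etc/hosts entry for %q", hostname)
}
```

The test change below does the same thing with a shell pipeline, which avoids needing an extra binary in the image.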
PiperOrigin-RevId: 208135641 Change-Id: I3c2ce15db6fc7c3306a45e4bfb9cc5d4423ffad3 --- runsc/test/integration/integration_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index c286e48d2..c6b546a56 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -162,8 +162,8 @@ func TestConnectToSelf(t *testing.T) { } defer d.CleanUp() - // Finds IP address for eth0. - ip, err := d.Exec("/bin/sh", "-c", "ifconfig eth0 | grep -E -o \".*inet [^ ]+\" | cut -d: -f2") + // Finds IP address for host. + ip, err := d.Exec("/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'") if err != nil { t.Fatal("docker exec failed:", err) } -- cgit v1.2.3 From 4ececd8e8d1124cdd0884480bda5fabd2b48aa8d Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Fri, 10 Aug 2018 14:31:56 -0700 Subject: Enable checkpoint/restore in cases of UDS use. Previously, processes which used file-system Unix Domain Sockets could not be checkpoint-ed in runsc because the sockets were saved with their inode numbers which do not necessarily remain the same upon restore. Now, the sockets are also saved with their paths so that the new inodes can be determined for the sockets based on these paths after restoring. Tests for cases with UDS use are included. Test cleanup to come. PiperOrigin-RevId: 208268781 Change-Id: Ieaa5d5d9a64914ca105cae199fd8492710b1d7ec --- pkg/sentry/fs/dirent.go | 53 +++++--- pkg/sentry/fs/fsutil/inode.go | 4 +- pkg/sentry/fs/gofer/gofer_test.go | 2 +- pkg/sentry/fs/gofer/path.go | 37 ++++-- pkg/sentry/fs/gofer/session.go | 149 +++++++++++++++++++---- pkg/sentry/fs/gofer/session_state.go | 26 +++- pkg/sentry/fs/gofer/socket.go | 2 + pkg/sentry/fs/host/inode.go | 4 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 4 +- pkg/sentry/fs/ramfs/dir.go | 15 ++- pkg/sentry/fs/ramfs/ramfs.go | 4 +- pkg/sentry/fs/tty/dir.go | 4 +- pkg/sentry/socket/unix/unix.go | 4 +- runsc/container/BUILD | 10 +- runsc/container/container_test.go | 228 ++++++++++++++++++++++++++++++++--- runsc/container/uds_test_app.go | 83 +++++++++++++ 18 files changed, 541 insertions(+), 92 deletions(-) create mode 100644 runsc/container/uds_test_app.go (limited to 'runsc') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4658d044f..821cc5789 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -385,6 +385,19 @@ func (d *Dirent) fullName(root *Dirent) (string, bool) { return s, reachable } +// MountRoot finds and returns the mount-root for a given dirent. +func (d *Dirent) MountRoot() *Dirent { + renameMu.RLock() + defer renameMu.RUnlock() + + mountRoot := d + for !mountRoot.mounted && mountRoot.parent != nil { + mountRoot = mountRoot.parent + } + mountRoot.IncRef() + return mountRoot +} + func (d *Dirent) freeze() { if d.frozen { // Already frozen. @@ -665,6 +678,16 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi } child := file.Dirent + d.finishCreate(child, name) + + // Return the reference and the new file. When the last reference to + // the file is dropped, file.Dirent may no longer be cached. + return file, nil +} + +// finishCreate validates the created file, adds it as a child of this dirent, +// and notifies any watchers. 
+func (d *Dirent) finishCreate(child *Dirent, name string) { // Sanity check c, its name must be consistent. if child.name != name { panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) @@ -697,10 +720,6 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi // Allow the file system to take extra references on c. child.maybeExtendReference() - - // Return the reference and the new file. When the last reference to - // the file is dropped, file.Dirent may no longer be cached. - return file, nil } // genericCreate executes create if name does not exist. Removes a negative Dirent at name if @@ -718,11 +737,6 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c return syscall.ENOENT } - // Execute the create operation. - if err := create(); err != nil { - return err - } - // Remove any negative Dirent. We've already asserted above with d.exists // that the only thing remaining here can be a negative Dirent. if w, ok := d.children[name]; ok { @@ -745,7 +759,8 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c w.Drop() } - return nil + // Execute the create operation. + return create() } // CreateLink creates a new link in this directory. @@ -797,23 +812,29 @@ func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, } // Bind satisfies the InodeOperations interface; otherwise same as GetFile. -func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, socket unix.BoundEndpoint, perms FilePermissions) error { +func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data unix.BoundEndpoint, perms FilePermissions) (*Dirent, error) { d.dirMu.Lock() defer d.dirMu.Unlock() d.mu.Lock() defer d.mu.Unlock() + var childDir *Dirent err := d.genericCreate(ctx, root, name, func() error { - if err := d.Inode.Bind(ctx, name, socket, perms); err != nil { - return err + var e error + childDir, e = d.Inode.Bind(ctx, name, data, perms) + if e != nil { + return e } - d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + d.finishCreate(childDir, name) return nil }) if err == syscall.EEXIST { - return syscall.EADDRINUSE + return nil, syscall.EADDRINUSE + } + if err != nil { + return nil, err } - return err + return childDir, err } // CreateFifo creates a new named pipe under this dirent. diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 177396fdc..3479f2fad 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -254,8 +254,8 @@ func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs. } // Bind implements fs.InodeOperations.Bind. -func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { - return syserror.ENOTDIR +func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { + return nil, syserror.ENOTDIR } // CreateFifo implements fs.InodeOperations.CreateFifo. 
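The gofer-side changes that follow implement the strategy described in the commit message: bound socket endpoints are remembered by path before saving, because the inode numbers behind their old keys need not survive a restore, and the inode-keyed map is rebuilt by re-walking those paths afterwards. A deliberately simplified, hypothetical sketch of that idea is shown below; the types and the lookup function are stand-ins, not the real unix.BoundEndpoint, device.MultiDeviceKey, or 9P walk.

```go
package main

import "fmt"

// Stand-ins for unix.BoundEndpoint and device.MultiDeviceKey.
type endpoint string
type inodeKey uint64

// resolveKey stands in for re-walking the 9P mount to a saved path and
// reading the attributes that make up the real key after restore.
func resolveKey(path string) inodeKey {
	return inodeKey(len(path)) // placeholder lookup
}

func main() {
	// Before save: record each bound socket by its path (the "pathMap").
	pathMap := map[endpoint]string{
		"uds0": "/tmp2/uds_socket", // hypothetical socket path
	}

	// After restore: rebuild the inode-keyed map (the "keyMap") from those
	// paths, since the keys captured before the save may no longer be valid.
	keyMap := make(map[inodeKey]endpoint)
	for ep, p := range pathMap {
		keyMap[resolveKey(p)] = ep
	}
	fmt.Println(keyMap)
}
```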
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 764b530cb..45fdaacfd 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -74,7 +74,7 @@ func root(ctx context.Context, cp cachePolicy, mode p9.FileMode, size uint64) (* } rootFile := goodMockFile(mode, size) - sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr) + sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr, false /* socket */) m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) return rootFile, fs.NewInode(rootInodeOperations, m, sattr), nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index bfeab3833..15e9863fb 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -57,7 +57,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string } // Construct the Inode operations. - sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr) + sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr, false) // Construct a positive Dirent. return fs.NewDirent(fs.NewInode(node, dir.MountSource, sattr), name), nil @@ -113,7 +113,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string } // Construct the InodeOperations. - sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr) + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false) // Construct the positive Dirent. d := fs.NewDirent(fs.NewInode(iops, dir.MountSource, sattr), name) @@ -175,10 +175,10 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s return nil } -// Bind implements InodeOperations. -func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perm fs.FilePermissions) error { +// Bind implements InodeOperations.Bind. +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { if i.session().endpoints == nil { - return syscall.EOPNOTSUPP + return nil, syscall.EOPNOTSUPP } // Create replaces the directory fid with the newly created/opened @@ -186,7 +186,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // this node. _, newFile, err := i.fileState.file.walk(ctx, nil) if err != nil { - return err + return nil, err } // Stabilize the endpoint map while creation is in progress. @@ -198,7 +198,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, owner := fs.FileOwnerFromContext(ctx) hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) if err != nil { - return err + return nil, err } // We're not going to use this file. hostFile.Close() @@ -206,10 +206,10 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, i.touchModificationTime(ctx, dir) // Get the attributes of the file to create inode key. 
- qid, _, attr, err := getattr(ctx, newFile) + qid, mask, attr, err := getattr(ctx, newFile) if err != nil { newFile.close(ctx) - return err + return nil, err } key := device.MultiDeviceKey{ @@ -217,9 +217,24 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, SecondaryDevice: i.session().connID, Inode: qid.Path, } - i.session().endpoints.add(key, ep) - return nil + // Create child dirent. + + // Get an unopened p9.File for the file we created so that it can be + // cloned and re-opened multiple times after creation. + _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + if err != nil { + newFile.close(ctx) + return nil, err + } + + // Construct the InodeOperations. + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr, true) + + // Construct the positive Dirent. + childDir := fs.NewDirent(fs.NewInode(iops, dir.MountSource, sattr), name) + i.session().endpoints.add(key, childDir, ep) + return childDir, nil } // CreateFifo implements fs.InodeOperations.CreateFifo. Gofer nodes do not support the diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 648a11435..bfb1154dc 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -15,6 +15,7 @@ package gofer import ( + "fmt" "sync" "gvisor.googlesource.com/gvisor/pkg/p9" @@ -28,39 +29,60 @@ import ( ) // +stateify savable -type endpointMap struct { +type endpointMaps struct { + // mu protexts the direntMap, the keyMap, and the pathMap below. mu sync.RWMutex `state:"nosave"` - // TODO: Make map with private unix sockets savable. - m map[device.MultiDeviceKey]unix.BoundEndpoint + + // direntMap links sockets to their dirents. + // It is filled concurrently with the keyMap and is stored upon save. + // Before saving, this map is used to populate the pathMap. + direntMap map[unix.BoundEndpoint]*fs.Dirent + + // keyMap links MultiDeviceKeys (containing inode IDs) to their sockets. + // It is not stored during save because the inode ID may change upon restore. + keyMap map[device.MultiDeviceKey]unix.BoundEndpoint `state:"nosave"` + + // pathMap links the sockets to their paths. + // It is filled before saving from the direntMap and is stored upon save. + // Upon restore, this map is used to re-populate the keyMap. + pathMap map[unix.BoundEndpoint]string } -// add adds the endpoint to the map. +// add adds the endpoint to the maps. +// A reference is taken on the dirent argument. // -// Precondition: map must have been locked with 'lock'. -func (e *endpointMap) add(key device.MultiDeviceKey, ep unix.BoundEndpoint) { - e.m[key] = ep +// Precondition: maps must have been locked with 'lock'. +func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep unix.BoundEndpoint) { + e.keyMap[key] = ep + d.IncRef() + e.direntMap[ep] = d } -// remove deletes the key from the map. +// remove deletes the key from the maps. // -// Precondition: map must have been locked with 'lock'. -func (e *endpointMap) remove(key device.MultiDeviceKey) { - delete(e.m, key) +// Precondition: maps must have been locked with 'lock'. +func (e *endpointMaps) remove(key device.MultiDeviceKey) { + endpoint := e.get(key) + delete(e.keyMap, key) + + d := e.direntMap[endpoint] + d.DecRef() + delete(e.direntMap, endpoint) } // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. 
-func (e *endpointMap) lock() func() { +func (e *endpointMaps) lock() func() { e.mu.Lock() return func() { e.mu.Unlock() } } -func (e *endpointMap) get(key device.MultiDeviceKey) unix.BoundEndpoint { - e.mu.RLock() - ep := e.m[key] - e.mu.RUnlock() - return ep +// get returns the endpoint mapped to the given key. +// +// Precondition: maps must have been locked for reading. +func (e *endpointMaps) get(key device.MultiDeviceKey) unix.BoundEndpoint { + return e.keyMap[key] } // session holds state for each 9p session established during sys_mount. @@ -115,7 +137,7 @@ type session struct { // TODO: there are few possible races with someone stat'ing the // file and another deleting it concurrently, where the file will not be // reported as socket file. - endpoints *endpointMap `state:"wait"` + endpoints *endpointMaps `state:"wait"` } // Destroy tears down the session. @@ -149,7 +171,9 @@ func (s *session) SaveInodeMapping(inode *fs.Inode, path string) { // newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes // (p9.QID, p9.AttrMask, p9.Attr). -func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) { +// +// Endpoints lock must not be held if socket == false. +func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr, socket bool) (fs.StableAttr, *inodeOperations) { deviceKey := device.MultiDeviceKey{ Device: attr.RDev, SecondaryDevice: s.connID, @@ -164,10 +188,16 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p } if s.endpoints != nil { - // If unix sockets are allowed on this filesystem, check if this file is - // supposed to be a socket file. - if s.endpoints.get(deviceKey) != nil { + if socket { sattr.Type = fs.Socket + } else { + // If unix sockets are allowed on this filesystem, check if this file is + // supposed to be a socket file. + unlock := s.endpoints.lock() + if s.endpoints.get(deviceKey) != nil { + sattr.Type = fs.Socket + } + unlock() } } @@ -215,7 +245,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF } if o.privateunixsocket { - s.endpoints = &endpointMap{m: make(map[device.MultiDeviceKey]unix.BoundEndpoint)} + s.endpoints = newEndpointMaps() } // Construct the MountSource with the session and superBlockFlags. @@ -248,6 +278,77 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF return nil, err } - sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr) + sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr, false) return fs.NewInode(iops, m, sattr), nil } + +// newEndpointMaps creates a new endpointMaps. +func newEndpointMaps() *endpointMaps { + return &endpointMaps{ + direntMap: make(map[unix.BoundEndpoint]*fs.Dirent), + keyMap: make(map[device.MultiDeviceKey]unix.BoundEndpoint), + pathMap: make(map[unix.BoundEndpoint]string), + } +} + +// fillKeyMap populates key and dirent maps upon restore from saved +// pathmap. 
+func (s *session) fillKeyMap(ctx context.Context) error { + unlock := s.endpoints.lock() + defer unlock() + + for ep, dirPath := range s.endpoints.pathMap { + _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) + if err != nil { + return fmt.Errorf("error filling endpointmaps, failed to walk to %q: %v", dirPath, err) + } + + qid, _, attr, err := file.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + return fmt.Errorf("failed to get file attributes of %s: %v", dirPath, err) + } + + key := device.MultiDeviceKey{ + Device: attr.RDev, + SecondaryDevice: s.connID, + Inode: qid.Path, + } + + s.endpoints.keyMap[key] = ep + } + return nil +} + +// fillPathMap populates paths for endpoints from dirents in direntMap +// before save. +func (s *session) fillPathMap() error { + unlock := s.endpoints.lock() + defer unlock() + + for ep, dir := range s.endpoints.direntMap { + mountRoot := dir.MountRoot() + defer mountRoot.DecRef() + dirPath, _ := dir.FullName(mountRoot) + if dirPath == "" { + return fmt.Errorf("error getting path from dirent") + } + s.endpoints.pathMap[ep] = dirPath + } + return nil +} + +// restoreEndpointMaps recreates and fills the key and dirent maps. +func (s *session) restoreEndpointMaps(ctx context.Context) error { + // When restoring, only need to create the keyMap because the dirent and path + // maps got stored through the save. + s.endpoints.keyMap = make(map[device.MultiDeviceKey]unix.BoundEndpoint) + if err := s.fillKeyMap(ctx); err != nil { + return fmt.Errorf("failed to insert sockets into endpoint map: %v", err) + } + + // Re-create pathMap because it can no longer be trusted as socket paths can + // change while process continues to run. Empty pathMap will be re-filled upon + // next save. + s.endpoints.pathMap = make(map[unix.BoundEndpoint]string) + return nil +} diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 0154810c8..8e6424492 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -18,16 +18,17 @@ import ( "fmt" "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/unet" ) // beforeSave is invoked by stateify. -// -// TODO: Make map with private unix sockets savable. -func (e *endpointMap) beforeSave() { - if len(e.m) != 0 { - panic("EndpointMap with existing private unix sockets cannot be saved") +func (s *session) beforeSave() { + if s.endpoints != nil { + if err := s.fillPathMap(); err != nil { + panic("failed to save paths to endpoint map before saving" + err.Error()) + } } } @@ -72,6 +73,9 @@ func (s *session) afterLoad() { if opts.aname != s.aname { panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname)) } + + // Check if endpointMaps exist when uds sockets are enabled + // (only pathmap will actualy have been saved). if opts.privateunixsocket != (s.endpoints != nil) { panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil)) } @@ -96,4 +100,16 @@ func (s *session) afterLoad() { if err != nil { panic(fmt.Sprintf("failed to attach to aname: %v", err)) } + + // If private unix sockets are enabled, create and fill the session's endpoint + // maps. + if opts.privateunixsocket { + // TODO: Context is not plumbed to save/restore. 
+ ctx := &dummyClockContext{context.Background()} + + if err = s.restoreEndpointMaps(ctx); err != nil { + panic("failed to restore endpoint maps: " + err.Error()) + } + } + } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 406756f5f..8628b9c69 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -30,6 +30,8 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.Bound } if i.session().endpoints != nil { + unlock := i.session().endpoints.lock() + defer unlock() ep := i.session().endpoints.get(i.fileState.key) if ep != nil { return ep diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 66c17debb..e7254fa7d 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -310,8 +310,8 @@ func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldNa } // Bind implements fs.InodeOperations.Bind. -func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { - return syserror.EOPNOTSUPP +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { + return nil, syserror.EOPNOTSUPP } // BoundEndpoint implements fs.InodeOperations.BoundEndpoint. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d0dbce5dd..db7240dca 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -223,7 +223,7 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, } // Bind calls i.InodeOperations.Bind with i as the directory. -func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) error { +func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) { if i.overlay != nil { return overlayBind(ctx, i.overlay, name, data, perm) } diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index b33980178..952f9704d 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -146,7 +146,7 @@ type InodeOperations interface { // Implementations must ensure that name does not already exist. // // The caller must ensure that this operation is permitted. - Bind(ctx context.Context, dir *Inode, name string, data unix.BoundEndpoint, perm FilePermissions) error + Bind(ctx context.Context, dir *Inode, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) // BoundEndpoint returns the socket endpoint at path stored in // or generated by an Inode. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 53fbd1481..543db9ac7 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -334,13 +334,13 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return nil } -func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.BoundEndpoint, perm FilePermissions) error { +func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() // We do not support doing anything exciting with sockets unless there // is already a directory in the upper filesystem. 
if o.upper == nil { - return syserror.EOPNOTSUPP + return nil, syserror.EOPNOTSUPP } return o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 04432f28c..d8333194b 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -314,17 +314,22 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p } // Bind implements fs.InodeOperations.Bind. -func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perms fs.FilePermissions) error { +func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { - return ErrDenied + return nil, ErrDenied } - _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewBoundEndpoint(ctx, dir, ep, perms) }) if err == syscall.EEXIST { - return syscall.EADDRINUSE + return nil, syscall.EADDRINUSE } - return err + if err != nil { + return nil, err + } + // Take another ref on inode which will be donated to the new dirent. + inode.IncRef() + return fs.NewDirent(inode, name), nil } // CreateFifo implements fs.InodeOperations.CreateFifo. diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 13e72e775..1028b5f1d 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -279,8 +279,8 @@ func (*Entry) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermiss } // Bind is not supported by default. -func (*Entry) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { - return ErrInvalidOp +func (*Entry) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { + return nil, ErrInvalidOp } // CreateFifo implements fs.InodeOperations.CreateFifo. CreateFifo is not supported by diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index c91091db4..c6f39fce3 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -215,8 +215,8 @@ func (d *dirInodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, } // Bind implements fs.InodeOperations.Bind. -func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { - return syserror.EPERM +func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { + return nil, syserror.EPERM } // GetFile implements fs.InodeOperations.GetFile. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 5b6411f97..1c22e78b3 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -276,9 +276,11 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Create the socket. 
- if err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}); err != nil { + childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) + if err != nil { return tcpip.ErrPortInUse } + childDir.DecRef() } return nil diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 7ec68f573..d4c650892 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,6 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") + +go_binary( + name = "uds_test_app", + srcs = [ + "uds_test_app.go", + ], +) go_library( name = "container", @@ -29,6 +36,7 @@ go_test( size = "medium", srcs = ["container_test.go"], data = [ + ":uds_test_app", "//runsc", ], tags = [ diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9e38f5f77..11edcd615 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -15,6 +15,7 @@ package container_test import ( + "bytes" "fmt" "io" "io/ioutil" @@ -108,7 +109,8 @@ func procListToString(pl []*control.Process) string { return fmt.Sprintf("[%s]", strings.Join(strs, ",")) } -// createWriteableOutputFile creates an output file that can be read and written to in the sandbox. +// createWriteableOutputFile creates an output file that can be read and +// written to in the sandbox. func createWriteableOutputFile(path string) (*os.File, error) { outputFile, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) if err != nil { @@ -136,13 +138,19 @@ func waitForFile(f *os.File) error { return testutil.Poll(op, 5*time.Second) } -func readOutputNum(f *os.File, first bool) (int, error) { - // Wait until file has contents. +// readOutputNum reads a file at given filepath and returns the int at the +// requested position. +func readOutputNum(file string, position int) (int, error) { + f, err := os.Open(file) + if err != nil { + return 0, fmt.Errorf("error opening file: %q, %v", file, err) + } + + // Ensure that there is content in output file. if err := waitForFile(f); err != nil { - return 0, err + return 0, fmt.Errorf("error waiting for output file: %v", err) } - // Read the first number in the new file b, err := ioutil.ReadAll(f) if err != nil { return 0, fmt.Errorf("error reading file: %v", err) @@ -151,14 +159,18 @@ func readOutputNum(f *os.File, first bool) (int, error) { return 0, fmt.Errorf("error no content was read") } + // Strip leading null bytes caused by file offset not being 0 upon restore. + b = bytes.Trim(b, "\x00") nums := strings.Split(string(b), "\n") - var num int - if first { - num, err = strconv.Atoi(nums[0]) - } else { - num, err = strconv.Atoi(nums[len(nums)-2]) + if position >= len(nums) { + return 0, fmt.Errorf("position %v is not within the length of content %v", position, nums) + } + if position == -1 { + // Expectation of newline at the end of last position. + position = len(nums) - 2 } + num, err := strconv.Atoi(nums[position]) if err != nil { return 0, fmt.Errorf("error getting number from file: %v", err) } @@ -194,6 +206,27 @@ func run(spec *specs.Spec, conf *boot.Config) error { return nil } +// findUDSApp finds the uds_test_app binary to be used in the UnixDomainSocket test. +func findUDSApp() (string, error) { + // TODO: Use bazel FindBinary function. 
+ + // uds_test_app is in a directory like: + // './linux_amd64_pure_stripped/uds_test_app.go'. + // + // Since I don't want to construct 'linux_amd64_pure_stripped' based on the + // build type, do a quick search for: './*/uds_test_app' + // Note: This glob will only succeed when file is one directory deep. + matches, err := filepath.Glob("./*/uds_test_app") + if err != nil { + return "", fmt.Errorf("error globbing: %v", err) + } + if i := len(matches); i != 1 { + return "", fmt.Errorf("error identifying uds_test_app from matches: got %d matches", i) + } + + return matches[0], nil +} + type configOptions int const ( @@ -204,7 +237,8 @@ const all = overlay | kvm // configs generates different configurations to run tests. func configs(opts configOptions) []*boot.Config { - cs := []*boot.Config{testutil.TestConfig()} + cs := []*boot.Config{testutil.TestConfig(), testutil.TestConfig()} + return cs if opts&overlay != 0 { c := testutil.TestConfig() @@ -544,6 +578,7 @@ func TestCheckpointRestore(t *testing.T) { if err := os.Chmod(dir, 0777); err != nil { t.Fatalf("error chmoding file: %q, %v", dir, err) } + defer os.RemoveAll(dir) outputPath := filepath.Join(dir, "output") outputFile, err := createWriteableOutputFile(outputPath) @@ -598,7 +633,7 @@ func TestCheckpointRestore(t *testing.T) { } defer os.RemoveAll(imagePath) - lastNum, err := readOutputNum(outputFile, false) + lastNum, err := readOutputNum(outputPath, -1) if err != nil { t.Fatalf("error with outputFile: %v", err) } @@ -624,15 +659,22 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("error restoring container: %v", err) } - firstNum, err := readOutputNum(outputFile2, true) + // Wait until application has ran. + if err := waitForFile(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + firstNum, err := readOutputNum(outputPath, 0) if err != nil { t.Fatalf("error with outputFile: %v", err) } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. if lastNum+1 != firstNum { t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) } + cont2.Destroy() // Restore into another container! // Delete and recreate file before restoring. @@ -656,15 +698,169 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("error restoring container: %v", err) } - firstNum2, err := readOutputNum(outputFile3, true) + // Wait until application has ran. + if err := waitForFile(outputFile3); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + firstNum2, err := readOutputNum(outputPath, 0) if err != nil { t.Fatalf("error with outputFile: %v", err) } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. if lastNum+1 != firstNum2 { t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) } + cont3.Destroy() + } +} + +// TestUnixDomainSockets checks that Checkpoint/Restore works in cases +// with filesystem Unix Domain Socket use. +func TestUnixDomainSockets(t *testing.T) { + const ( + output = "uds_output" + goferRoot = "/tmp2" + socket = "uds_socket" + ) + + // Skip overlay because test requires writing to host file. 
+ for _, conf := range configs(kvm) { + t.Logf("Running test with conf: %+v", conf) + + dir, err := ioutil.TempDir("", "uds-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + if err := os.Chmod(dir, 0777); err != nil { + t.Fatalf("error chmoding file: %q, %v", dir, err) + } + defer os.RemoveAll(dir) + + outputPath := filepath.Join(dir, output) + + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() + + // Get file path for corresponding output file in sandbox. + outputFileSandbox := filepath.Join(goferRoot, output) + + // Need to get working directory, even though not intuitive. + wd, _ := os.Getwd() + localPath, err := findUDSApp() + if err != nil { + t.Fatalf("error finding localPath: %v", err) + } + app := filepath.Join(wd, localPath) + + if _, err = os.Stat(app); err != nil { + t.Fatalf("error finding the uds_test_app: %v", err) + } + + socketPath := filepath.Join(dir, socket) + socketPathSandbox := filepath.Join(goferRoot, socket) + defer os.Remove(socketPath) + + spec := testutil.NewSpecWithArgs(app, "--file", outputFileSandbox, + "--socket", socketPathSandbox) + + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: goferRoot, + Source: dir, + }) + + spec.Process.User = specs.User{ + UID: uint32(os.Getuid()), + GID: uint32(os.Getgid()), + } + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Set the image path, the location where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") + + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() + defer os.RemoveAll(imagePath) + + // Wait until application has ran. + if err := waitForFile(outputFile); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + + // Read last number outputted before checkpoint. + lastNum, err := readOutputNum(outputPath, -1) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() + + // Restore into a new container. + contRestore, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer contRestore.Destroy() + + if err := contRestore.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } + + // Wait until application has ran. 
+ if err := waitForFile(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + // Read first number outputted after restore. + firstNum, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Check that lastNum is one less than firstNum. + if lastNum+1 != firstNum { + t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum) + } + contRestore.Destroy() } } diff --git a/runsc/container/uds_test_app.go b/runsc/container/uds_test_app.go new file mode 100644 index 000000000..bef98ac66 --- /dev/null +++ b/runsc/container/uds_test_app.go @@ -0,0 +1,83 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary uds-test-app opens a socket and reads a series of numbers +// which are then written to an output file. +package main + +import ( + "flag" + "fmt" + "log" + "net" + "os" + "strconv" + "time" +) + +var ( + fileName = flag.String("file", "", "name of output file") + socketPath = flag.String("socket", "", "path to socket") +) + +func server(listener net.Listener, f *os.File) { + buf := make([]byte, 16) + + for { + c, err := listener.Accept() + if err != nil { + log.Fatal("error accepting connection:", err) + } + nr, err := c.Read(buf) + if err != nil { + log.Fatal("error reading from buf:", err) + } + data := buf[0:nr] + fmt.Fprintf(f, string(data)+"\n") + } +} + +func main() { + flag.Parse() + if *fileName == "" || *socketPath == "" { + log.Fatalf("Flags cannot be empty, given: fileName=%s, socketPath=%s", *fileName, *socketPath) + } + outputFile, err := os.OpenFile(*fileName, os.O_WRONLY|os.O_CREATE, 0666) + if err != nil { + log.Fatal("error opening output file:", err) + } + + socket := *socketPath + defer os.Remove(socket) + + listener, err := net.Listen("unix", socket) + if err != nil { + log.Fatal("error listening on socket:", err) + } + + go server(listener, outputFile) + for i := 0; ; i++ { + + conn, err := net.Dial("unix", socket) + if err != nil { + log.Fatal("error dialing:", err) + } + if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { + log.Fatal("error writing:", err) + } + conn.Close() + time.Sleep(100 * time.Millisecond) + } + +} -- cgit v1.2.3 From f213a5e0fdb5314dd128363cc4e27cee825e80e6 Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Fri, 10 Aug 2018 15:07:20 -0700 Subject: README for Checkpoint/Restore. PiperOrigin-RevId: 208274833 Change-Id: Iddda875a87205f7b8fa6f5c60b547522b94a6696 --- runsc/checkpoint_restore.md | 108 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 runsc/checkpoint_restore.md (limited to 'runsc') diff --git a/runsc/checkpoint_restore.md b/runsc/checkpoint_restore.md new file mode 100644 index 000000000..5fa3280a8 --- /dev/null +++ b/runsc/checkpoint_restore.md @@ -0,0 +1,108 @@ +# runsc checkpoint/restore + +gVisor supports checkpointing and restoring containers. 
A container’s state can +be checkpointed and later restored into one or more containers. This can be used +to save work and time in cases of failure and allow for container migration. A +single container can perform slower setup tasks and then be checkpointed so that +many containers with the same task can be “restored” and started more quickly. + +### How to checkpoint/restore + +To use the runsc checkpoint command, first run a container. + +```sh +runsc run +``` + +To checkpoint the container, the --image-path flag must be provided. This is the +directory path within which the checkpoint state-file will be created. The file +will be called checkpoint.img and necessary directories will be created if they +do not yet exist. + +> Note: Two checkpoints cannot be saved to the save directory; every image-path +provided must be unique. + +```sh +runsc checkpoint --image-path= +``` + +There is also an optional --leave-running flag that allows the container to +continue to run after the checkpoint has been made. (By default, containers stop +their processes after committing a checkpoint.) + +> Note: All top-level runsc flags needed when calling run must be provided to +checkpoint if --leave-running is used. + +> Note: --leave-running functions by causing an immediate restore so the +container, although will maintain its given container id, may have a different +process id. + +```sh +runsc checkpoint --image-path= --leave-running +``` + +To restore, provide the image path to the checkpoint.img file created during the +checkpoint. Because containers stop by default after checkpointing, restore +needs to happen in a new container (restore is a command which parallels start). + +```sh +runsc create + +runsc restore --image-path= +``` + +### How to use checkpoint/restore in Docker: + +Currently checkpoint/restore through runsc is not entirely compatible with +Docker, although there has been progress made from both gVisor and Docker to +enable compatibility. Here, we document the ideal workflow. + +To run with Docker, first follow the [instructions](https://gvisor.googlesource.com/gvisor/+/master/README.md#configuring-docker) to use runsc as a runtime. + +Run a container: + +```sh +docker run [options] --runtime=runsc ` +``` + +Checkpoint a container: + +```sh +docker checkpoint create ` +``` + +Create a new container into which to restore: + +```sh +docker create [options] --runtime=runsc +``` + +Restore a container: + +```sh +docker start --checkpoint --checkpoint-dir= +``` + +**Issues Preventing Compatibility with Docker** +1. [Moby #37360][leave-running] + +Docker version 18.03.0-ce and earlier hangs when checkpointing and +does not create the checkpoint. To successfully use this feature, install a +custom version of docker-ce from the moby repository. This issue is caused by an +improper implementation of the `--leave-running` flag. This issue is now fixed +although is not yet part of an official release. + +2. Docker does not support restoration into new containers. + +Docker currently expects the container which created the checkpoint +to be the same container used to restore which is not possible in runsc. When +Docker supports container migration and therefore restoration into new +containers, this will be the flow. + +3. [Moby #37344][checkpoint-dir] + +Docker does not currently support the `--checkpoint-dir` flag but this will be +required when restoring from a checkpoint made in another container. 
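For completeness, the checkpoint-into-a-new-container flow can also be scripted rather than typed. The sketch below simply shells out to the runsc commands documented above; the container IDs and image directory are placeholders, and it assumes runsc is on PATH, a container named "demo" is already running, and, like the bare runsc create invocation above, that an appropriate OCI bundle is in place for the new container.

```go
package main

import (
	"log"
	"os"
	"os/exec"
)

// run invokes one runsc command, inheriting stdio, and aborts on failure.
func run(args ...string) {
	cmd := exec.Command("runsc", args...)
	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
	if err := cmd.Run(); err != nil {
		log.Fatalf("runsc %v: %v", args, err)
	}
}

func main() {
	const (
		oldID    = "demo"         // placeholder: ID of the running container
		newID    = "demo-restore" // placeholder: ID of the restore target
		imageDir = "/tmp/ckpt"    // placeholder: --image-path directory
	)

	// Checkpoint; runsc writes checkpoint.img under imageDir.
	run("checkpoint", "--image-path="+imageDir, oldID)

	// Restore must go into a new container: create it, then restore.
	run("create", newID)
	run("restore", "--image-path="+imageDir, newID)
}
```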
+ +[leave-running]: https://github.com/moby/moby/pull/37360 +[checkpoint-dir]: https://github.com/moby/moby/issues/37344 -- cgit v1.2.3 From 36c940b093af58d02eb6e7fd186f14cce84a8dd9 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 10 Aug 2018 15:56:31 -0700 Subject: Move checkpoint/restore readme to g3doc directory. PiperOrigin-RevId: 208282383 Change-Id: Ifa4aaf5d925b17d9a0672ea951a4570d35855300 --- README.md | 3 +- g3doc/checkpoint_restore.md | 108 ++++++++++++++++++++++++++++++++++++++++++++ runsc/checkpoint_restore.md | 108 -------------------------------------------- 3 files changed, 110 insertions(+), 109 deletions(-) create mode 100644 g3doc/checkpoint_restore.md delete mode 100644 runsc/checkpoint_restore.md (limited to 'runsc') diff --git a/README.md b/README.md index 01c898086..709da37a0 100644 --- a/README.md +++ b/README.md @@ -365,7 +365,7 @@ Then restart the Docker daemon. gVisor has the ability to checkpoint a process, save its current state in a state file, and restore into a new container using the state file. For more information about the checkpoint and restore commands, see the -[checkpoint/restore readme](https://gvisor.googlesource.com/gvisor/+/master/runsc/checkpoint_restore.md). +[checkpoint-restore][checkpoint/restore readme] ## FAQ & Known Issues @@ -442,6 +442,7 @@ See [Contributing.md](CONTRIBUTING.md). [apparmor]: https://wiki.ubuntu.com/AppArmor [bazel]: https://bazel.build [bug]: https://github.com/google/gvisor/issues +[checkpoint-restore]: https://gvisor.googlesource.com/gvisor/+/master/g3doc/checkpoint_restore.md [cri-o-k8s]: https://github.com/kubernetes-incubator/cri-o/blob/master/kubernetes.md [cri-o]: https://github.com/kubernetes-incubator/cri-o [docker-storage-driver]: https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-storage-driver diff --git a/g3doc/checkpoint_restore.md b/g3doc/checkpoint_restore.md new file mode 100644 index 000000000..5fa3280a8 --- /dev/null +++ b/g3doc/checkpoint_restore.md @@ -0,0 +1,108 @@ +# runsc checkpoint/restore + +gVisor supports checkpointing and restoring containers. A container’s state can +be checkpointed and later restored into one or more containers. This can be used +to save work and time in cases of failure and allow for container migration. A +single container can perform slower setup tasks and then be checkpointed so that +many containers with the same task can be “restored” and started more quickly. + +### How to checkpoint/restore + +To use the runsc checkpoint command, first run a container. + +```sh +runsc run +``` + +To checkpoint the container, the --image-path flag must be provided. This is the +directory path within which the checkpoint state-file will be created. The file +will be called checkpoint.img and necessary directories will be created if they +do not yet exist. + +> Note: Two checkpoints cannot be saved to the save directory; every image-path +provided must be unique. + +```sh +runsc checkpoint --image-path= +``` + +There is also an optional --leave-running flag that allows the container to +continue to run after the checkpoint has been made. (By default, containers stop +their processes after committing a checkpoint.) + +> Note: All top-level runsc flags needed when calling run must be provided to +checkpoint if --leave-running is used. + +> Note: --leave-running functions by causing an immediate restore so the +container, although will maintain its given container id, may have a different +process id. 
+ +```sh +runsc checkpoint --image-path= --leave-running +``` + +To restore, provide the image path to the checkpoint.img file created during the +checkpoint. Because containers stop by default after checkpointing, restore +needs to happen in a new container (restore is a command which parallels start). + +```sh +runsc create + +runsc restore --image-path= +``` + +### How to use checkpoint/restore in Docker: + +Currently checkpoint/restore through runsc is not entirely compatible with +Docker, although there has been progress made from both gVisor and Docker to +enable compatibility. Here, we document the ideal workflow. + +To run with Docker, first follow the [instructions](https://gvisor.googlesource.com/gvisor/+/master/README.md#configuring-docker) to use runsc as a runtime. + +Run a container: + +```sh +docker run [options] --runtime=runsc ` +``` + +Checkpoint a container: + +```sh +docker checkpoint create ` +``` + +Create a new container into which to restore: + +```sh +docker create [options] --runtime=runsc +``` + +Restore a container: + +```sh +docker start --checkpoint --checkpoint-dir= +``` + +**Issues Preventing Compatibility with Docker** +1. [Moby #37360][leave-running] + +Docker version 18.03.0-ce and earlier hangs when checkpointing and +does not create the checkpoint. To successfully use this feature, install a +custom version of docker-ce from the moby repository. This issue is caused by an +improper implementation of the `--leave-running` flag. This issue is now fixed +although is not yet part of an official release. + +2. Docker does not support restoration into new containers. + +Docker currently expects the container which created the checkpoint +to be the same container used to restore which is not possible in runsc. When +Docker supports container migration and therefore restoration into new +containers, this will be the flow. + +3. [Moby #37344][checkpoint-dir] + +Docker does not currently support the `--checkpoint-dir` flag but this will be +required when restoring from a checkpoint made in another container. + +[leave-running]: https://github.com/moby/moby/pull/37360 +[checkpoint-dir]: https://github.com/moby/moby/issues/37344 diff --git a/runsc/checkpoint_restore.md b/runsc/checkpoint_restore.md deleted file mode 100644 index 5fa3280a8..000000000 --- a/runsc/checkpoint_restore.md +++ /dev/null @@ -1,108 +0,0 @@ -# runsc checkpoint/restore - -gVisor supports checkpointing and restoring containers. A container’s state can -be checkpointed and later restored into one or more containers. This can be used -to save work and time in cases of failure and allow for container migration. A -single container can perform slower setup tasks and then be checkpointed so that -many containers with the same task can be “restored” and started more quickly. - -### How to checkpoint/restore - -To use the runsc checkpoint command, first run a container. - -```sh -runsc run -``` - -To checkpoint the container, the --image-path flag must be provided. This is the -directory path within which the checkpoint state-file will be created. The file -will be called checkpoint.img and necessary directories will be created if they -do not yet exist. - -> Note: Two checkpoints cannot be saved to the save directory; every image-path -provided must be unique. - -```sh -runsc checkpoint --image-path= -``` - -There is also an optional --leave-running flag that allows the container to -continue to run after the checkpoint has been made. 
(By default, containers stop -their processes after committing a checkpoint.) - -> Note: All top-level runsc flags needed when calling run must be provided to -checkpoint if --leave-running is used. - -> Note: --leave-running functions by causing an immediate restore so the -container, although will maintain its given container id, may have a different -process id. - -```sh -runsc checkpoint --image-path= --leave-running -``` - -To restore, provide the image path to the checkpoint.img file created during the -checkpoint. Because containers stop by default after checkpointing, restore -needs to happen in a new container (restore is a command which parallels start). - -```sh -runsc create - -runsc restore --image-path= -``` - -### How to use checkpoint/restore in Docker: - -Currently checkpoint/restore through runsc is not entirely compatible with -Docker, although there has been progress made from both gVisor and Docker to -enable compatibility. Here, we document the ideal workflow. - -To run with Docker, first follow the [instructions](https://gvisor.googlesource.com/gvisor/+/master/README.md#configuring-docker) to use runsc as a runtime. - -Run a container: - -```sh -docker run [options] --runtime=runsc ` -``` - -Checkpoint a container: - -```sh -docker checkpoint create ` -``` - -Create a new container into which to restore: - -```sh -docker create [options] --runtime=runsc -``` - -Restore a container: - -```sh -docker start --checkpoint --checkpoint-dir= -``` - -**Issues Preventing Compatibility with Docker** -1. [Moby #37360][leave-running] - -Docker version 18.03.0-ce and earlier hangs when checkpointing and -does not create the checkpoint. To successfully use this feature, install a -custom version of docker-ce from the moby repository. This issue is caused by an -improper implementation of the `--leave-running` flag. This issue is now fixed -although is not yet part of an official release. - -2. Docker does not support restoration into new containers. - -Docker currently expects the container which created the checkpoint -to be the same container used to restore which is not possible in runsc. When -Docker supports container migration and therefore restoration into new -containers, this will be the flow. - -3. [Moby #37344][checkpoint-dir] - -Docker does not currently support the `--checkpoint-dir` flag but this will be -required when restoring from a checkpoint made in another container. - -[leave-running]: https://github.com/moby/moby/pull/37360 -[checkpoint-dir]: https://github.com/moby/moby/issues/37344 -- cgit v1.2.3 From e8a4f2e133c3a7fb4a2dceb6675ebc57ea4f7350 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 14 Aug 2018 16:24:46 -0700 Subject: runsc: Change cache policy for root fs and volume mounts. Previously, gofer filesystems were configured with the default "fscache" policy, which caches filesystem metadata and contents aggressively. While this setting is best for performance, it means that changes from inside the sandbox may not be immediately propagated outside the sandbox, and vice-versa. This CL changes volumes and the root fs configuration to use a new "remote-revalidate" cache policy which tries to retain as much caching as possible while still making fs changes visible across the sandbox boundary. This cache policy is enabled by default for the root filesystem. The default value for the "--file-access" flag is still "proxy", but the behavior is changed to use the new cache policy. 
A new value for the "--file-access" flag is added, called "proxy-exclusive", which turns on the previous aggressive caching behavior. As the name implies, this flag should be used when the sandbox has "exclusive" access to the filesystem. All volume mounts are configured to use the new cache policy, since it is safest and most likely to be correct. There is not currently a way to change this behavior, but it's possible to add such a mechanism in the future. The configurability is a smaller issue for volumes, since most of the expensive application fs operations (walking + stating files) will likely served by the root fs. PiperOrigin-RevId: 208735037 Change-Id: Ife048fab1948205f6665df8563434dbc6ca8cfc9 --- pkg/sentry/fs/gofer/cache_policy.go | 7 +- runsc/boot/config.go | 9 ++ runsc/boot/fs.go | 62 +++++++---- runsc/boot/loader_test.go | 36 +++---- runsc/container/container_test.go | 207 +++++++++++++++++++++++++++++++----- runsc/main.go | 6 +- runsc/sandbox/sandbox.go | 8 +- runsc/test/testutil/testutil.go | 1 + 8 files changed, 261 insertions(+), 75 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 52d97b54f..fa8abf51c 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -35,10 +35,9 @@ const ( // fs agent immediately. cacheAllWritethrough - // Use virtual file system cache for everything, but reload dirents - // from the remote filesystem on each lookup. Thus, if the remote - // filesystem has changed, the returned dirent will have the updated - // state. + // Use the (host) page cache for reads/writes, but don't cache anything + // else. This allows the sandbox filesystem to stay in sync with any + // changes to the remote filesystem. // // This policy should *only* be used with remote filesystems that // donate their host FDs to the sandbox and thus use the host page diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 074cd6a63..6c69a7c38 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -64,6 +64,11 @@ const ( // requests and forwards them to the host. FileAccessProxy FileAccessType = iota + // FileAccessProxyExclusive is the same as FileAccessProxy, but enables + // extra caching for improved performance. It should only be used if + // the sandbox has exclusive access to the filesystem. + FileAccessProxyExclusive + // FileAccessDirect connects the sandbox directly to the host filesystem. FileAccessDirect ) @@ -73,6 +78,8 @@ func MakeFileAccessType(s string) (FileAccessType, error) { switch s { case "proxy": return FileAccessProxy, nil + case "proxy-exclusive": + return FileAccessProxyExclusive, nil case "direct": return FileAccessDirect, nil default: @@ -84,6 +91,8 @@ func (f FileAccessType) String() string { switch f { case FileAccessProxy: return "proxy" + case FileAccessProxyExclusive: + return "proxy-exclusive" case FileAccessDirect: return "direct" default: diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e596c739f..eea2ec1f5 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "path/filepath" + "strconv" "strings" // Include filesystem types that OCI spec might mount. 
@@ -54,6 +55,9 @@ type fdDispenser struct { } func (f *fdDispenser) remove() int { + if f.empty() { + panic("fdDispenser out of fds") + } rv := f.fds[0] f.fds = f.fds[1:] return rv @@ -160,8 +164,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // setMounts iterates over mounts and mounts them in the specified // mount namespace. func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { - - // Mount all submounts from mounts. for _, m := range mounts { if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { return err @@ -181,11 +183,12 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f ) switch conf.FileAccess { - case FileAccessProxy: + case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) hostFS := mustFindFilesystem("9p") - rootInode, err = hostFS.Mount(ctx, rootDevice, mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) + opts := p9MountOptions(conf, fd) + rootInode, err = hostFS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } @@ -242,13 +245,16 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) } -// getMountNameAndOptions retrieves the fsName, data, and useOverlay values +// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) { - var fsName string - var data []string - var useOverlay bool - var err error + var ( + fsName string + opts []string + useOverlay bool + err error + ) + switch m.Type { case "devpts", "devtmpfs", "proc", "sysfs": fsName = m.Type @@ -258,17 +264,17 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri fsName = m.Type // tmpfs has some extra supported options that we must pass through. - data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") case "bind": switch conf.FileAccess { - case FileAccessProxy: + case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() fsName = "9p" - data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"} + opts = p9MountOptions(conf, fd) case FileAccessDirect: fsName = "whitelistfs" - data = []string{"root=" + m.Source, "dont_translate_ownership=true"} + opts = []string{"root=" + m.Source, "dont_translate_ownership=true"} default: err = fmt.Errorf("invalid file access type: %v", conf.FileAccess) } @@ -282,13 +288,13 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri // we do not support. log.Warningf("ignoring unknown filesystem type %q", m.Type) } - return fsName, data, useOverlay, err + return fsName, opts, useOverlay, err } func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. - fsName, data, useOverlay, err := getMountNameAndOptions(conf, m, fds) + fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) // Return the error or nil that corresponds to the default case in getMountNameAndOptions. 
if err != nil { @@ -307,7 +313,7 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(data, ",")) + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ",")) if err != nil { return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) } @@ -387,6 +393,20 @@ func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { return nil } +// p9MountOptions creates a slice of options for a p9 mount. +func p9MountOptions(conf *Config, fd int) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + "privateunixsocket=true", + } + if conf.FileAccess == FileAccessProxy { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} + // parseAndFilterOptions parses a MountOptions slice and filters by the allowed // keys. func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { @@ -436,8 +456,7 @@ func mountDevice(m specs.Mount) string { // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { - fsName, data, _, err := getMountNameAndOptions(conf, m, fds) - dataString := strings.Join(data, ",") + fsName, opts, _, err := getMountNameAndOptions(conf, m, fds) // Return the error or nil that corresponds to the default case in getMountNameAndOptions. if err != nil { @@ -452,7 +471,7 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f newMount := fs.MountArgs{ Dev: mountDevice(m), Flags: mountFlags(m.Options), - Data: dataString, + Data: strings.Join(opts, ","), } renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) log.Infof("Added mount at %q: %+v", fsName, newMount) @@ -473,7 +492,8 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) // Add root mount. 
fd := fds.remove() - dataString := strings.Join([]string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}, ",") + opts := p9MountOptions(conf, fd) + mf := fs.MountSourceFlags{} if spec.Root.Readonly { mf.ReadOnly = true @@ -482,7 +502,7 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) rootMount := fs.MountArgs{ Dev: rootDevice, Flags: mf, - Data: dataString, + Data: strings.Join(opts, ","), } renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 7ea2e1ee5..f2f690b5d 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -398,7 +398,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -458,11 +458,11 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, { Dev: "9pfs-/dev/fd-foo", - Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true", + Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -522,7 +522,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -606,21 +606,21 @@ func TestRestoreEnvironment(t *testing.T) { errorExpected: true, }, } - for _, tc := range testCases { - fds := &fdDispenser{fds: tc.ioFDs} - - actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) - if !tc.errorExpected && err != nil { - t.Fatalf("could not create restore environment for test:%s", tc.name) - } else if tc.errorExpected { - if err == nil { - t.Fatalf("expected an error, but no error occurred.") + t.Run(tc.name, func(t *testing.T) { + fds := &fdDispenser{fds: tc.ioFDs} + actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) + if !tc.errorExpected && err != nil { + t.Fatalf("could not create restore environment for test:%s", tc.name) + } else if tc.errorExpected { + if err == nil { + t.Errorf("expected an error, but no error occurred.") + } + } else { + if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { + t.Errorf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv) + } } - } else { - if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { - t.Fatalf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv) - } - } + }) } } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 11edcd615..33c53e189 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -227,33 +227,43 @@ func findUDSApp() (string, error) { return matches[0], nil } -type configOptions int +type configOption int const ( - overlay configOptions = 1 << iota + overlay configOption = iota kvm + nonExclusiveFS ) -const all = overlay | kvm + +var all = []configOption{overlay, kvm, nonExclusiveFS} // configs generates different configurations to run tests. 
-func configs(opts configOptions) []*boot.Config { - cs := []*boot.Config{testutil.TestConfig(), testutil.TestConfig()} - return cs +func configs(opts ...configOption) []*boot.Config { + // Always load the default config. + cs := []*boot.Config{testutil.TestConfig()} - if opts&overlay != 0 { + for _, o := range opts { c := testutil.TestConfig() - c.Overlay = true + switch o { + case overlay: + c.Overlay = true + case kvm: + // TODO: KVM tests are flaky. Disable until fixed. + continue + + // TODO: KVM doesn't work with --race. + if testutil.RaceEnabled { + continue + } + c.Platform = boot.PlatformKVM + case nonExclusiveFS: + c.FileAccess = boot.FileAccessProxy + default: + panic(fmt.Sprintf("unknown config option %v", o)) + + } cs = append(cs, c) } - - // TODO: KVM tests are flaky. Disable until fixed. - // // TODO: KVM doesn't work with --race. - // if !testutil.RaceEnabled && opts&kvm != 0 { - // c := testutil.TestConfig() - // c.Platform = boot.PlatformKVM - // cs = append(cs, c) - // } - return cs } @@ -261,7 +271,7 @@ func configs(opts configOptions) []*boot.Config { // It verifies after each step that the container can be loaded from disk, and // has the correct status. func TestLifecycle(t *testing.T) { - for _, conf := range configs(all) { + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) // The container will just sleep for a long time. We will kill it before // it finishes sleeping. @@ -1049,10 +1059,11 @@ func TestPauseResumeStatus(t *testing.T) { // - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips // this check. func TestCapabilities(t *testing.T) { - const uid = 343 - const gid = 2401 + // Pick uid/gid different than ours. + uid := auth.KUID(os.Getuid() + 1) + gid := auth.KGID(os.Getgid() + 1) - for _, conf := range configs(all) { + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) spec := testutil.NewSpecWithArgs("sleep", "100") @@ -1142,7 +1153,7 @@ func TestCapabilities(t *testing.T) { // Test that an tty FD is sent over the console socket if one is provided. func TestConsoleSocket(t *testing.T) { - for _, conf := range configs(all) { + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) spec := testutil.NewSpecWithArgs("true") rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) @@ -1303,8 +1314,6 @@ func TestReadonlyRoot(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - conf.Overlay = true - // Create, start and wait for the container. s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { @@ -1348,8 +1357,6 @@ func TestReadonlyMount(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - conf.Overlay = true - // Create, start and wait for the container. s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { @@ -1430,7 +1437,7 @@ func TestAbbreviatedIDs(t *testing.T) { // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { - for _, conf := range configs(all) { + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) containerIDs := []string{ @@ -1619,3 +1626,149 @@ func TestMultiContainerWait(t *testing.T) { wg.Wait() } + +// Check that modifications to a volume mount are propigated into and out of +// the sandbox. 
+func TestContainerVolumeContentsShared(t *testing.T) { + // Only run this test with shared proxy, since that is the only + // behavior it is testing. + conf := testutil.TestConfig() + conf.FileAccess = boot.FileAccessProxy + t.Logf("Running test with conf: %+v", conf) + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. + spec := testutil.NewSpecWithArgs("sleep", "1000") + + // Mount host temp dir inside the sandbox at '/tmp2'. + hostTmpDir, err := ioutil.TempDir("", "root-fs-test") + sandboxTmpDir := "/tmp2" + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: sandboxTmpDir, + Source: hostTmpDir, + }) + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + hostFilename := filepath.Join(hostTmpDir, "file") + sandboxFilename := filepath.Join(sandboxTmpDir, "file") + + // File does not exist yet. Reading from the sandbox should fail. + execArgsTestFile := control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", sandboxFilename}, + } + if ws, err := c.Execute(&execArgsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) + } + + // Create the file from outside of the sandbox. + if err := ioutil.WriteFile(hostFilename, []byte("foobar"), 0777); err != nil { + t.Fatalf("error writing to file %q: %v", hostFilename, err) + } + + // Now we should be able to test the file from within the sandbox. + if ws, err := c.Execute(&execArgsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + } + + // Rename the file from outside of the sandbox. + newHostFilename := filepath.Join(hostTmpDir, "newfile") + newSandboxFilename := filepath.Join(sandboxTmpDir, "newfile") + if err := os.Rename(hostFilename, newHostFilename); err != nil { + t.Fatalf("os.Rename(%q, %q) failed: %v", hostFilename, newHostFilename, err) + } + + // File should no longer exist at the old path within the sandbox. + if ws, err := c.Execute(&execArgsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", sandboxFilename, ws.ExitStatus()) + } + + // We should be able to test the new filename from within the sandbox. 
+ execArgsTestNewFile := control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", newSandboxFilename}, + } + if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newSandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", newSandboxFilename, ws.ExitStatus()) + } + + // Delete the renamed file from outside of the sandbox. + if err := os.Remove(newHostFilename); err != nil { + t.Fatalf("error removing file %q: %v", hostFilename, err) + } + + // Renamed file should no longer exist at the old path within the sandbox. + if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newSandboxFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", newSandboxFilename, ws.ExitStatus()) + } + + // Now create the file from WITHIN the sandbox. + execArgsTouch := control.ExecArgs{ + Filename: "/usr/bin/touch", + Argv: []string{"touch", sandboxFilename}, + KUID: auth.KUID(os.Getuid()), + KGID: auth.KGID(os.Getgid()), + } + if ws, err := c.Execute(&execArgsTouch); err != nil { + t.Fatalf("unexpected error touching file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("touch %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(hostFilename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", hostFilename, err) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(hostFilename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", hostFilename, err) + } + + // Delete the file from within the sandbox. + execArgsRemove := control.ExecArgs{ + Filename: "/bin/rm", + Argv: []string{"rm", sandboxFilename}, + } + if ws, err := c.Execute(&execArgsRemove); err != nil { + t.Fatalf("unexpected error removing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("remove %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + } + + // File should not exist outside the sandbox. + if _, err := os.Stat(hostFilename); !os.IsNotExist(err) { + t.Errorf("stat %q got error %v, wanted ErrNotExist", hostFilename, err) + } +} diff --git a/runsc/main.go b/runsc/main.go index 10ae44b5e..b36100cca 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -57,7 +57,7 @@ var ( // Flags that control sandbox runtime behavior. platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. Using a proxy is more secure because it disallows the sandbox from opennig files directly in the host.") + fileAccess = flag.String("file-access", "proxy-exclusive", "specifies which filesystem to use: proxy-exclusive (default), proxy-shared, or direct. Using a proxy is more secure because it disallows the sandbox from opening files directly in the host. Setting 'proxy-shared' will disable caches and should be used if external modifications to the filesystem are expected.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. 
All modifications are stored in memory inside the sandbox.") multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") @@ -119,6 +119,10 @@ func main() { cmd.Fatalf("%v", err) } + if *fileAccess == "proxy" && *overlay { + cmd.Fatalf("overlay flag is incompatible with file-access=proxy") + } + // Create a new Config from the flags. conf := &boot.Config{ RootDir: *rootDir, diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 196949f11..2b043d412 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -233,7 +233,7 @@ func (s *Sandbox) connError(err error) error { } func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) ([]*os.File, error) { - if conf.FileAccess != boot.FileAccessProxy { + if conf.FileAccess == boot.FileAccessDirect { // Don't start a gofer. The sandbox will access host FS directly. return nil, nil } @@ -369,11 +369,11 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) } - if conf.FileAccess == boot.FileAccessProxy { + if conf.FileAccess == boot.FileAccessDirect { + log.Infof("Sandbox will be started in the current mount namespace") + } else { log.Infof("Sandbox will be started in new mount namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) - } else { - log.Infof("Sandbox will be started in the current mount namespace") } // Joins the network namespace if network is enabled. the sandbox talks diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4e7ab3760..d2b39b58c 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -81,6 +81,7 @@ func TestConfig() *boot.Config { Network: boot.NetworkNone, Strace: true, MultiContainer: true, + FileAccess: boot.FileAccessProxyExclusive, } } -- cgit v1.2.3 From 2033f61aae6ff1b3e613d7bb9e9da273791a5176 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 15 Aug 2018 09:33:19 -0700 Subject: runsc: Fix instances of file access "proxy". This file access type is actually called "proxy-shared", but I forgot to update all locations. PiperOrigin-RevId: 208832491 Change-Id: I7848bc4ec2478f86cf2de1dcd1bfb5264c6276de --- runsc/boot/config.go | 4 ++-- runsc/main.go | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 6c69a7c38..bc392deb3 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -76,7 +76,7 @@ const ( // MakeFileAccessType converts type from string. 
func MakeFileAccessType(s string) (FileAccessType, error) { switch s { - case "proxy": + case "proxy-shared": return FileAccessProxy, nil case "proxy-exclusive": return FileAccessProxyExclusive, nil @@ -90,7 +90,7 @@ func MakeFileAccessType(s string) (FileAccessType, error) { func (f FileAccessType) String() string { switch f { case FileAccessProxy: - return "proxy" + return "proxy-shared" case FileAccessProxyExclusive: return "proxy-exclusive" case FileAccessDirect: diff --git a/runsc/main.go b/runsc/main.go index b36100cca..0a2cbca6c 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -109,6 +109,10 @@ func main() { cmd.Fatalf("%v", err) } + if fsAccess == boot.FileAccessProxy && *overlay { + cmd.Fatalf("overlay flag is incompatible with proxy-shared file access") + } + netType, err := boot.MakeNetworkType(*network) if err != nil { cmd.Fatalf("%v", err) @@ -119,10 +123,6 @@ func main() { cmd.Fatalf("%v", err) } - if *fileAccess == "proxy" && *overlay { - cmd.Fatalf("overlay flag is incompatible with file-access=proxy") - } - // Create a new Config from the flags. conf := &boot.Config{ RootDir: *rootDir, -- cgit v1.2.3 From 635b0c45933cd841298b0c21a513a9169e849594 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 15 Aug 2018 16:24:07 -0700 Subject: runsc fsgofer: Support dynamic serving of filesystems. When multiple containers run inside a sentry, each container has its own root filesystem and set of mounts. Containers are also added after sentry boot rather than all configured and known at boot time. The fsgofer needs to be able to serve the root filesystem of each container. Thus, it must be possible to add filesystems after the fsgofer has already started. This change: * Creates a URPC endpoint within the gofer process that listens for requests to serve new content. * Enables the sentry, when starting a new container, to add the new container's filesystem. * Mounts those new filesystems at separate roots within the sentry. PiperOrigin-RevId: 208903248 Change-Id: Ifa91ec9c8caf5f2f0a9eead83c4a57090ce92068 --- pkg/sentry/kernel/kernel.go | 14 ++- pkg/urpc/urpc.go | 10 +++ runsc/boot/controller.go | 19 +++- runsc/boot/fs.go | 159 ++++++++++++++++++++++++--------- runsc/boot/loader.go | 36 ++++---- runsc/cmd/BUILD | 1 - runsc/cmd/gofer.go | 36 +++----- runsc/container/container.go | 7 ++ runsc/fsgofer/BUILD | 4 + runsc/fsgofer/control.go | 203 +++++++++++++++++++++++++++++++++++++++++++ runsc/fsgofer/fsgofer.go | 5 ++ runsc/sandbox/BUILD | 1 + runsc/sandbox/sandbox.go | 139 ++++++++++++++++++++++------- 13 files changed, 515 insertions(+), 119 deletions(-) create mode 100644 runsc/fsgofer/control.go (limited to 'runsc') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 419a1d473..cb43fdcdc 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -504,6 +504,14 @@ type CreateProcessArgs struct { // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace + + // Root optionally contains the dirent that serves as the root for the + // process. If nil, the mount namespace's root is used as the process' + // root. + // + // Anyone setting Root must donate a reference (i.e. increment it) to + // keep it alive until it is decremented by CreateProcess. + Root *fs.Dirent } // NewContext returns a context.Context that represents the task that will be @@ -581,8 +589,12 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { ctx := args.NewContext(k) // Grab the root directory. 
- root := fs.RootFromContext(ctx) + root := args.Root + if root == nil { + root = fs.RootFromContext(ctx) + } defer root.DecRef() + args.Root = nil // Grab the working directory. wd := root // Default. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index af620b704..1ec06dd4c 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -63,6 +63,10 @@ func (r RemoteError) Error() string { // file as a result of an RPC. These are not actually serialized, rather they // are sent via an accompanying SCM_RIGHTS message (plumbed through the unet // package). +// +// When embedding a FilePayload in an argument struct, the argument type _must_ +// be a pointer to the struct rather than the struct type itself. This is +// because the urpc package defines pointer methods on FilePayload. type FilePayload struct { Files []*os.File `json:"-"` } @@ -552,6 +556,12 @@ func (c *Client) Call(method string, arg interface{}, result interface{}) error c.mu.Lock() defer c.mu.Unlock() + // If arg is a FilePayload, not a *FilePayload, files won't actually be + // sent, so error out. + if _, ok := arg.(FilePayload); ok { + return fmt.Errorf("argument is a FilePayload, but should be a *FilePayload") + } + // Are there files to send? var fs []*os.File if fp, ok := arg.(filePayloader); ok { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fc6ea326a..69e88d8e0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -17,6 +17,7 @@ package boot import ( "errors" "fmt" + "path" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" @@ -181,11 +182,15 @@ type StartArgs struct { // CID is the ID of the container to start. CID string + + // FilePayload contains the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload } // Start runs a created container within a sandbox. func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { - log.Debugf("containerManager.Start") + log.Debugf("containerManager.Start: %+v", args) // Validate arguments. if args == nil { @@ -200,8 +205,18 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/ directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. + if path.Clean(args.CID) != args.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) + } + if len(args.FilePayload.Files) != 1 { + return fmt.Errorf("start arguments must contain one file for the container root") + } - tgid, err := cm.l.startContainer(args, cm.l.k) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index eea2ec1f5..8996b1398 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -48,6 +48,19 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" + + // childContainersDir is the directory where child container root + // filesystems are mounted. + childContainersDir = "/__runsc_containers__" + + // Filesystems that runsc supports. 
+ bind = "bind" + devpts = "devpts" + devtmpfs = "devtmpfs" + proc = "proc" + sysfs = "sysfs" + tmpfs = "tmpfs" + nonefs = "none" ) type fdDispenser struct { @@ -70,8 +83,15 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + mounts := compileMounts(spec) + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. + mounts = append(mounts, specs.Mount{ + Type: tmpfs, + Destination: childContainersDir, + }) fds := &fdDispenser{fds: ioFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds) + rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) } @@ -79,7 +99,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - mounts := compileMounts(spec) + if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } @@ -98,12 +118,12 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // Always mount /dev. mounts = append(mounts, specs.Mount{ - Type: "devtmpfs", + Type: devtmpfs, Destination: "/dev", }) mounts = append(mounts, specs.Mount{ - Type: "devpts", + Type: devpts, Destination: "/dev/pts", }) @@ -129,13 +149,13 @@ func compileMounts(spec *specs.Spec) []specs.Mount { var mandatoryMounts []specs.Mount if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "proc", + Type: proc, Destination: "/proc", }) } if !sysMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "sysfs", + Type: sysfs, Destination: "/sys", }) } @@ -149,7 +169,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // that. Until then, the /tmp mount will always appear empty at // container creation. mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "tmpfs", + Type: tmpfs, Destination: "/tmp", }) } @@ -165,7 +185,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // mount namespace. func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { + if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil { return err } } @@ -173,7 +193,7 @@ func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *f } // createRootMount creates the root filesystem. -func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { +func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} @@ -207,7 +227,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f // We need to overlay the root on top of a ramfs with stub directories // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. 
- submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) if err != nil { return nil, fmt.Errorf("error adding submount overlay: %v", err) @@ -256,17 +276,17 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri ) switch m.Type { - case "devpts", "devtmpfs", "proc", "sysfs": + case devpts, devtmpfs, proc, sysfs: fsName = m.Type - case "none": - fsName = "sysfs" - case "tmpfs": + case nonefs: + fsName = sysfs + case tmpfs: fsName = m.Type // tmpfs has some extra supported options that we must pass through. opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - case "bind": + case bind: switch conf.FileAccess { case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() @@ -291,7 +311,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, opts, useOverlay, err } -func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -342,51 +362,52 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd // in the right location, e.g. // mount: /var/run/secrets, may be created in '/run/secrets' if // '/var/run' => '/var'. - if err := mkdirAll(ctx, mns, m.Destination); err != nil { + if err := mkdirAll(ctx, mns, dest); err != nil { return err } root := mns.Root() defer root.DecRef() - dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals) if err != nil { - return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to find mount destination %q: %v", dest, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to mount at destination %q: %v", dest, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type) return nil } func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + log.Infof("mkdirAll called with path %s", path) root := mns.Root() defer root.DecRef() // Starting at the root, walk the path. parent := root ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) - for i := 0; i < len(ps); i++ { - if ps[i] == "" { + for _, pathElem := range ps { + if pathElem == "" { // This will be case for the first and last element, if the path // begins or ends with '/'. Note that we always treat the path as // absolute, regardless of what the first character contains. continue } - d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit) if err == syserror.ENOENT { // If we encounter a path that does not exist, then // create it. 
- if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", pathElem, err) } - if d, err = parent.Walk(ctx, root, ps[i]); err != nil { - return fmt.Errorf("walk to %q failed: %v", ps[i], err) + if d, err = parent.Walk(ctx, root, pathElem); err != nil { + return fmt.Errorf("walk to %q failed: %v", pathElem, err) } } else if err != nil { - return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + return fmt.Errorf("failed to find inode %q: %v", pathElem, err) } parent = d } @@ -444,7 +465,7 @@ func destinations(mounts []specs.Mount, extra ...string) []string { // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { - if m.Type == "bind" { + if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. return "9pfs-" + m.Destination @@ -589,7 +610,7 @@ func subtargets(root string, mnts []specs.Mount) []string { // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel) error { +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If @@ -604,27 +625,79 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // won't need ours either way. procArgs.FDMap = fdm + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + // If this is the root container, we also need to setup the root mount // namespace. - if k.RootMountNamespace() == nil { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - + mns := k.RootMountNamespace() + if mns == nil { // Create the virtual filesystem. mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } - k.SetRootMountNamespace(mns) + return nil + } + + // Setup a child container. + + // Create the container's root filesystem mount. + log.Infof("Creating new process in child container.") + fds := &fdDispenser{fds: append([]int{}, ioFDs...)} + rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + if err != nil { + return fmt.Errorf("error creating filesystem for container: %v", err) + } + + // Make directories for submounts within the container. 
+ rootDir := mns.Root() + defer rootDir.DecRef() + containerRoot := filepath.Join(childContainersDir, cid) + mkdirAll(ctx, mns, containerRoot) + + // Mount the container's root filesystem to the newly created + // mount point. + containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination: %q: %v", containerRoot, err) + } + if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err) + } + containerRootDirent.DecRef() + + // We have to re-walk to the dirent to find the mounted + // directory. The old dirent is invalid at this point. + containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination2: %q: %v", containerRoot, err) + } + log.Infof("Mounted child's root fs to %q", containerRoot) + + // Mount all submounts. + mounts := compileMounts(spec) + for _, m := range mounts { + // TODO: Enable bind mounts in child containers. + if m.Type == bind { + log.Infof("Bind mounts in child containers are not yet supported: %+v", m) + continue + } + dest := filepath.Join(containerRoot, m.Destination) + if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil { + return fmt.Errorf("error mounting filesystem for container: %v", err) + } } + // Set the procArgs root directory. + procArgs.Root = containerRootDirent return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f6c7bf223..7debf0ac2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "math/rand" + "os" "runtime" "sync" "sync/atomic" @@ -229,7 +230,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // Ensure that signals received are forwarded to the emulated kernel. stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + procArgs, err := newProcess(spec, creds, utsns, ipcns, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -250,7 +251,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -277,7 +278,6 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds UTSNamespace: utsns, IPCNamespace: ipcns, } - return procArgs, nil } @@ -356,7 +356,8 @@ func (l *Loader) run() error { l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, - l.k) + l.k, + "" /* CID, which isn't needed for the root container */) if err != nil { return err } @@ -376,8 +377,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. 
-func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.ThreadID, error) { - spec := args.Spec +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -406,26 +406,24 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa // when indicated by the spec. procArgs, err := newProcess( - args.Spec, - args.Conf, - nil, // ioFDs - false, // console + spec, creds, - k.RootUTSNamespace(), - k.RootIPCNamespace(), - k) + l.k.RootUTSNamespace(), + l.k.RootIPCNamespace(), + l.k) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } err = setFileSystemForProcess( &procArgs, - args.Spec, - args.Conf, - nil, + spec, + conf, + []int{int(file.Fd())}, // ioFDs false, creds, procArgs.Limits, - k) + k, + cid) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } @@ -435,7 +433,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa return 0, fmt.Errorf("failed to create process in sentry: %v", err) } - ts := k.TaskSet() + ts := l.k.TaskSet() tgid := ts.Root.IDOfThreadGroup(tg) if tgid == 0 { return 0, errors.New("failed to get thread group ID of new process") @@ -446,7 +444,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGIDs[args.CID] = tgid + l.containerRootTGIDs[cid] = tgid return tgid, nil } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 18e95284b..c45784749 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,7 +36,6 @@ go_library( "//pkg/p9", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", - "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/container", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 55315c0e8..ed4b1d29c 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,7 +16,6 @@ package cmd import ( "os" - "sync" "syscall" "context" @@ -25,7 +24,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -36,6 +34,10 @@ type Gofer struct { bundleDir string ioFDs intFlags applyCaps bool + + // controllerFD is the file descriptor of a stream socket for the + // control server that is donated to this process. + controllerFD int } // Name implements subcommands.Command. @@ -58,11 +60,12 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") + f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") } // Execute implements subcommands.Command. 
func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 || g.controllerFD == -1 { f.Usage() return subcommands.ExitUsageError } @@ -134,29 +137,14 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - runServers(ats, g.ioFDs) - return subcommands.ExitSuccess -} + ctrl, err := fsgofer.NewController(g.controllerFD, g.bundleDir) -func runServers(ats []p9.Attacher, ioFDs []int) { - // Run the loops and wait for all to exit. - var wg sync.WaitGroup - for i, ioFD := range ioFDs { - wg.Add(1) - go func(ioFD int, at p9.Attacher) { - socket, err := unet.NewSocket(ioFD) - if err != nil { - Fatalf("err creating server on FD %d: %v", ioFD, err) - } - s := p9.NewServer(at) - if err := s.Handle(socket); err != nil { - Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err) - } - wg.Done() - }(ioFD, ats[i]) + if err := ctrl.Serve(ats, g.ioFDs); err != nil { + Fatalf("Failed to serve via P9: %v", err) } - wg.Wait() - log.Infof("All 9P servers exited.") + ctrl.Wait() + + return subcommands.ExitSuccess } func isReadonlyMount(opts []string) bool { diff --git a/runsc/container/container.go b/runsc/container/container.go index 574075b00..da2ce0d25 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -249,6 +249,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox + + // Prepare the gofer to serve the container's filesystem. + err = sb.Sandbox.CreateChild(c.ID, bundleDir) + if err != nil { + c.Destroy() + return nil, err + } } c.Status = Created diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 24e172f48..0bc682b5f 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "fsgofer", srcs = [ + "control.go", "fsgofer.go", "fsgofer_unsafe.go", ], @@ -14,9 +15,12 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/control/server", "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/unet", + "//pkg/urpc", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go new file mode 100644 index 000000000..8ce8ee8a0 --- /dev/null +++ b/runsc/fsgofer/control.go @@ -0,0 +1,203 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "fmt" + "path/filepath" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// Controller manages the fsgofer's control server. 
+type Controller struct { + // api holds the control server's URPC endpoints. + api api + + // srv is the control server. + srv *server.Server +} + +// NewController creates a new Controller and starts it listening +func NewController(fd int, rootBundleDir string) (*Controller, error) { + if !filepath.IsAbs(rootBundleDir) { + return nil, fmt.Errorf("NewController should receive an absolute bundle dir path, but got %q", rootBundleDir) + } + + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + cr := &Controller{srv: srv} + cr.api.rootBundleDir = rootBundleDir + cr.api.bundleDirs = make(map[string]string) + srv.Register(&cr.api) + + if err := srv.StartServing(); err != nil { + return nil, err + } + + return cr, nil +} + +// Wait waits for all the p9 servers to finish, then shuts down the control +// server. +func (cr *Controller) Wait() { + cr.api.p9wg.Wait() + cr.srv.Stop() + log.Infof("All 9P servers exited.") +} + +// Serve starts serving each Attacher in ats via its corresponding file +// descriptor in ioFDs. +func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { + if len(ats) != len(ioFDs) { + return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) + } + for i, _ := range ats { + cr.api.serve(ats[i], ioFDs[i]) + } + return nil +} + +// api URPC methods. +const ( + // AddBundleDirs readies the gofer to serve from a new bundle + // directory. It should be called during runsc create. + AddBundleDirs = "api.AddBundleDirs" + + // ServeDirectory serves a new directory via the fsgofer. It should be + // called during runsc start. + ServeDirectory = "api.ServeDirectory" +) + +// API defines and implements the URPC endpoints for the gofer. +type api struct { + // p9wg waits for all the goroutines serving the sentry via p9. When its + // counter is 0, the gofer is out of work and exits. + p9wg sync.WaitGroup + + // bundleDirs maps from container ID to bundle directory for each + // container. + bundleDirs map[string]string + + // rootBundleDir is the bundle directory of the root container. + rootBundleDir string +} + +// AddBundleDirsRequest is the URPC argument to AddBundleDirs. +type AddBundleDirsRequest struct { + // BundleDirs is a map of container IDs to bundle directories to add to + // the gofer. + BundleDirs map[string]string +} + +// AddBundleDirs adds bundle directories for the gofer to serve. +func (api *api) AddBundleDirs(req *AddBundleDirsRequest, _ *struct{}) error { + log.Debugf("fsgofer.AddBundleDirs") + for cid, bd := range req.BundleDirs { + if _, ok := api.bundleDirs[cid]; ok { + return fmt.Errorf("fsgofer already has a bundleDir for container %q", cid) + } + api.bundleDirs[cid] = bd + } + return nil +} + +// ServeDirectoryRequest is the URPC argument to ServeDirectory. +type ServeDirectoryRequest struct { + // Dir is the absolute path to a directory to be served to the sentry. + Dir string + + // IsReadOnly specifies whether the directory should be served in + // read-only mode. + IsReadOnly bool + + // CID is the container ID of the container that needs to serve a + // directory. + CID string + + // FilePayload contains the socket over which the sentry will request + // files from Dir. + urpc.FilePayload +} + +// ServeDirectory begins serving a directory via a file descriptor for the +// sentry. Directories must be added via AddBundleDirsRequest before +// ServeDirectory is called.
+func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { + log.Debugf("fsgofer.ServeDirectory: %+v", req) + + if req.Dir == "" { + return fmt.Errorf("ServeDirectory should receive a directory argument, but was empty") + } + if req.CID == "" { + return fmt.Errorf("ServeDirectory should receive a CID argument, but was empty") + } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/ directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. + if filepath.Clean(req.CID) != req.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", req.CID) + } + if nFiles := len(req.FilePayload.Files); nFiles != 1 { + return fmt.Errorf("ServeDirectory should receive 1 file descriptor, but got %d", nFiles) + } + + bd, ok := api.bundleDirs[req.CID] + if !ok { + // If there's no entry in bundleDirs for the container ID, this + // is the root container. + bd = api.rootBundleDir + } + + // Relative paths are served relative to the bundle directory. + absDir := req.Dir + if !filepath.IsAbs(absDir) { + absDir = filepath.Join(bd, req.Dir) + } + + // Create the attach point and start serving. + at := NewAttachPoint(absDir, Config{ + ROMount: req.IsReadOnly, + LazyOpenForWrite: true, + }) + api.serve(at, int(req.FilePayload.Files[0].Fd())) + + return nil +} + +// serve begins serving a directory via a file descriptor. +func (api *api) serve(at p9.Attacher, ioFD int) { + api.p9wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + panic(fmt.Sprintf("err creating server on FD %d: %v", ioFD, err)) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err)) + } + api.p9wg.Done() + }(ioFD, at) +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 52cdc91a2..38263896a 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -66,6 +66,11 @@ func (f fileType) String() string { return "unknown" } +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-gofer.%s", id) +} + // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index a961c3cc7..cdacc5e22 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/control", "//pkg/urpc", "//runsc/boot", + "//runsc/fsgofer", "//runsc/specutils", "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2b043d412..83cc94dc4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -84,7 +85,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // StartRoot starts running the root container process inside the sandbox. 
func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -104,21 +105,67 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } +// CreateChild creates a non-root container inside the sandbox. +func (s *Sandbox) CreateChild(cid, bundleDir string) error { + log.Debugf("Create non-root container sandbox %q, pid: %d for container %q with bundle directory %q", s.ID, s.Pid, cid, bundleDir) + + // Connect to the gofer and prepare it to serve from bundleDir for this + // container. + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + goferReq := fsgofer.AddBundleDirsRequest{BundleDirs: map[string]string{cid: bundleDir}} + if err := goferConn.Call(fsgofer.AddBundleDirs, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + + return nil +} + // Start starts running a non-root container inside the sandbox. func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + + // Create socket that connects the sandbox and gofer. + sandEnd, goferEnd, err := createSocketPair() if err != nil { return err } - defer conn.Close() + defer sandEnd.Close() + defer goferEnd.Close() + + // Tell the Gofer about the new filesystem it needs to serve. + goferReq := fsgofer.ServeDirectoryRequest{ + Dir: spec.Root.Path, + IsReadOnly: spec.Root.Readonly, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{goferEnd}}, + } + if err := goferConn.Call(fsgofer.ServeDirectory, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + // Start running the container. args := boot.StartArgs{ - Spec: spec, - Conf: conf, - CID: cid, + Spec: spec, + Conf: conf, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}}, } - if err := conn.Call(boot.ContainerStart, args, nil); err != nil { + if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) } @@ -142,7 +189,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -165,7 +212,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -183,7 +230,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { // Execute runs the specified command in the container. 
func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return 0, s.connError(err) } @@ -203,7 +250,7 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, // Event retrieves stats about the sandbox such as memory and CPU utilization. func (s *Sandbox) Event(cid string) (*boot.Event, error) { log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -219,7 +266,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { return &e, nil } -func (s *Sandbox) connect() (*urpc.Client, error) { +func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { log.Debugf("Connecting to sandbox %q", s.ID) conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) if err != nil { @@ -228,6 +275,15 @@ func (s *Sandbox) connect() (*urpc.Client, error) { return conn, nil } +func (s *Sandbox) goferConnect() (*urpc.Client, error) { + log.Debugf("Connecting to gofer for sandbox %q", s.ID) + conn, err := client.ConnectTo(fsgofer.ControlSocketAddr(s.ID)) + if err != nil { + return nil, s.connError(err) + } + return conn, nil +} + func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } @@ -244,31 +300,45 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle // Add root mount and then add any other additional mounts. mountCount := 1 + + // Add additional mounts. for _, m := range spec.Mounts { if specutils.Is9PMount(m) { mountCount++ } } - sandEnds := make([]*os.File, 0, mountCount) goferEnds := make([]*os.File, 0, mountCount) - for i := 0; i < mountCount; i++ { - // Create socket that connects the sandbox and gofer. - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. + var nextFD int + for nextFD = 3; nextFD-3 < mountCount; nextFD++ { + sandEnd, goferEnd, err := createSocketPair() if err != nil { return nil, err } - sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) - - goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") defer goferEnd.Close() + sandEnds = append(sandEnds, sandEnd) goferEnds = append(goferEnds, goferEnd) + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + } - args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + // Create and donate a file descriptor for the control server. + addr := fsgofer.ControlSocketAddr(s.ID) + serverFD, err := server.CreateSocket(addr) + if err != nil { + return nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } + // Add the control server fd. + args = append(args, "--controller-fd="+strconv.Itoa(nextFD)) + nextFD++ + controllerFile := os.NewFile(uintptr(serverFD), "gofer_control_socket_server") + defer controllerFile.Close() + cmd := exec.Command(binPath, args...) 
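Note: this step depends on Go's file-donation convention that the surrounding code relies on: every *os.File placed in cmd.ExtraFiles is inherited by the child process as file descriptor 3+i (0-2 are stdin, stdout, and stderr), which is why nextFD starts at 3 and why the --io-fds and --controller-fd flag values are plain integers. A minimal sketch of the convention, assuming the usual os and os/exec imports; sockEnd and ctrlFile are placeholder names, not identifiers from this patch:

    // startChild donates two files to a child process. Entry i of ExtraFiles
    // becomes FD 3+i in the child, so the flag values must match slice order.
    func startChild(binPath string, sockEnd, ctrlFile *os.File) (*exec.Cmd, error) {
        cmd := exec.Command(binPath, "gofer", "--io-fds=3", "--controller-fd=4")
        cmd.ExtraFiles = []*os.File{sockEnd, ctrlFile} // child sees FD 3 and FD 4
        if err := cmd.Start(); err != nil {
            return nil, err
        }
        return cmd, nil
    }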
cmd.ExtraFiles = goferEnds + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the @@ -286,6 +356,15 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle return sandEnds, nil } +// createSocketPair creates a pair of files wrapping a socket pair. +func createSocketPair() (*os.File, *os.File, error) { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil +} + // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { @@ -296,7 +375,9 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Create control server socket here and donate FD to child process because // it may be in a different network namespace and won't be reachable from // outside. - fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + addr := boot.ControlSocketAddr(s.ID) + fd, err := server.CreateSocket(addr) + log.Infof("creating sandbox process with addr: %s", addr) if err != nil { return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } @@ -438,7 +519,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { if err := specutils.WaitForReady(s.Pid, timeout, ready); err != nil { return fmt.Errorf("unexpected error waiting for sandbox %q, err: %v", s.ID, err) } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -454,7 +535,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -471,7 +552,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -536,7 +617,7 @@ func (s *Sandbox) Destroy() error { // Signal sends the signal to a container in the sandbox. func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { log.Debugf("Signal sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -556,7 +637,7 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { // The statefile will be written to f. func (s *Sandbox) Checkpoint(cid string, f *os.File) error { log.Debugf("Checkpoint sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -577,7 +658,7 @@ func (s *Sandbox) Checkpoint(cid string, f *os.File) error { // Pause sends the pause call for a container in the sandbox. 
func (s *Sandbox) Pause(cid string) error { log.Debugf("Pause sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -592,7 +673,7 @@ func (s *Sandbox) Pause(cid string) error { // Resume sends the resume call for a container in the sandbox. func (s *Sandbox) Resume(cid string) error { log.Debugf("Resume sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -630,7 +711,7 @@ func (s *Sandbox) IsRunning() bool { // Stacks collects and returns all stacks for the sandbox. func (s *Sandbox) Stacks() (string, error) { log.Debugf("Stacks sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return "", err } -- cgit v1.2.3 From da087e66cc0eb1616437e5b729576801671d3696 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 16 Aug 2018 10:54:21 -0700 Subject: Combine functions to search for file under one common function Bazel adds the build type in front of directories making it hard to refer to binaries in code. PiperOrigin-RevId: 209010854 Change-Id: I6c9da1ac3bbe79766868a3b14222dd42d03b4ec5 --- runsc/container/container_test.go | 32 ++----------------- runsc/test/testutil/testutil.go | 66 ++++++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 56 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 33c53e189..10b10d100 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -206,27 +206,6 @@ func run(spec *specs.Spec, conf *boot.Config) error { return nil } -// findUDSApp finds the uds_test_app binary to be used in the UnixDomainSocket test. -func findUDSApp() (string, error) { - // TODO: Use bazel FindBinary function. - - // uds_test_app is in a directory like: - // './linux_amd64_pure_stripped/uds_test_app.go'. - // - // Since I don't want to construct 'linux_amd64_pure_stripped' based on the - // build type, do a quick search for: './*/uds_test_app' - // Note: This glob will only succeed when file is one directory deep. - matches, err := filepath.Glob("./*/uds_test_app") - if err != nil { - return "", fmt.Errorf("error globbing: %v", err) - } - if i := len(matches); i != 1 { - return "", fmt.Errorf("error identifying uds_test_app from matches: got %d matches", i) - } - - return matches[0], nil -} - type configOption int const ( @@ -760,16 +739,9 @@ func TestUnixDomainSockets(t *testing.T) { // Get file path for corresponding output file in sandbox. outputFileSandbox := filepath.Join(goferRoot, output) - // Need to get working directory, even though not intuitive. - wd, _ := os.Getwd() - localPath, err := findUDSApp() + app, err := testutil.FindFile("runsc/container/uds_test_app") if err != nil { - t.Fatalf("error finding localPath: %v", err) - } - app := filepath.Join(wd, localPath) - - if _, err = os.Stat(app); err != nil { - t.Fatalf("error finding the uds_test_app: %v", err) + t.Fatal("error finding uds_test_app:", err) } socketPath := filepath.Join(dir, socket) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index d2b39b58c..2553e7453 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -37,39 +37,53 @@ var RaceEnabled = false // ConfigureExePath configures the executable for runsc in the test environment. func ConfigureExePath() error { - - // runsc is in a directory like: 'runsc/linux_amd64_pure_stripped/runsc'. 
- // Since I don't want to construct 'linux_amd64_pure_stripped' based on the - // build type, do a quick search for: 'runsc/*/runsc' - exePath := "" - lv1 := "./runsc" - lv1fis, err := ioutil.ReadDir(lv1) if err != nil { return err } - for _, fi := range lv1fis { - if !fi.IsDir() { - continue - } - lv2fis, err := ioutil.ReadDir(filepath.Join(lv1, fi.Name())) - if err != nil { - return err + path, err := FindFile("runsc/runsc") + if err != nil { + return err + } + specutils.ExePath = path + return nil +} + +// FindFile searches for a file inside the test run environment. It returns the +// full path to the file. It fails if none or more than one file is found. +func FindFile(path string) (string, error) { + wd, err := os.Getwd() + if err != nil { + return "", err + } + + // The test root is demarcated by a path element called "__main__". Search for + // it backwards from the working directory. + root := wd + for { + dir, name := filepath.Split(root) + if name == "__main__" { + break } - for _, candidate := range lv2fis { - if !candidate.IsDir() && candidate.Name() == "runsc" { - exePath, err = filepath.Abs(filepath.Join(lv1, fi.Name(), candidate.Name())) - if err != nil { - return err - } - break - } + if len(dir) == 0 { + return "", fmt.Errorf("directory __main__ not found in %q", wd) } + // Remove ending slash to loop around. + root = dir[:len(dir)-1] + } + + // bazel adds the build type to the directory structure. Since I don't want + // to guess what build type it is, just place '*' to match anything. + // + // The pattern goes like: /test-path/__main__/directories/*/file. + pattern := filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)) + matches, err := filepath.Glob(pattern) + if err != nil { + return "", fmt.Errorf("error globbing %q: %v", pattern, err) } - if exePath == "" { - return fmt.Errorf("path to runsc not found") + if len(matches) == 0 { + return "", fmt.Errorf("file %q not found", path) } - specutils.ExePath = exePath - return nil + if len(matches) != 1 { + return "", fmt.Errorf("more than one match found for %q: %s", path, matches) + } + return matches[0], nil } // TestConfig return the default configuration to use in tests. -- cgit v1.2.3 From 11800311a537bf1286f71ab419fa251a1e81e54f Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 17 Aug 2018 13:05:59 -0700 Subject: Add nonExclusiveFS dimension to more tests The ones using 'kvm' actually mean that they don't want overlay. PiperOrigin-RevId: 209194318 Change-Id: I941a443cb6d783e2c80cf66eb8d8630bcacdb574 --- runsc/container/container_test.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 10b10d100..843b7f6f8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -215,6 +215,7 @@ const ( ) var all = []configOption{overlay, kvm, nonExclusiveFS} +var noOverlay = []configOption{kvm, nonExclusiveFS} // configs generates different configurations to run tests. func configs(opts ...configOption) []*boot.Config { @@ -557,7 +558,7 @@ func TestExec(t *testing.T) { // be the next consecutive number after the last number from the checkpointed container. func TestCheckpointRestore(t *testing.T) { // Skip overlay because test requires writing to host file. - for _, conf := range configs(kvm) { + for _, conf := range configs(noOverlay...)
{ t.Logf("Running test with conf: %+v", conf) dir, err := ioutil.TempDir("", "checkpoint-test") @@ -716,7 +717,7 @@ func TestUnixDomainSockets(t *testing.T) { ) // Skip overlay because test requires writing to host file. - for _, conf := range configs(kvm) { + for _, conf := range configs(noOverlay...) { t.Logf("Running test with conf: %+v", conf) dir, err := ioutil.TempDir("", "uds-test") @@ -852,7 +853,7 @@ func TestUnixDomainSockets(t *testing.T) { // It will then unpause and confirm that both processes are running. Then it will // wait until one sleep completes and check to make sure the other is running. func TestPauseResume(t *testing.T) { - for _, conf := range configs(kvm) { + for _, conf := range configs(noOverlay...) { t.Logf("Running test with conf: %+v", conf) const uid = 343 spec := testutil.NewSpecWithArgs("sleep", "20") @@ -1208,7 +1209,7 @@ func TestConsoleSocket(t *testing.T) { // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { - for _, conf := range configs(kvm) { + for _, conf := range configs(noOverlay...) { t.Logf("Running test with conf: %+v", conf) spec := testutil.NewSpecWithArgs("/bin/true") -- cgit v1.2.3 From 0fc7b306959e83ebf14792206c9a626490b02c2d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 20 Aug 2018 11:25:42 -0700 Subject: Standardize mounts in tests Tests get a readonly rootfs mapped to / (which was the case before) and writable TEST_TMPDIR. This makes it easier to setup containers to write to files and to share state between test and containers. PiperOrigin-RevId: 209453224 Change-Id: I4d988e45dc0909a0450a3bb882fe280cf9c24334 --- runsc/container/container_test.go | 184 +++++++++++++++----------------------- runsc/sandbox/sandbox.go | 2 +- runsc/test/testutil/testutil.go | 28 +++++- 3 files changed, 95 insertions(+), 119 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 843b7f6f8..a2da63afd 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -558,17 +558,19 @@ func TestExec(t *testing.T) { // be the next consecutive number after the last number from the checkpointed container. func TestCheckpointRestore(t *testing.T) { // Skip overlay because test requires writing to host file. - for _, conf := range configs(noOverlay...) { + // + // TODO: Skip nonExclusiveFS because $TEST_TMPDIR mount is + // mistakenly marked as RO after revalidation. 
+ for _, conf := range configs(kvm) { t.Logf("Running test with conf: %+v", conf) - dir, err := ioutil.TempDir("", "checkpoint-test") + dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test") if err != nil { t.Fatalf("ioutil.TempDir failed: %v", err) } if err := os.Chmod(dir, 0777); err != nil { t.Fatalf("error chmoding file: %q, %v", dir, err) } - defer os.RemoveAll(dir) outputPath := filepath.Join(dir, "output") outputFile, err := createWriteableOutputFile(outputPath) @@ -577,14 +579,8 @@ func TestCheckpointRestore(t *testing.T) { } defer outputFile.Close() - script := "for ((i=0; ;i++)); do echo $i >> /tmp2/output; sleep 1; done" + script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath) spec := testutil.NewSpecWithArgs("bash", "-c", script) - spec.Mounts = append(spec.Mounts, specs.Mount{ - Type: "bind", - Destination: "/tmp2", - Source: dir, - }) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -711,53 +707,39 @@ func TestCheckpointRestore(t *testing.T) { // with filesystem Unix Domain Socket use. func TestUnixDomainSockets(t *testing.T) { const ( - output = "uds_output" - goferRoot = "/tmp2" - socket = "uds_socket" + output = "uds_output" + socket = "uds_socket" ) // Skip overlay because test requires writing to host file. - for _, conf := range configs(noOverlay...) { + // + // TODO: Skip nonExclusiveFS because $TEST_TMPDIR mount is + // mistakenly marked as RO after revalidation. + for _, conf := range configs(kvm) { t.Logf("Running test with conf: %+v", conf) - dir, err := ioutil.TempDir("", "uds-test") + dir, err := ioutil.TempDir(testutil.TmpDir(), "uds-test") if err != nil { t.Fatalf("ioutil.TempDir failed: %v", err) } - if err := os.Chmod(dir, 0777); err != nil { - t.Fatalf("error chmoding file: %q, %v", dir, err) - } defer os.RemoveAll(dir) outputPath := filepath.Join(dir, output) - - outputFile, err := createWriteableOutputFile(outputPath) + outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) if err != nil { t.Fatalf("error creating output file: %v", err) } defer outputFile.Close() - // Get file path for corresponding output file in sandbox. 
- outputFileSandbox := filepath.Join(goferRoot, output) - app, err := testutil.FindFile("runsc/container/uds_test_app") if err != nil { t.Fatal("error finding uds_test_app:", err) } socketPath := filepath.Join(dir, socket) - socketPathSandbox := filepath.Join(goferRoot, socket) defer os.Remove(socketPath) - spec := testutil.NewSpecWithArgs(app, "--file", outputFileSandbox, - "--socket", socketPathSandbox) - - spec.Mounts = append(spec.Mounts, specs.Mount{ - Type: "bind", - Destination: goferRoot, - Source: dir, - }) - + spec := testutil.NewSpecWithArgs(app, "--file", outputPath, "--socket", socketPath) spec.Process.User = specs.User{ UID: uint32(os.Getuid()), GID: uint32(os.Getgid()), @@ -811,7 +793,7 @@ func TestUnixDomainSockets(t *testing.T) { if err := os.Remove(outputPath); err != nil { t.Fatalf("error removing file") } - outputFile2, err := createWriteableOutputFile(outputPath) + outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) if err != nil { t.Fatalf("error creating output file: %v", err) } @@ -858,20 +840,11 @@ func TestPauseResume(t *testing.T) { const uid = 343 spec := testutil.NewSpecWithArgs("sleep", "20") - dir, err := ioutil.TempDir("", "pause-test") - if err != nil { - t.Fatalf("ioutil.TempDir failed: %v", err) - } - lock, err := ioutil.TempFile(dir, "lock") + lock, err := ioutil.TempFile(testutil.TmpDir(), "lock") if err != nil { t.Fatalf("error creating output file: %v", err) } defer lock.Close() - spec.Mounts = append(spec.Mounts, specs.Mount{ - Type: "bind", - Destination: "/tmp2", - Source: dir, - }) rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -908,7 +881,7 @@ func TestPauseResume(t *testing.T) { }, } - script := fmt.Sprintf("while [[ -f /tmp2/%s ]]; do sleep 0.1; done", filepath.Base(lock.Name())) + script := fmt.Sprintf("while [[ -f %q ]]; do sleep 0.1; done", lock.Name()) execArgs := control.ExecArgs{ Filename: "/bin/bash", Argv: []string{"bash", "-c", script}, @@ -1040,14 +1013,6 @@ func TestCapabilities(t *testing.T) { t.Logf("Running test with conf: %+v", conf) spec := testutil.NewSpecWithArgs("sleep", "100") - - // We generate files in the host temporary directory. - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: os.TempDir(), - Source: os.TempDir(), - Type: "bind", - }) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -1218,7 +1183,7 @@ func TestRunNonRoot(t *testing.T) { // User that container runs as can't list '$TMP/blocked' and would fail to // mount it. - dir, err := ioutil.TempDir("", "blocked") + dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked") if err != nil { t.Fatalf("ioutil.TempDir() failed: %v", err) } @@ -1230,15 +1195,8 @@ func TestRunNonRoot(t *testing.T) { t.Fatalf("os.MkDir(%q) failed: %v", dir, err) } - // We generate files in the host temporary directory. 
- spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: dir, - Type: "bind", - }) - if err := run(spec, conf); err != nil { - t.Fatalf("error running sadbox: %v", err) + t.Fatalf("error running sandbox: %v", err) } } } @@ -1249,17 +1207,20 @@ func TestMountNewDir(t *testing.T) { for _, conf := range configs(overlay) { t.Logf("Running test with conf: %+v", conf) - srcDir := path.Join(os.TempDir(), "src", "newdir", "anotherdir") + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + if err := os.Chmod(root, 0755); err != nil { + t.Fatalf("os.Chmod(%q) failed: %v", root, err) + } + + srcDir := path.Join(root, "src", "dir", "anotherdir") if err := os.MkdirAll(srcDir, 0755); err != nil { t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) } - // Attempt to remove dir to ensure it doesn't exist. - mountDir := path.Join(os.TempDir(), "newdir") - if err := os.RemoveAll(mountDir); err != nil { - t.Fatalf("os.RemoveAll(%q) failed: %v", mountDir, err) - } - mountDir = path.Join(mountDir, "anotherdir") + mountDir := path.Join(root, "dir", "anotherdir") spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) spec.Mounts = append(spec.Mounts, specs.Mount{ @@ -1269,7 +1230,7 @@ func TestMountNewDir(t *testing.T) { }) if err := run(spec, conf); err != nil { - t.Fatalf("error running sadbox: %v", err) + t.Fatalf("error running sandbox: %v", err) } } } @@ -1310,13 +1271,13 @@ func TestReadonlyMount(t *testing.T) { for _, conf := range configs(overlay) { t.Logf("Running test with conf: %+v", conf) - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo/file") - dir, err := ioutil.TempDir("", "ro-mount") + dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") + spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) if err != nil { t.Fatalf("ioutil.TempDir() failed: %v", err) } spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: "/foo", + Destination: dir, Source: dir, Type: "bind", Options: []string{"ro"}, @@ -1613,17 +1574,14 @@ func TestContainerVolumeContentsShared(t *testing.T) { // the filesystem. spec := testutil.NewSpecWithArgs("sleep", "1000") - // Mount host temp dir inside the sandbox at '/tmp2'. - hostTmpDir, err := ioutil.TempDir("", "root-fs-test") - sandboxTmpDir := "/tmp2" + // TODO: $TEST_TMPDIR mount is mistakenly marked as RO after + // revalidation. Remove when it's fixed. + spec.Root.Readonly = false + + dir, err := ioutil.TempDir(testutil.TmpDir(), "root-fs-test") if err != nil { t.Fatalf("TempDir failed: %v", err) } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Type: "bind", - Destination: sandboxTmpDir, - Source: hostTmpDir, - }) rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -1643,105 +1601,103 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // File that will be used to check consistency inside/outside sandbox. - hostFilename := filepath.Join(hostTmpDir, "file") - sandboxFilename := filepath.Join(sandboxTmpDir, "file") + filename := filepath.Join(dir, "file") // File does not exist yet. Reading from the sandbox should fail. 
execArgsTestFile := control.ExecArgs{ Filename: "/usr/bin/test", - Argv: []string{"test", "-f", sandboxFilename}, + Argv: []string{"test", "-f", filename}, } if ws, err := c.Execute(&execArgsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) } // Create the file from outside of the sandbox. - if err := ioutil.WriteFile(hostFilename, []byte("foobar"), 0777); err != nil { - t.Fatalf("error writing to file %q: %v", hostFilename, err) + if err := ioutil.WriteFile(filename, []byte("foobar"), 0777); err != nil { + t.Fatalf("error writing to file %q: %v", filename, err) } // Now we should be able to test the file from within the sandbox. if ws, err := c.Execute(&execArgsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { - t.Errorf("test %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) } // Rename the file from outside of the sandbox. - newHostFilename := filepath.Join(hostTmpDir, "newfile") - newSandboxFilename := filepath.Join(sandboxTmpDir, "newfile") - if err := os.Rename(hostFilename, newHostFilename); err != nil { - t.Fatalf("os.Rename(%q, %q) failed: %v", hostFilename, newHostFilename, err) + newFilename := filepath.Join(dir, "newfile") + if err := os.Rename(filename, newFilename); err != nil { + t.Fatalf("os.Rename(%q, %q) failed: %v", filename, newFilename, err) } // File should no longer exist at the old path within the sandbox. if ws, err := c.Execute(&execArgsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", sandboxFilename, ws.ExitStatus()) + t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) } // We should be able to test the new filename from within the sandbox. execArgsTestNewFile := control.ExecArgs{ Filename: "/usr/bin/test", - Argv: []string{"test", "-f", newSandboxFilename}, + Argv: []string{"test", "-f", newFilename}, } if ws, err := c.Execute(&execArgsTestNewFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", newSandboxFilename, err) + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() != 0 { - t.Errorf("test %q exited with code %v, wanted zero", newSandboxFilename, ws.ExitStatus()) + t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) } // Delete the renamed file from outside of the sandbox. - if err := os.Remove(newHostFilename); err != nil { - t.Fatalf("error removing file %q: %v", hostFilename, err) + if err := os.Remove(newFilename); err != nil { + t.Fatalf("error removing file %q: %v", filename, err) } // Renamed file should no longer exist at the old path within the sandbox. 
if ws, err := c.Execute(&execArgsTestNewFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", newSandboxFilename, err) + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", newSandboxFilename, ws.ExitStatus()) + t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) } // Now create the file from WITHIN the sandbox. execArgsTouch := control.ExecArgs{ Filename: "/usr/bin/touch", - Argv: []string{"touch", sandboxFilename}, + Argv: []string{"touch", filename}, KUID: auth.KUID(os.Getuid()), KGID: auth.KGID(os.Getgid()), } if ws, err := c.Execute(&execArgsTouch); err != nil { - t.Fatalf("unexpected error touching file %q: %v", sandboxFilename, err) + t.Fatalf("unexpected error touching file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { - t.Errorf("touch %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) } // File should exist outside the sandbox. - if _, err := os.Stat(hostFilename); err != nil { - t.Errorf("stat %q got error %v, wanted nil", hostFilename, err) + if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) } // File should exist outside the sandbox. - if _, err := os.Stat(hostFilename); err != nil { - t.Errorf("stat %q got error %v, wanted nil", hostFilename, err) + if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) } // Delete the file from within the sandbox. execArgsRemove := control.ExecArgs{ Filename: "/bin/rm", - Argv: []string{"rm", sandboxFilename}, + Argv: []string{"rm", filename}, } if ws, err := c.Execute(&execArgsRemove); err != nil { - t.Fatalf("unexpected error removing file %q: %v", sandboxFilename, err) + t.Fatalf("unexpected error removing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { - t.Errorf("remove %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) } // File should not exist outside the sandbox. - if _, err := os.Stat(hostFilename); !os.IsNotExist(err) { - t.Errorf("stat %q got error %v, wanted ErrNotExist", hostFilename, err) + if _, err := os.Stat(filename); !os.IsNotExist(err) { + t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err) } } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 83cc94dc4..a10b79856 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -377,7 +377,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // outside. addr := boot.ControlSocketAddr(s.ID) fd, err := server.CreateSocket(addr) - log.Infof("creating sandbox process with addr: %s", addr) + log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". if err != nil { return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 2553e7453..fc3d61e52 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -35,6 +35,16 @@ import ( // RaceEnabled is set to true if it was built with '--race' option. var RaceEnabled = false +// TmpDir returns the absolute path to a writable directory that can be used as +// scratch by the test. 
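Note: with the mounts standardized by this change, every spec built by NewSpecWithArgs bind-mounts the test scratch directory read-write into the container, so the container_test.go hunks above can pass absolute host paths straight through. A minimal usage sketch with the testutil helpers introduced here (TmpDir itself is defined just below):

    // The scratch dir is visible both to the test and inside the sandbox,
    // because NewSpecWithArgs bind-mounts testutil.TmpDir() read-write.
    dir, err := ioutil.TempDir(testutil.TmpDir(), "example")
    if err != nil {
        t.Fatalf("ioutil.TempDir failed: %v", err)
    }
    spec := testutil.NewSpecWithArgs("/bin/touch", filepath.Join(dir, "file"))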
+func TmpDir() string { + dir := os.Getenv("TEST_TMPDIR") + if dir == "" { + dir = "/tmp" + } + return dir +} + // ConfigureExePath configures the executable for runsc in the test environment. func ConfigureExePath() error { path, err := FindFile("runsc/runsc") @@ -102,7 +112,7 @@ func TestConfig() *boot.Config { // NewSpecWithArgs creates a simple spec with the given args suitable for use // in tests. func NewSpecWithArgs(args ...string) *specs.Spec { - spec := &specs.Spec{ + return &specs.Spec{ // The host filesystem root is the container root. Root: &specs.Root{ Path: "/", @@ -114,13 +124,23 @@ func NewSpecWithArgs(args ...string) *specs.Spec { "PATH=" + os.Getenv("PATH"), }, }, + Mounts: []specs.Mount{ + // Root is readonly, but many tests want to write to tmpdir. + // This creates a writable mount inside the root. Also, when tmpdir points + // to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs + // inside the sentry. + specs.Mount{ + Type: "bind", + Destination: TmpDir(), + Source: TmpDir(), + }, + }, } - return spec } // SetupRootDir creates a root directory for containers. func SetupRootDir() (string, error) { - rootDir, err := ioutil.TempDir("", "containers") + rootDir, err := ioutil.TempDir(TmpDir(), "containers") if err != nil { return "", fmt.Errorf("error creating root dir: %v", err) } @@ -141,7 +161,7 @@ func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir str // SetupContainerInRoot creates a bundle for the container, generates a test // config, and writes the spec to config.json in the bundle dir. func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) (bundleDir string, err error) { - bundleDir, err = ioutil.TempDir("", "bundle") + bundleDir, err = ioutil.TempDir(TmpDir(), "bundle") if err != nil { return "", fmt.Errorf("error creating bundle dir: %v", err) } -- cgit v1.2.3 From d6d165cb0b8147461388287ffd4cfee221940123 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 21 Aug 2018 13:13:34 -0700 Subject: Initial change for multi-gofer support PiperOrigin-RevId: 209647293 Change-Id: I980fca1257ea3fcce796388a049c353b0303a8a5 --- runsc/cmd/debug.go | 2 +- runsc/container/BUILD | 2 +- runsc/container/container.go | 111 ++++++++++++++++++++++---------------- runsc/container/container_test.go | 47 +++++++++++++++- runsc/sandbox/BUILD | 16 +----- runsc/sandbox/sandbox.go | 85 +++++++++-------------------- runsc/sandbox/sandbox_test.go | 75 -------------------------- 7 files changed, 139 insertions(+), 199 deletions(-) delete mode 100644 runsc/sandbox/sandbox_test.go (limited to 'runsc') diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 87ad21c9a..7952489de 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -92,7 +92,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) - if !c.Sandbox.IsRunning() { + if !c.IsRunning() { Fatalf("sandbox %q is not running", c.Sandbox.ID) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index d4c650892..1171355c8 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -23,10 +23,10 @@ go_library( deps = [ "//pkg/log", "//pkg/sentry/control", - "//pkg/syserror", "//runsc/boot", "//runsc/sandbox", "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/container/container.go b/runsc/container/container.go index 
da2ce0d25..8bd47aac1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -16,6 +16,7 @@ package container import ( + "context" "encoding/json" "fmt" "io/ioutil" @@ -27,10 +28,10 @@ import ( "syscall" "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" - "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/sandbox" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -89,6 +90,10 @@ type Container struct { // Status is the current container Status. Status Status `json:"status"` + // GoferPid is the pid of the gofer running along side the sandbox. May + // be 0 if the gofer has been killed or it's not being used. + GoferPid int `json:"goferPid"` + // Sandbox is the sandbox this container is running in. It will be nil // if the container is not in state Running or Created. Sandbox *sandbox.Sandbox `json:"sandbox"` @@ -130,12 +135,11 @@ func Load(rootDir, id string) (*Container, error) { // This is inherently racey. if c.Status == Running || c.Status == Created { // Check if the sandbox process is still running. - if c.Sandbox.IsRunning() { + if c.IsRunning() { // TODO: Send a message into the sandbox to // see if this particular container is still running. } else { - // Sandbox no longer exists, so this container - // definitely does not exist. + // Sandbox no longer exists, so this container definitely does not exist. c.Status = Stopped c.Sandbox = nil } @@ -221,12 +225,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo log.Debugf("Creating new sandbox for container %q", id) // Start a new sandbox for this container. Any errors after this point // must destroy the container. - s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) + s, goferPid, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) if err != nil { c.Destroy() return nil, err } c.Sandbox = s + c.GoferPid = goferPid } else { // This is sort of confusing. For a sandbox with a root // container and a child container in it, runsc sees: @@ -398,7 +403,18 @@ func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { if c.Status == Stopped { return 0, fmt.Errorf("container is stopped") } - return c.Sandbox.WaitPID(pid, c.ID) + ws, err := c.Sandbox.WaitPID(pid, c.ID) + if err != nil { + return 0, err + } + if c.Sandbox.IsRootContainer(c.ID) { + // If waiting for the root, give some time for the sandbox process to exit + // to prevent races with resources that might still be in use. + if err := c.waitForStopped(); err != nil { + return 0, err + } + } + return ws, nil } // Signal sends the signal to the container. @@ -502,6 +518,14 @@ func (c *Container) Destroy() error { log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) } } + if c.GoferPid != 0 { + log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) + if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { + log.Warningf("error sending signal %d to pid %d: %v", syscall.SIGKILL, c.GoferPid, err) + } else { + c.GoferPid = 0 + } + } c.Sandbox = nil c.Status = Stopped @@ -509,29 +533,38 @@ func (c *Container) Destroy() error { return nil } -// DestroyAndWait frees all resources associated with the container -// and waits for destroy to finish before returning. 
-func (c *Container) DestroyAndWait() error { - sandboxPid := c.Sandbox.Pid - goferPid := c.Sandbox.GoferPid - - if err := c.Destroy(); err != nil { - return fmt.Errorf("error destroying container %v: %v", c, err) +// IsRunning returns true if the sandbox or gofer process is running. +func (c *Container) IsRunning() bool { + if c.Status == Stopped { + return false } - - if sandboxPid != 0 { - if err := waitForDeath(sandboxPid, 5*time.Second); err != nil { - return fmt.Errorf("error waiting for sandbox death: %v", err) - } + if c.Sandbox != nil && c.Sandbox.IsRunning() { + return true } + if c.GoferPid != 0 { + // Send a signal 0 to the gofer process. + if err := syscall.Kill(c.GoferPid, 0); err == nil { + log.Warningf("Found orphan gofer process, pid: %d", c.GoferPid) + // Attempt to kill gofer if it's orphan. + syscall.Kill(c.GoferPid, syscall.SIGKILL) - if goferPid != 0 { - if err := waitForDeath(goferPid, 5*time.Second); err != nil { - return fmt.Errorf("error waiting for gofer death: %v", err) + // Don't wait for gofer to die. Return 'running' and hope gofer is dead + // next time around. + return true } } + return false +} - return nil +// DestroyAndWait frees all resources associated with the container +// and waits for destroy to finish before returning. +// +// TODO: This only works for single container. +func (c *Container) DestroyAndWait() error { + if err := c.Destroy(); err != nil { + return fmt.Errorf("error destroying container %v: %v", c, err) + } + return c.waitForStopped() } // save saves the container metadata to a file. @@ -551,29 +584,15 @@ func (c *Container) save() error { return nil } -// waitForDeath ensures that process is dead before proceeding. -// -// This is racy because the kernel can potentially reuse the pid in the time -// between the process' death and the first check after the process has ended. -func waitForDeath(pid int, timeout time.Duration) error { - backoff := 1 * time.Millisecond - for start := time.Now(); time.Now().Sub(start) < timeout; { - - if err := syscall.Kill(pid, 0); err != nil { - if err == syserror.ESRCH { - // pid does not exist so process must have died - return nil - } - return fmt.Errorf("error killing pid (%d): %v", pid, err) - } - // pid is still alive. - - // Process continues to run, backoff and retry. - time.Sleep(backoff) - backoff *= 2 - if backoff > 1*time.Second { - backoff = 1 * time.Second +func (c *Container) waitForStopped() error { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + op := func() error { + if !c.IsRunning() { + return fmt.Errorf("container is still running") } + return nil } - return fmt.Errorf("timed out waiting for process (%d)", pid) + return backoff.Retry(op, b) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index a2da63afd..dadf8445b 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -350,7 +350,7 @@ func TestLifecycle(t *testing.T) { // ourselves. p, _ := os.FindProcess(s.Sandbox.Pid) p.Wait() - g, _ := os.FindProcess(s.Sandbox.GoferPid) + g, _ := os.FindProcess(s.GoferPid) g.Wait() // Load the container from disk and check the status. 
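Note: the container.go hunks above replace the hand-rolled waitForDeath loop with two small building blocks: a kill(pid, 0) probe (signal 0 performs the existence and permission checks without delivering anything) and a bounded retry from github.com/cenkalti/backoff. The same pattern in isolation, as a sketch assuming imports of context, fmt, syscall, time, and the backoff package; pidExists and waitStopped are illustrative names, not part of the patch:

    // pidExists reports whether a process with the given PID can be signaled.
    // A nil error means the PID is live; note that EPERM also implies a live
    // process that we are not allowed to signal.
    func pidExists(pid int) bool {
        return syscall.Kill(pid, 0) == nil
    }

    // waitStopped polls cond every 100ms for up to one second, the same policy
    // waitForStopped uses above.
    func waitStopped(cond func() bool) error {
        ctx, cancel := context.WithTimeout(context.Background(), time.Second)
        defer cancel()
        b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
        return backoff.Retry(func() error {
            if !cond() {
                return fmt.Errorf("still running")
            }
            return nil
        }, b)
    }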
@@ -1701,3 +1701,48 @@ func TestContainerVolumeContentsShared(t *testing.T) { t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err) } } + +func TestGoferExits(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + sandboxProc, err := os.FindProcess(c.Sandbox.Pid) + if err != nil { + t.Fatalf("error finding sandbox process: %v", err) + } + gofer, err := os.FindProcess(c.GoferPid) + if err != nil { + t.Fatalf("error finding sandbox process: %v", err) + } + + // Kill sandbox and expect gofer to exit on its own. + if err := sandboxProc.Kill(); err != nil { + t.Fatalf("error killing sandbox process: %v", err) + } + if _, err := sandboxProc.Wait(); err != nil { + t.Fatalf("error waiting for sandbox process: %v", err) + } + + if _, err := gofer.Wait(); err != nil { + t.Fatalf("error waiting for gofer process: %v", err) + } + if c.IsRunning() { + t.Errorf("container shouldn't be running, container: %+v", c) + } +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index cdacc5e22..d26a4dac6 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", @@ -29,17 +29,3 @@ go_library( "@org_golang_x_sys//unix:go_default_library", ], ) - -go_test( - name = "sandbox_test", - size = "small", - srcs = ["sandbox_test.go"], - data = [ - "//runsc", - ], - embed = [":sandbox"], - deps = [ - "//pkg/log", - "//runsc/test/testutil", - ], -) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index a10b79856..e5d1f791d 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -40,46 +40,45 @@ import ( // It is used to start/stop sandbox process (and associated processes like // gofers), as well as for running and manipulating containers inside a running // sandbox. +// +// Note: Sandbox must be immutable because a copy of it is saved for each +// container and changes would not be synchronized to all of them. type Sandbox struct { - // ID is the id of the sandbox. By convention, this is the same ID as - // the first container run in the sandbox. + // ID is the id of the sandbox (immutable). By convention, this is the same + // ID as the first container run in the sandbox. ID string `json:"id"` - // Pid is the pid of the running sandbox. May be 0 is the sandbox is - // not running. + // Pid is the pid of the running sandbox (immutable). May be 0 is the sandbox + // is not running. Pid int `json:"pid"` - - // GoferPid is the pid of the gofer running along side the sandbox. May - // be 0 if the gofer has been killed or it's not being used. - GoferPid int `json:"goferPid"` } // Create creates the sandbox process. 
-func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, int, error) { s := &Sandbox{ID: id} binPath, err := specutils.BinPath() if err != nil { - return nil, err + return nil, 0, err } // Create the gofer process. - ioFiles, err := s.createGoferProcess(spec, conf, bundleDir, binPath) + goferPid, ioFiles, err := s.createGoferProcess(spec, conf, bundleDir, binPath) if err != nil { - return nil, err + return nil, 0, err } // Create the sandbox process. if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles); err != nil { - return nil, err + return nil, 0, err } // Wait for the control server to come up (or timeout). if err := s.waitForCreated(10 * time.Second); err != nil { - return nil, err + return nil, 0, err } - return s, nil + return s, goferPid, nil } // StartRoot starts running the root container process inside the sandbox. @@ -288,10 +287,10 @@ func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } -func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) ([]*os.File, error) { +func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) (int, []*os.File, error) { if conf.FileAccess == boot.FileAccessDirect { // Don't start a gofer. The sandbox will access host FS directly. - return nil, nil + return 0, nil, nil } // Start with the general config flags. @@ -315,7 +314,7 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle for nextFD = 3; nextFD-3 < mountCount; nextFD++ { sandEnd, goferEnd, err := createSocketPair() if err != nil { - return nil, err + return 0, nil, err } defer goferEnd.Close() sandEnds = append(sandEnds, sandEnd) @@ -327,7 +326,7 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle addr := fsgofer.ControlSocketAddr(s.ID) serverFD, err := server.CreateSocket(addr) if err != nil { - return nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + return 0, nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } // Add the control server fd. @@ -349,11 +348,10 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) if err := startInNS(cmd, nss); err != nil { - return nil, err + return 0, nil, err } - s.GoferPid = cmd.Process.Pid log.Infof("Gofer started, pid: %d", cmd.Process.Pid) - return sandEnds, nil + return cmd.Process.Pid, sandEnds, nil } // createSocketPair creates a pair of files wrapping a socket pair. @@ -562,24 +560,9 @@ func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { PID: pid, CID: cid, } - if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("error waiting on PID %d in sandbox %q: %v", pid, s.ID, err) } - - if s.IsRootContainer(cid) { - // If waiting for the root, give some time for the sandbox process to exit - // to prevent races with resources that might still be in use. 
- timeout := time.Now().Add(time.Second) - log.Debugf("Waiting for the sandbox process to exit") - for s.IsRunning() { - if time.Now().After(timeout) { - log.Debugf("Timeout waiting for sandbox process to exit") - break - } - time.Sleep(100 * time.Millisecond) - } - } return ws, nil } @@ -602,15 +585,8 @@ func (s *Sandbox) Destroy() error { if s.Pid != 0 { // TODO: Too harsh? log.Debugf("Killing sandbox %q", s.ID) - killProcess(s.Pid, unix.SIGKILL) - s.Pid = 0 + signalProcess(s.Pid, unix.SIGKILL) } - if s.GoferPid != 0 { - log.Debugf("Killing gofer for sandbox %q", s.ID) - killProcess(s.GoferPid, unix.SIGKILL) - s.GoferPid = 0 - } - return nil } @@ -689,19 +665,8 @@ func (s *Sandbox) Resume(cid string) error { func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { // Send a signal 0 to the sandbox process. - if err := killProcess(s.Pid, 0); err == nil { - return true - } - } - if s.GoferPid != 0 { - // Send a signal 0 to the gofer process. - if err := killProcess(s.GoferPid, 0); err == nil { - log.Warningf("Found orphan gofer process, pid: %d", s.GoferPid) - // Attempt to kill gofer if it's orphan. - killProcess(s.GoferPid, unix.SIGKILL) - - // Don't wait for gofer to die. Return 'running' and hope gofer is dead - // next time around. + if err := signalProcess(s.Pid, 0); err == nil { + // Succeeded, process is running. return true } } @@ -724,10 +689,10 @@ func (s *Sandbox) Stacks() (string, error) { return stacks, nil } -// killProcess sends a signal to the host process (i.e. a sandbox or gofer +// signalProcess sends a signal to the host process (i.e. a sandbox or gofer // process). Sandbox.Signal should be used to send a signal to a process // running inside the sandbox. -func killProcess(pid int, sig syscall.Signal) error { +func signalProcess(pid int, sig syscall.Signal) error { if err := syscall.Kill(pid, sig); err != nil { return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) } diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go deleted file mode 100644 index 40337bc53..000000000 --- a/runsc/sandbox/sandbox_test.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sandbox - -import ( - "os" - "testing" - - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/runsc/test/testutil" -) - -func init() { - log.SetLevel(log.Debug) - if err := testutil.ConfigureExePath(); err != nil { - panic(err.Error()) - } -} - -func TestGoferExits(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") - conf := testutil.TestConfig() - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create, start and wait for the container. 
- s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.StartRoot(spec, conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - sandboxProc, err := os.FindProcess(s.Pid) - if err != nil { - t.Fatalf("error finding sandbox process: %v", err) - } - gofer, err := os.FindProcess(s.GoferPid) - if err != nil { - t.Fatalf("error finding sandbox process: %v", err) - } - - // Kill sandbox and expect gofer to exit on its own. - if err := sandboxProc.Kill(); err != nil { - t.Fatalf("error killing sandbox process: %v", err) - } - if _, err := sandboxProc.Wait(); err != nil { - t.Fatalf("error waiting for sandbox process: %v", err) - } - - if _, err := gofer.Wait(); err != nil { - t.Fatalf("error waiting for gofer process: %v", err) - } - if s.IsRunning() { - t.Errorf("Sandbox shouldn't be running, sandbox: %+v", s) - } -} -- cgit v1.2.3 From a854678bc36065379ca0b988410b4a8318747a3d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 21 Aug 2018 14:01:14 -0700 Subject: Move container_test to the container package PiperOrigin-RevId: 209655274 Change-Id: Id381114bdb3197c73e14f74b3f6cf1afd87d60cb --- runsc/container/BUILD | 2 +- runsc/container/container_test.go | 89 +++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 46 deletions(-) (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 1171355c8..e40ca4709 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -39,6 +39,7 @@ go_test( ":uds_test_app", "//runsc", ], + embed = [":container"], tags = [ "requires-kvm", ], @@ -49,7 +50,6 @@ go_test( "//pkg/sentry/kernel/auth", "//pkg/unet", "//runsc/boot", - "//runsc/container", "//runsc/specutils", "//runsc/test/testutil", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index dadf8445b..f9f7d75ea 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package container_test +package container import ( "bytes" @@ -38,7 +38,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -51,7 +50,7 @@ func init() { } // waitForProcessList waits for the given process list to show up in the container. -func waitForProcessList(s *container.Container, expected []*control.Process) error { +func waitForProcessList(s *Container, expected []*control.Process) error { var got []*control.Process for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { var err error @@ -90,7 +89,7 @@ func procListsEqual(got, want []*control.Process) bool { // getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the // test for equality. This is because we already confirmed that exec occurred. 
-func getAndCheckProcLists(cont *container.Container, want []*control.Process) error { +func getAndCheckProcLists(cont *Container, want []*control.Process) error { got, err := cont.Processes() if err != nil { return fmt.Errorf("error getting process data from container: %v", err) @@ -188,7 +187,7 @@ func run(spec *specs.Spec, conf *boot.Config) error { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { return fmt.Errorf("error creating container: %v", err) } @@ -276,21 +275,21 @@ func TestLifecycle(t *testing.T) { } // Create the container. id := testutil.UniqueContainerID() - if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { + if _, err := Create(id, spec, conf, bundleDir, "", ""); err != nil { t.Fatalf("error creating container: %v", err) } // Load the container from disk and check the status. - s, err := container.Load(rootDir, id) + s, err := Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } - if got, want := s.Status, container.Created; got != want { + if got, want := s.Status, Created; got != want { t.Errorf("container status got %v, want %v", got, want) } // List should return the container id. - ids, err := container.List(rootDir) + ids, err := List(rootDir) if err != nil { t.Fatalf("error listing containers: %v", err) } @@ -303,11 +302,11 @@ func TestLifecycle(t *testing.T) { t.Fatalf("error starting container: %v", err) } // Load the container from disk and check the status. - s, err = container.Load(rootDir, id) + s, err = Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } - if got, want := s.Status, container.Running; got != want { + if got, want := s.Status, Running; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -354,11 +353,11 @@ func TestLifecycle(t *testing.T) { g.Wait() // Load the container from disk and check the status. - s, err = container.Load(rootDir, id) + s, err = Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } - if got, want := s.Status, container.Stopped; got != want { + if got, want := s.Status, Stopped; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -368,7 +367,7 @@ func TestLifecycle(t *testing.T) { } // List should not return the container id. - ids, err = container.List(rootDir) + ids, err = List(rootDir) if err != nil { t.Fatalf("error listing containers: %v", err) } @@ -377,7 +376,7 @@ func TestLifecycle(t *testing.T) { } // Loading the container by id should fail. 
- if _, err = container.Load(rootDir, id); err == nil { + if _, err = Load(rootDir, id); err == nil { t.Errorf("expected loading destroyed container to fail, but it did not") } } @@ -404,7 +403,7 @@ func TestExePath(t *testing.T) { t.Fatalf("exec: %s, error setting up container: %v", test.path, err) } - ws, err := container.Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") os.RemoveAll(rootDir) os.RemoveAll(bundleDir) @@ -437,7 +436,7 @@ func TestAppExitStatus(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - ws, err := container.Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir, "", "") + ws, err := Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error running container: %v", err) } @@ -456,7 +455,7 @@ func TestAppExitStatus(t *testing.T) { defer os.RemoveAll(rootDir2) defer os.RemoveAll(bundleDir2) - ws, err = container.Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir2, "", "") + ws, err = Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir2, "", "") if err != nil { t.Fatalf("error running container: %v", err) } @@ -481,7 +480,7 @@ func TestExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -589,7 +588,7 @@ func TestCheckpointRestore(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -635,7 +634,7 @@ func TestCheckpointRestore(t *testing.T) { defer outputFile2.Close() // Restore into a new container. - cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont2, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -674,7 +673,7 @@ func TestCheckpointRestore(t *testing.T) { defer outputFile3.Close() // Restore into a new container. - cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont3, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -753,7 +752,7 @@ func TestUnixDomainSockets(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -800,7 +799,7 @@ func TestUnixDomainSockets(t *testing.T) { defer outputFile2.Close() // Restore into a new container. - contRestore, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + contRestore, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -854,7 +853,7 @@ func TestPauseResume(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. 
- cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -902,7 +901,7 @@ func TestPauseResume(t *testing.T) { if err := cont.Pause(); err != nil { t.Errorf("error pausing container: %v", err) } - if got, want := cont.Status, container.Paused; got != want { + if got, want := cont.Status, Paused; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -922,7 +921,7 @@ func TestPauseResume(t *testing.T) { if err := cont.Resume(); err != nil { t.Errorf("error pausing container: %v", err) } - if got, want := cont.Status, container.Running; got != want { + if got, want := cont.Status, Running; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -957,7 +956,7 @@ func TestPauseResumeStatus(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -970,7 +969,7 @@ func TestPauseResumeStatus(t *testing.T) { if err := cont.Pause(); err != nil { t.Errorf("error pausing container: %v", err) } - if got, want := cont.Status, container.Paused; got != want { + if got, want := cont.Status, Paused; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -978,7 +977,7 @@ func TestPauseResumeStatus(t *testing.T) { if err := cont.Pause(); err == nil { t.Errorf("error pausing container that was already paused: %v", err) } - if got, want := cont.Status, container.Paused; got != want { + if got, want := cont.Status, Paused; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -986,7 +985,7 @@ func TestPauseResumeStatus(t *testing.T) { if err := cont.Resume(); err != nil { t.Errorf("error resuming container: %v", err) } - if got, want := cont.Status, container.Running; got != want { + if got, want := cont.Status, Running; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -994,7 +993,7 @@ func TestPauseResumeStatus(t *testing.T) { if err := cont.Resume(); err == nil { t.Errorf("error resuming container already running: %v", err) } - if got, want := cont.Status, container.Running; got != want { + if got, want := cont.Status, Running; got != want { t.Errorf("container status got %v, want %v", got, want) } } @@ -1021,7 +1020,7 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1123,7 +1122,7 @@ func TestConsoleSocket(t *testing.T) { // Create the container and pass the socket name. id := testutil.UniqueContainerID() - s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") + s, err := Create(id, spec, conf, bundleDir, socketRelPath, "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1249,7 +1248,7 @@ func TestReadonlyRoot(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. 
- s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1292,7 +1291,7 @@ func TestReadonlyMount(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1334,7 +1333,7 @@ func TestAbbreviatedIDs(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := container.Create(cid, spec, conf, bundleDir, "", "") + cont, err := Create(cid, spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1351,7 +1350,7 @@ func TestAbbreviatedIDs(t *testing.T) { cids[2]: cids[2], } for shortid, longid := range unambiguous { - if _, err := container.Load(rootDir, shortid); err != nil { + if _, err := Load(rootDir, shortid); err != nil { t.Errorf("%q should resolve to %q: %v", shortid, longid, err) } } @@ -1362,7 +1361,7 @@ func TestAbbreviatedIDs(t *testing.T) { "ba", } for _, shortid := range ambiguous { - if s, err := container.Load(rootDir, shortid); err == nil { + if s, err := Load(rootDir, shortid); err == nil { t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID) } } @@ -1398,7 +1397,7 @@ func TestMultiContainerSanity(t *testing.T) { defer os.RemoveAll(rootDir) // Setup the containers. - containers := make([]*container.Container, 0, len(containerIDs)) + containers := make([]*Container, 0, len(containerIDs)) for i, annotations := range containerAnnotations { spec := testutil.NewSpecWithArgs("sleep", "100") spec.Annotations = annotations @@ -1407,7 +1406,7 @@ func TestMultiContainerSanity(t *testing.T) { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(bundleDir) - cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1475,7 +1474,7 @@ func TestMultiContainerWait(t *testing.T) { defer os.RemoveAll(rootDir) // Setup the containers. - containers := make([]*container.Container, 0, len(containerIDs)) + containers := make([]*Container, 0, len(containerIDs)) for i, annotations := range containerAnnotations { spec := testutil.NewSpecWithArgs(args[i][0], args[i][1]) spec.Annotations = annotations @@ -1485,7 +1484,7 @@ func TestMultiContainerWait(t *testing.T) { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(bundleDir) - cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1591,7 +1590,7 @@ func TestContainerVolumeContentsShared(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1713,7 +1712,7 @@ func TestGoferExits(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. 
- c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } -- cgit v1.2.3 From 19ef2ad1fe82093548edbb00de536d4bcf328f2b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 21 Aug 2018 14:34:00 -0700 Subject: nonExclusiveFS is causing timeout with --race Not sure why, just removed for now to unblock the tests. PiperOrigin-RevId: 209661403 Change-Id: I72785c071687d54e22bda9073d36b447d52a7018 --- runsc/container/container_test.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index f9f7d75ea..4d44e7abe 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -213,8 +213,10 @@ const ( nonExclusiveFS ) -var all = []configOption{overlay, kvm, nonExclusiveFS} -var noOverlay = []configOption{kvm, nonExclusiveFS} +// TODO: nonExclusiveFS was removed because it causes timeout +// with --race. Put it back when bug is fixed. +var all = []configOption{overlay, kvm} +var noOverlay = []configOption{kvm} // configs generates different configurations to run tests. func configs(opts ...configOption) []*boot.Config { @@ -557,10 +559,7 @@ func TestExec(t *testing.T) { // be the next consecutive number after the last number from the checkpointed container. func TestCheckpointRestore(t *testing.T) { // Skip overlay because test requires writing to host file. - // - // TODO: Skip nonExclusiveFS because $TEST_TMPDIR mount is - // mistakenly marked as RO after revalidation. - for _, conf := range configs(kvm) { + for _, conf := range configs(noOverlay...) { t.Logf("Running test with conf: %+v", conf) dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test") @@ -711,10 +710,7 @@ func TestUnixDomainSockets(t *testing.T) { ) // Skip overlay because test requires writing to host file. - // - // TODO: Skip nonExclusiveFS because $TEST_TMPDIR mount is - // mistakenly marked as RO after revalidation. - for _, conf := range configs(kvm) { + for _, conf := range configs(noOverlay...) { t.Logf("Running test with conf: %+v", conf) dir, err := ioutil.TempDir(testutil.TmpDir(), "uds-test") -- cgit v1.2.3 From ae68e9e7513083411875110bd31bd89ac3a58cb7 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 21 Aug 2018 16:19:59 -0700 Subject: Temporarily skip multi-container tests in container_test until deflaked. PiperOrigin-RevId: 209679235 Change-Id: I527e779eeb113d0c162f5e27a2841b9486f0e39f --- runsc/container/container_test.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 4d44e7abe..7f2bac4b8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1366,6 +1366,7 @@ func TestAbbreviatedIDs(t *testing.T) { // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { + t.Skip("Test is flakey.") // TODO: Remove. for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) @@ -1438,6 +1439,7 @@ func TestMultiContainerSanity(t *testing.T) { } func TestMultiContainerWait(t *testing.T) { + t.Skip("Test is flakey.") // TODO: Remove. 
containerIDs := []string{ testutil.UniqueContainerID(), testutil.UniqueContainerID(), -- cgit v1.2.3 From e2ab7ec39e500627126fe8be8e37400711410cde Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 21 Aug 2018 23:06:11 -0700 Subject: Fix TestUnixDomainSockets failure when path is too large UDS has a lower size limit than regular files. When running under bazel this limit is exceeded. Test was changed to always mount /tmp and use it for the test. PiperOrigin-RevId: 209717830 Change-Id: I1dbe19fe2051ffdddbaa32b188a9167f446ed193 --- runsc/container/container_test.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 7f2bac4b8..d847dca97 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -704,22 +704,20 @@ func TestCheckpointRestore(t *testing.T) { // TestUnixDomainSockets checks that Checkpoint/Restore works in cases // with filesystem Unix Domain Socket use. func TestUnixDomainSockets(t *testing.T) { - const ( - output = "uds_output" - socket = "uds_socket" - ) - // Skip overlay because test requires writing to host file. for _, conf := range configs(noOverlay...) { t.Logf("Running test with conf: %+v", conf) - dir, err := ioutil.TempDir(testutil.TmpDir(), "uds-test") + // UDS path is limited to 108 chars for compatibility with older systems. + // Use '/tmp' (instead of testutil.TmpDir) to to ensure the size limit is + // not exceeded. Assumes '/tmp' exists in the system. + dir, err := ioutil.TempDir("/tmp", "uds-test") if err != nil { t.Fatalf("ioutil.TempDir failed: %v", err) } defer os.RemoveAll(dir) - outputPath := filepath.Join(dir, output) + outputPath := filepath.Join(dir, "uds_output") outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) if err != nil { t.Fatalf("error creating output file: %v", err) @@ -731,7 +729,7 @@ func TestUnixDomainSockets(t *testing.T) { t.Fatal("error finding uds_test_app:", err) } - socketPath := filepath.Join(dir, socket) + socketPath := filepath.Join(dir, "uds_socket") defer os.Remove(socketPath) spec := testutil.NewSpecWithArgs(app, "--file", outputPath, "--socket", socketPath) @@ -739,6 +737,13 @@ func TestUnixDomainSockets(t *testing.T) { UID: uint32(os.Getuid()), GID: uint32(os.Getgid()), } + spec.Mounts = []specs.Mount{ + specs.Mount{ + Type: "bind", + Destination: "/tmp", + Source: "/tmp", + }, + } rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { -- cgit v1.2.3 From a78df1d874f376c0924d5a8f91e9e2b5458cca0f Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 22 Aug 2018 17:54:18 -0700 Subject: runsc: De-flakes container_test TestMultiContainerSanity. The bug was caused by os.File's finalizer, which closes the file. Because fsgofer.serve() was passed a file descriptor as an int rather than a os.File, callers would pass os.File.Fd(), and the os.File would go out of scope. Thus, the file would get GC'd and finalized nondeterministically, causing failures when the file was used. 
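The fix below passes an *os.File into fsgofer's serve() instead of a bare integer descriptor. Purely as an illustration (a self-contained sketch, not code from this patch; the paths and helper names are invented), the hazard and the remedy look roughly like this in plain Go:

package main

import (
	"fmt"
	"os"
	"runtime"
	"syscall"
	"time"
)

// leakFD opens a file and returns only the integer descriptor. Once it
// returns, the owning *os.File is unreachable, so the garbage collector may
// run its finalizer at any time, and the finalizer closes the descriptor.
func leakFD(path string) (int, error) {
	f, err := os.Open(path)
	if err != nil {
		return -1, err
	}
	return int(f.Fd()), nil
}

// useFile takes the *os.File itself, so the descriptor stays open for as
// long as the caller keeps the reference alive.
func useFile(f *os.File) error {
	var buf [1]byte
	_, err := f.Read(buf[:])
	return err
}

func main() {
	fd, err := leakFD("/dev/zero")
	if err != nil {
		panic(err)
	}
	runtime.GC()
	time.Sleep(10 * time.Millisecond) // give the finalizer a chance to run
	var buf [1]byte
	if _, err := syscall.Read(fd, buf[:]); err != nil {
		fmt.Println("raw fd read failed:", err) // typically EBADF, but nondeterministic
	}

	f, err := os.Open("/dev/zero")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	fmt.Println("os.File read error:", useFile(f)) // nil
}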
PiperOrigin-RevId: 209861834 Change-Id: Idf24d5c1f04c9b28659e62c97202ab3b4d72e994 --- runsc/container/container_test.go | 1 - runsc/fsgofer/control.go | 19 ++++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index d847dca97..6d84700ce 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1371,7 +1371,6 @@ func TestAbbreviatedIDs(t *testing.T) { // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { - t.Skip("Test is flakey.") // TODO: Remove. for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go index 8ce8ee8a0..8cb2f67ac 100644 --- a/runsc/fsgofer/control.go +++ b/runsc/fsgofer/control.go @@ -16,6 +16,7 @@ package fsgofer import ( "fmt" + "os" "path/filepath" "sync" @@ -67,13 +68,13 @@ func (cr *Controller) Wait() { } // Serve starts serving each Attacher in ats via its corresponding file -// descriptor in ioFDs. +// descriptor in ioFDs. This takes ownership of the FDs in ioFDs. func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { if len(ats) != len(ioFDs) { return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) } for i, _ := range ats { - cr.api.serve(ats[i], ioFDs[i]) + cr.api.serve(ats[i], os.NewFile(uintptr(ioFDs[i]), "io fd")) } return nil } @@ -181,23 +182,23 @@ func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { ROMount: req.IsReadOnly, LazyOpenForWrite: true, }) - api.serve(at, int(req.FilePayload.Files[0].Fd())) + api.serve(at, req.FilePayload.Files[0]) return nil } // serve begins serving a directory via a file descriptor. -func (api *api) serve(at p9.Attacher, ioFD int) { +func (api *api) serve(at p9.Attacher, ioFile *os.File) { api.p9wg.Add(1) - go func(ioFD int, at p9.Attacher) { - socket, err := unet.NewSocket(ioFD) + go func() { + socket, err := unet.NewSocket(int(ioFile.Fd())) if err != nil { - panic(fmt.Sprintf("err creating server on FD %d: %v", ioFD, err)) + panic(fmt.Sprintf("err creating server on FD %d: %v", ioFile.Fd(), err)) } s := p9.NewServer(at) if err := s.Handle(socket); err != nil { - panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err)) + panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFile.Fd(), err)) } api.p9wg.Done() - }(ioFD, at) + }() } -- cgit v1.2.3 From 001a4c2493b13a43d62c7511fb509a959ae4abc2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 23 Aug 2018 11:14:02 -0700 Subject: Clean up syscall filters Removed syscalls that are only used by whitelistfs which has its own set of filters. 
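The allowedSyscalls table trimmed below is a seccomp.SyscallRules value: a map keyed by syscall number whose entries either allow a call unconditionally (an empty rule list) or constrain its arguments. The following is only a hedged sketch that reuses identifiers visible in this diff; it is not code from the patch, and the exact package semantics are assumed rather than verified here:

package filter

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/seccomp"
)

// exampleRules allows read and write unconditionally and restricts fcntl to
// F_GETFL, purely to illustrate the shape of an argument-matching rule.
var exampleRules = seccomp.SyscallRules{
	syscall.SYS_READ:  {},
	syscall.SYS_WRITE: {},
	syscall.SYS_FCNTL: []seccomp.Rule{
		{
			seccomp.AllowAny{},                  // fd: any value
			seccomp.AllowValue(syscall.F_GETFL), // cmd: F_GETFL only
		},
	},
}

// withExtra folds an optional rule set (for example, the whitelistFS or
// host-network filters) into the base whitelist before installation.
func withExtra(extra seccomp.SyscallRules) seccomp.SyscallRules {
	s := exampleRules
	s.Merge(extra)
	return s
}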
PiperOrigin-RevId: 209967259 Change-Id: Idb2e1b9d0201043d7cd25d96894f354729dbd089 --- runsc/boot/filter/config.go | 5 ----- 1 file changed, 5 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 0ce49b3b2..e45e599c3 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -38,7 +38,6 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_EXIT: {}, syscall.SYS_EXIT_GROUP: {}, syscall.SYS_FALLOCATE: {}, - syscall.SYS_FCHMOD: {}, syscall.SYS_FCNTL: {}, syscall.SYS_FSTAT: {}, syscall.SYS_FSYNC: {}, @@ -60,15 +59,12 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MMAP: {}, syscall.SYS_MPROTECT: {}, syscall.SYS_MUNMAP: {}, - syscall.SYS_NEWFSTATAT: {}, syscall.SYS_POLL: {}, syscall.SYS_PREAD64: {}, syscall.SYS_PWRITE64: {}, syscall.SYS_READ: {}, - syscall.SYS_READLINKAT: {}, syscall.SYS_READV: {}, syscall.SYS_RECVMSG: {}, - syscall.SYS_RENAMEAT: {}, syscall.SYS_RESTART_SYSCALL: {}, syscall.SYS_RT_SIGACTION: {}, syscall.SYS_RT_SIGPROCMASK: {}, @@ -80,7 +76,6 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SIGALTSTACK: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TGKILL: {}, - syscall.SYS_UTIMENSAT: {}, syscall.SYS_WRITE: {}, syscall.SYS_WRITEV: {}, } -- cgit v1.2.3 From a81a4402a265aec6715172cd3502ee7eebbf64aa Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 24 Aug 2018 10:16:38 -0700 Subject: Add option to panic gofer if writes are attempted over RO mounts This is used when '--overlay=true' to guarantee writes are not sent to gofer. PiperOrigin-RevId: 210116288 Change-Id: I7616008c4c0e8d3668e07a205207f46e2144bf30 --- runsc/cmd/gofer.go | 7 ++++++- runsc/fsgofer/fsgofer.go | 24 ++++++++++++++++++++++++ runsc/fsgofer/fsgofer_test.go | 25 +++++++++++++++++++++++++ runsc/sandbox/sandbox.go | 4 ++++ 4 files changed, 59 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index ed4b1d29c..e23f64d12 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -38,6 +38,8 @@ type Gofer struct { // controllerFD is the file descriptor of a stream socket for the // control server that is donated to this process. controllerFD int + + panicOnWrite bool } // Name implements subcommands.Command. @@ -61,6 +63,7 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") + f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") } // Execute implements subcommands.Command. @@ -110,7 +113,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) p := absPath(g.bundleDir, spec.Root.Path) ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ - ROMount: spec.Root.Readonly, + ROMount: spec.Root.Readonly, + PanicOnWrite: g.panicOnWrite, // Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when // each file is opened as writable. Thus, we open files lazily to avoid copy-up. 
LazyOpenForWrite: true, @@ -123,6 +127,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) p = absPath(g.bundleDir, m.Source) ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ ROMount: isReadonlyMount(m.Options), + PanicOnWrite: g.panicOnWrite, LazyOpenForWrite: false, })) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 38263896a..1316dc618 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -76,6 +76,9 @@ type Config struct { // ROMount is set to true if this is a readonly mount. ROMount bool + // PanicOnWrite panics on attempts to write to RO mounts. + PanicOnWrite bool + // LazyOpenForWrite makes the underlying file to be opened in RDONLY // mode initially and be reopened in case write access is desired. // This is done to workaround the behavior in 'overlay2' that @@ -375,6 +378,9 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Create implements p9.File. func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { if l.conf.ROMount { + if l.conf.PanicOnWrite { + panic("attempt to write to RO mount") + } return nil, nil, p9.QID{}, 0, syscall.EBADF } if !isNameValid(name) { @@ -429,6 +435,9 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid // Mkdir implements p9.File. func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { if l.conf.ROMount { + if l.conf.PanicOnWrite { + panic("attempt to write to RO mount") + } return p9.QID{}, syscall.EBADF } @@ -585,6 +594,9 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) // an error happens. func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { if l.conf.ROMount { + if l.conf.PanicOnWrite { + panic("attempt to write to RO mount") + } return syscall.EBADF } @@ -722,6 +734,9 @@ func (*localFile) Remove() error { // Rename implements p9.File. func (l *localFile) Rename(directory p9.File, name string) error { if l.conf.ROMount { + if l.conf.PanicOnWrite { + panic("attempt to write to RO mount") + } return syscall.EBADF } if !isNameValid(name) { @@ -789,6 +804,9 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { // Symlink implements p9.File. func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { if l.conf.ROMount { + if l.conf.PanicOnWrite { + panic("attempt to write to RO mount") + } return p9.QID{}, syscall.EBADF } if !isNameValid(newName) { @@ -819,6 +837,9 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. // Link implements p9.File. func (l *localFile) Link(target p9.File, newName string) error { if l.conf.ROMount { + if l.conf.PanicOnWrite { + panic("attempt to write to RO mount") + } return syscall.EBADF } if !isNameValid(newName) { @@ -842,6 +863,9 @@ func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ // UnlinkAt implements p9.File. func (l *localFile) UnlinkAt(name string, flags uint32) error { if l.conf.ROMount { + if l.conf.PanicOnWrite { + panic("attempt to write to RO mount") + } return syscall.EBADF } if !isNameValid(name) { diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 8d038eaf6..fcece4e83 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -34,6 +34,15 @@ func init() { allConfs = append(allConfs, roConfs...) 
} +func assertPanic(t *testing.T, f func()) { + defer func() { + if r := recover(); r == nil { + t.Errorf("function did not panic") + } + }() + f() +} + var ( allTypes = []fileType{regular, directory, symlink} @@ -434,6 +443,22 @@ func TestROMountChecks(t *testing.T) { }) } +func TestROMountPanics(t *testing.T) { + conf := Config{ROMount: true, PanicOnWrite: true} + runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) { + assertPanic(t, func() { s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.Rename(s.file, "..") }) + assertPanic(t, func() { s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.UnlinkAt("..", 0) }) + assertPanic(t, func() { s.file.Link(s.file, "..") }) + + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: 0} + assertPanic(t, func() { s.file.SetAttr(valid, attr) }) + }) +} + func TestInvalidName(t *testing.T) { runCustom(t, []fileType{regular}, rwConfs, func(t *testing.T, s state) { if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e5d1f791d..7789608f8 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -345,6 +345,10 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle setUIDGIDMappings(cmd, spec) nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + if conf.Overlay { + args = append(args, "--panic-on-write=true") + } + // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) if err := startInNS(cmd, nss); err != nil { -- cgit v1.2.3 From 02dfceab6d4c4a2a3342ef69be0265b7ab03e5d7 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 24 Aug 2018 14:41:38 -0700 Subject: runsc: Allow runsc to properly search the PATH for executable name. Previously, runsc improperly attempted to find an executable in the container's PATH. We now search the PATH via the container's fsgofer rather than the host FS, eliminating the confusing differences between paths on the host and within a container. PiperOrigin-RevId: 210159488 Change-Id: I228174dbebc4c5356599036d6efaa59f28ff28d2 --- runsc/boot/fs.go | 45 ++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/loader.go | 4 ++++ runsc/specutils/specutils.go | 25 ++++++++++++++---------- 3 files changed, 64 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 8996b1398..6f5379a6d 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,6 +16,7 @@ package boot import ( "fmt" + "path" "path/filepath" "strconv" "strings" @@ -701,3 +702,47 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe procArgs.Root = containerRootDirent return nil } + +// GetExecutablePathInternal traverses the *container's* filesystem to resolve +// exec's absolute path. For example, if the container is being served files by +// the fsgofer serving /foo/bar as the container root, it will search within +// /foo/bar, not the host root. +// TODO: Unit test this. +func GetExecutablePathInternal(ctx context.Context, procArgs *kernel.CreateProcessArgs) (string, error) { + exec := filepath.Clean(procArgs.Filename) + + // Don't search PATH if exec is a path to a file (absolute or relative). 
+ if strings.IndexByte(exec, '/') >= 0 { + return exec, nil + } + + // Search the PATH for a file whose name matches the one we are looking + // for. + pathDirs := specutils.GetPath(procArgs.Envv) + for _, p := range pathDirs { + // Walk to the end of the path. + curDir := procArgs.Root + for _, pc := range strings.Split(p, "/") { + var err error + if curDir, err = curDir.Walk(ctx, curDir, pc); err != nil { + break + } + } + if curDir == nil { + continue + } + // Check for the executable in the path directory. + dirent, err := curDir.Walk(ctx, curDir, exec) + if err != nil { + continue + } + // Check whether we can read and execute the file in question. + if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil { + log.Infof("Found executable at %q, but user cannot execute it: %v", path.Join(p, exec), err) + continue + } + return path.Join("/", p, exec), nil + } + + return "", fmt.Errorf("could not find executable %s in path %v", exec, pathDirs) +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 7debf0ac2..2f212c704 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -428,6 +428,10 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return 0, fmt.Errorf("failed to create new process: %v", err) } + if procArgs.Filename, err = GetExecutablePathInternal(procArgs.NewContext(k), &procArgs); err != nil { + return 0, err + } + tg, err := l.k.CreateProcess(procArgs) if err != nil { return 0, fmt.Errorf("failed to create process in sentry: %v", err) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 27441cbde..5fb53edb2 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -126,6 +126,8 @@ func ReadSpec(bundleDir string) (*specs.Spec, error) { // GetExecutablePath returns the absolute path to the executable, relative to // the root. It searches the environment PATH for the first file that exists // with the given name. +// TODO: Remove this in favor of finding executables via +// boot.GetExecutablePathInternal. func GetExecutablePath(exec, root string, env []string) (string, error) { exec = filepath.Clean(exec) @@ -134,18 +136,9 @@ func GetExecutablePath(exec, root string, env []string) (string, error) { return exec, nil } - // Get the PATH from the environment. - const prefix = "PATH=" - var path []string - for _, e := range env { - if strings.HasPrefix(e, prefix) { - path = strings.Split(strings.TrimPrefix(e, prefix), ":") - break - } - } - // Search the PATH for a file whose name matches the one we are looking // for. + path := GetPath(env) for _, p := range path { abs := filepath.Join(root, p, exec) // Do not follow symlink link because the target is in the container @@ -161,6 +154,18 @@ func GetExecutablePath(exec, root string, env []string) (string, error) { return exec, nil } +// GetPath returns the PATH as a slice of strings given the environemnt +// variables. +func GetPath(env []string) []string { + const prefix = "PATH=" + for _, e := range env { + if strings.HasPrefix(e, prefix) { + return strings.Split(strings.TrimPrefix(e, prefix), ":") + } + } + return nil +} + // Capabilities takes in spec and returns a TaskCapabilities corresponding to // the spec. func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { -- cgit v1.2.3 From 106de2182d34197d76fb68863cd4a102ebac2dbb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 24 Aug 2018 17:42:30 -0700 Subject: runsc: Terminal support for "docker exec -ti". 
This CL adds terminal support for "docker exec". We previously only supported consoles for the container process, but not exec processes. The SYS_IOCTL syscall was added to the default seccomp filter list, but only for ioctls that get/set winsize and termios structs. We need to allow these ioctl for all containers because it's possible to run "exec -ti" on a container that was started without an attached console, after the filters have been installed. Note that control-character signals are still not properly supported. Tested with: $ docker run --runtime=runsc -it alpine In another terminial: $ docker exec -it /bin/sh PiperOrigin-RevId: 210185456 Change-Id: I6d2401e53a7697bb988c120a8961505c335f96d9 --- pkg/abi/linux/ioctl.go | 6 +++- pkg/abi/linux/tty.go | 8 +++++ pkg/sentry/control/proc.go | 17 +++++------ pkg/sentry/fs/host/BUILD | 1 - pkg/sentry/fs/host/file.go | 19 ++++++------ pkg/sentry/fs/host/ioctl_unsafe.go | 19 +++++++++++- runsc/boot/controller.go | 2 +- runsc/boot/filter/BUILD | 1 + runsc/boot/filter/config.go | 38 +++++++++++++++++++----- runsc/boot/filter/filter.go | 6 +--- runsc/boot/loader.go | 2 +- runsc/cmd/BUILD | 1 + runsc/cmd/exec.go | 39 ++++++++++++++++++++++-- runsc/console/BUILD | 16 ++++++++++ runsc/console/console.go | 61 ++++++++++++++++++++++++++++++++++++++ runsc/sandbox/BUILD | 3 +- runsc/sandbox/console.go | 60 ------------------------------------- runsc/sandbox/sandbox.go | 20 +++++++------ 18 files changed, 207 insertions(+), 112 deletions(-) create mode 100644 runsc/console/BUILD create mode 100644 runsc/console/console.go delete mode 100644 runsc/sandbox/console.go (limited to 'runsc') diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 3ef046562..4d7a2dfd7 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -21,8 +21,12 @@ const ( TCGETS = 0x00005401 TCSETS = 0x00005402 TCSETSW = 0x00005403 - TIOCINQ = 0x0000541b + TIOCGPGRP = 0x0000540f + TIOCSPGRP = 0x00005410 TIOCOUTQ = 0x00005411 + TIOCGWINSZ = 0x00005413 + TIOCSWINSZ = 0x00005414 + TIOCINQ = 0x0000541b FIONREAD = TIOCINQ FIONBIO = 0x00005421 TIOCGPTN = 0x80045430 diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index 8c611d22a..81156867c 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -26,6 +26,14 @@ const ( disabledChar = 0 ) +// Winsize is struct winsize, defined in uapi/asm-generic/termios.h. +type Winsize struct { + Row uint16 + Col uint16 + Xpixel uint16 + Ypixel uint16 +} + // Termios is struct termios, defined in uapi/asm-generic/termbits.h. type Termios struct { InputFlags uint32 diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index d94ae560f..2493c5175 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -19,7 +19,6 @@ import ( "encoding/json" "fmt" "sort" - "syscall" "text/tabwriter" "time" @@ -73,6 +72,10 @@ type ExecArgs struct { // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities + // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host + // pty fd. + StdioIsPty bool + // FilePayload determines the files to give to the new process. urpc.FilePayload } @@ -108,17 +111,11 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { mounter := fs.FileOwnerFromContext(ctx) for appFD, f := range args.FilePayload.Files { - // Copy the underlying FD. - newFD, err := syscall.Dup(int(f.Fd())) - if err != nil { - return err - } - f.Close() + enableIoctl := args.StdioIsPty && appFD <= 2 - // Install the given file as an FD. 
- file, err := host.NewFile(ctx, newFD, mounter) + // Import the given file FD. This dups the FD as well. + file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) if err != nil { - syscall.Close(newFD) return err } defer file.DecRef() diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 29c79284a..f1252b0f2 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -48,7 +48,6 @@ go_library( "//pkg/unet", "//pkg/waiter", "//pkg/waiter/fdnotifier", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index f9bef6d93..8d2463c78 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -18,7 +18,6 @@ import ( "fmt" "syscall" - "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/log" @@ -296,7 +295,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys fd := f.iops.fileState.FD() ioctl := args[1].Uint64() switch ioctl { - case unix.TCGETS: + case linux.TCGETS: termios, err := ioctlGetTermios(fd) if err != nil { return 0, err @@ -306,7 +305,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case unix.TCSETS, unix.TCSETSW: + case linux.TCSETS, linux.TCSETSW: var termios linux.Termios if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ AddressSpaceActive: true, @@ -316,7 +315,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys err := ioctlSetTermios(fd, ioctl, &termios) return 0, err - case unix.TIOCGPGRP: + case linux.TIOCGPGRP: // Args: pid_t *argp // When successful, equivalent to *argp = tcgetpgrp(fd). // Get the process group ID of the foreground process group on @@ -332,7 +331,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case unix.TIOCSPGRP: + case linux.TIOCSPGRP: // Args: const pid_t *argp // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. @@ -343,10 +342,10 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys log.Warningf("Ignoring application ioctl(TIOCSPGRP) call") return 0, nil - case unix.TIOCGWINSZ: + case linux.TIOCGWINSZ: // Args: struct winsize *argp // Get window size. - winsize, err := unix.IoctlGetWinsize(fd, unix.TIOCGWINSZ) + winsize, err := ioctlGetWinsize(fd) if err != nil { return 0, err } @@ -355,16 +354,16 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case unix.TIOCSWINSZ: + case linux.TIOCSWINSZ: // Args: const struct winsize *argp // Set window size. 
- var winsize unix.Winsize + var winsize linux.Winsize if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { return 0, err } - err := unix.IoctlSetWinsize(fd, unix.TIOCSWINSZ, &winsize) + err := ioctlSetWinsize(fd, &winsize) return 0, err default: diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index 3c07c3850..bc965a1c2 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -23,7 +23,7 @@ import ( func ioctlGetTermios(fd int) (*linux.Termios, error) { var t linux.Termios - _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TCGETS, uintptr(unsafe.Pointer(&t))) + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) if errno != 0 { return nil, errno } @@ -37,3 +37,20 @@ func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { } return nil } + +func ioctlGetWinsize(fd int) (*linux.Winsize, error) { + var w linux.Winsize + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) + if errno != 0 { + return nil, errno + } + return &w, nil +} + +func ioctlSetWinsize(fd int, w *linux.Winsize) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) + if errno != 0 { + return errno + } + return nil +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 69e88d8e0..2d6b507b3 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -227,7 +227,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // Execute runs a command on a created or running sandbox. func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { - log.Debugf("containerManager.Execute") + log.Debugf("containerManager.Execute: %+v", *e) proc := control.Proc{Kernel: cm.l.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index c9837c236..96be051fe 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -18,6 +18,7 @@ go_library( "//runsc/boot:__subpackages__", ], deps = [ + "//pkg/abi/linux", "//pkg/log", "//pkg/seccomp", "//pkg/sentry/platform", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index e45e599c3..db2e3f9d8 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -18,6 +18,7 @@ import ( "syscall" "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/seccomp" ) @@ -78,15 +79,36 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_TGKILL: {}, syscall.SYS_WRITE: {}, syscall.SYS_WRITEV: {}, -} -// TODO: Ioctl is needed in order to support tty consoles. -// Once filters support argument-checking, we should only allow ioctl -// with tty-related arguments. -func consoleFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_IOCTL: {}, - } + // SYS_IOCTL is needed for terminal support, but we only allow + // setting/getting termios and winsize. 
+ syscall.SYS_IOCTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCGETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSW), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCSWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCGWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + }, } // whitelistFSFilters returns syscalls made by whitelistFS. Using WhitelistFS diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index 6ea9c464e..c57bbd2e5 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -28,7 +28,7 @@ import ( ) // Install installs seccomp filters for based on the given platform. -func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error { +func Install(p platform.Platform, whitelistFS, hostNetwork bool) error { s := allowedSyscalls // Set of additional filters used by -race and -msan. Returns empty @@ -39,10 +39,6 @@ func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error Report("direct file access allows unrestricted file access!") s.Merge(whitelistFSFilters()) } - if console { - Report("console is enabled: syscall filters less restrictive!") - s.Merge(consoleFilters()) - } if hostNetwork { Report("host networking enabled: syscall filters less restrictive!") s.Merge(hostInetFilters()) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 2f212c704..0e94cf215 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -340,7 +340,7 @@ func (l *Loader) run() error { } else { whitelistFS := l.conf.FileAccess == FileAccessDirect hostNet := l.conf.Network == NetworkHost - if err := filter.Install(l.k.Platform, whitelistFS, l.console, hostNet); err != nil { + if err := filter.Install(l.k.Platform, whitelistFS, hostNet); err != nil { return fmt.Errorf("Failed to install seccomp filters: %v", err) } } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index c45784749..b9ef4022f 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/urpc", "//runsc/boot", + "//runsc/console", "//runsc/container", "//runsc/fsgofer", "//runsc/specutils", diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 4ee370656..b84a80119 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -35,6 +35,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/console" "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -50,6 +51,11 @@ type Exec struct { detach bool processPath string pidFile string + + // consoleSocket is the path to an AF_UNIX socket which will receive a + // file descriptor referencing the master end of the console's + // pseudoterminal. + consoleSocket string } // Name implements subcommands.Command.Name. 
@@ -91,6 +97,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") f.StringVar(&ex.processPath, "process", "", "path to the process.json") f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") + f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") } // Execute implements subcommands.Command.Execute. It starts a process in an @@ -178,11 +185,35 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat args = append(args, a) } } - cmd := exec.Command(binPath, args...) + + // Exec stdio defaults to current process stdio. cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the tty on the sandbox process. + if ex.consoleSocket != "" { + // Create a new tty pair and send the master on the provided + // socket. + tty, err := console.NewWithSocket(ex.consoleSocket) + if err != nil { + Fatalf("error setting up console with socket %q: %v", ex.consoleSocket, err) + } + defer tty.Close() + + // Set stdio to the new tty slave. + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setsid: true, + Setctty: true, + Ctty: int(tty.Fd()), + } + } + if err := cmd.Start(); err != nil { Fatalf("failure to start child exec process, err: %v", err) } @@ -252,11 +283,12 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { return &control.ExecArgs{ Argv: argv, WorkingDirectory: ex.cwd, - FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, KUID: ex.user.kuid, KGID: ex.user.kgid, ExtraKGIDs: extraKGIDs, Capabilities: caps, + StdioIsPty: ex.consoleSocket != "", + FilePayload: urpc.FilePayload{[]*os.File{os.Stdin, os.Stdout, os.Stderr}}, }, nil } @@ -292,11 +324,12 @@ func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { Argv: p.Args, Envv: p.Env, WorkingDirectory: p.Cwd, - FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, KUID: auth.KUID(p.User.UID), KGID: auth.KGID(p.User.GID), ExtraKGIDs: extraKGIDs, Capabilities: caps, + StdioIsPty: p.Terminal, + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, }, nil } diff --git a/runsc/console/BUILD b/runsc/console/BUILD new file mode 100644 index 000000000..fa1a7d430 --- /dev/null +++ b/runsc/console/BUILD @@ -0,0 +1,16 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "console", + srcs = ["console.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/console", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "@com_github_kr_pty//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/console/console.go b/runsc/console/console.go new file mode 100644 index 000000000..2f2745b2b --- /dev/null +++ b/runsc/console/console.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package console contains utilities for working with pty consols in runsc. +package console + +import ( + "fmt" + "net" + "os" + + "github.com/kr/pty" + "golang.org/x/sys/unix" +) + +// NewWithSocket creates pty master/slave pair, sends the master FD over the given +// socket, and returns the slave. +func NewWithSocket(socketPath string) (*os.File, error) { + // Create a new pty master and slave. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, fmt.Errorf("error opening pty: %v", err) + } + defer ptyMaster.Close() + + // Get a connection to the socket path. + conn, err := net.Dial("unix", socketPath) + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) + } + uc, ok := conn.(*net.UnixConn) + if !ok { + ptySlave.Close() + return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) + } + socket, err := uc.File() + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) + } + + // Send the master FD over the connection. + msg := unix.UnixRights(int(ptyMaster.Fd())) + if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) + } + return ptySlave, nil +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index d26a4dac6..e9a39f797 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", srcs = [ - "console.go", "namespace.go", "network.go", "sandbox.go", @@ -21,9 +20,9 @@ go_library( "//pkg/sentry/control", "//pkg/urpc", "//runsc/boot", + "//runsc/console", "//runsc/fsgofer", "//runsc/specutils", - "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/sandbox/console.go b/runsc/sandbox/console.go deleted file mode 100644 index 3f133e12a..000000000 --- a/runsc/sandbox/console.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sandbox - -import ( - "fmt" - "net" - "os" - - "github.com/kr/pty" - "golang.org/x/sys/unix" -) - -// setupConsole creates pty master/slave pair, sends the master FD over the -// given socket, and returns the slave. -func setupConsole(socketPath string) (*os.File, error) { - // Create a new pty master and slave. 
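For context: console.NewWithSocket above only implements the sending side; the process that passed --console-socket is expected to receive the pty master over the same AF_UNIX socket. The following is a minimal, assumed sketch of such a receiver (not part of this change; receiveMaster is a hypothetical name), using only the standard net package and golang.org/x/sys/unix:

package main

import (
	"fmt"
	"net"
	"os"

	"golang.org/x/sys/unix"
)

// receiveMaster listens on socketPath, accepts one connection, and recovers
// the pty master FD sent by console.NewWithSocket via SCM_RIGHTS.
func receiveMaster(socketPath string) (*os.File, error) {
	l, err := net.Listen("unix", socketPath)
	if err != nil {
		return nil, fmt.Errorf("error listening on %q: %v", socketPath, err)
	}
	defer l.Close()

	conn, err := l.Accept()
	if err != nil {
		return nil, fmt.Errorf("error accepting on %q: %v", socketPath, err)
	}
	defer conn.Close()
	uc, ok := conn.(*net.UnixConn)
	if !ok {
		return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
	}

	buf := make([]byte, 64)  // receives the "pty-master" payload
	oob := make([]byte, 128) // receives the SCM_RIGHTS control message
	_, oobn, _, _, err := uc.ReadMsgUnix(buf, oob)
	if err != nil {
		return nil, fmt.Errorf("error reading message: %v", err)
	}
	msgs, err := unix.ParseSocketControlMessage(oob[:oobn])
	if err != nil || len(msgs) != 1 {
		return nil, fmt.Errorf("expected 1 control message, got %d (err: %v)", len(msgs), err)
	}
	fds, err := unix.ParseUnixRights(&msgs[0])
	if err != nil || len(fds) != 1 {
		return nil, fmt.Errorf("expected 1 fd, got %d (err: %v)", len(fds), err)
	}
	return os.NewFile(uintptr(fds[0]), "pty-master"), nil
}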
- ptyMaster, ptySlave, err := pty.Open() - if err != nil { - return nil, fmt.Errorf("error opening pty: %v", err) - } - defer ptyMaster.Close() - - // Get a connection to the socket path. - conn, err := net.Dial("unix", socketPath) - if err != nil { - ptySlave.Close() - return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) - } - uc, ok := conn.(*net.UnixConn) - if !ok { - ptySlave.Close() - return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) - } - socket, err := uc.File() - if err != nil { - ptySlave.Close() - return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) - } - - // Send the master FD over the connection. - msg := unix.UnixRights(int(ptyMaster.Fd())) - if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { - ptySlave.Close() - return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) - } - return ptySlave, nil -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 7789608f8..e54ba4ba3 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/console" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -392,7 +393,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund "boot", "--bundle", bundleDir, "--controller-fd="+strconv.Itoa(nextFD), - fmt.Sprintf("--console=%t", consoleEnabled)) + "--console="+strconv.FormatBool(consoleEnabled)) nextFD++ controllerFile := os.NewFile(uintptr(fd), "control_server_socket") @@ -407,14 +408,19 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } + // Sandbox stdio defaults to current process stdio. + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + // If the console control socket file is provided, then create a new // pty master/slave pair and set the tty on the sandbox process. if consoleEnabled { - // setupConsole will send the master on the socket, and return - // the slave. - tty, err := setupConsole(consoleSocket) + // console.NewWithSocket will send the master on the socket, + // and return the slave. + tty, err := console.NewWithSocket(consoleSocket) if err != nil { - return fmt.Errorf("error setting up control socket %q: %v", consoleSocket, err) + return fmt.Errorf("error setting up console with socket %q: %v", consoleSocket, err) } defer tty.Close() @@ -423,10 +429,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Stderr = tty cmd.SysProcAttr.Setctty = true cmd.SysProcAttr.Ctty = int(tty.Fd()) - } else { - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr } // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT -- cgit v1.2.3 From db81c0b02f2f947ae837a3e16471a148a66436eb Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 27 Aug 2018 11:09:06 -0700 Subject: Put fsgofer inside chroot Now each container gets its own dedicated gofer that is chroot'd to the rootfs path. This is done to add an extra layer of security in case the gofer gets compromised. 
PiperOrigin-RevId: 210396476 Change-Id: Iba21360a59dfe90875d61000db103f8609157ca0 --- runsc/boot/controller.go | 6 +- runsc/boot/fs.go | 6 +- runsc/boot/loader.go | 20 +++- runsc/boot/loader_test.go | 106 +++++++------------- runsc/cmd/BUILD | 1 + runsc/cmd/gofer.go | 84 ++++++++++------ runsc/cmd/state.go | 5 +- runsc/container/BUILD | 7 +- runsc/container/container.go | 96 +++++++++++++++--- runsc/container/container_test.go | 7 +- runsc/container/fs.go | 198 ++++++++++++++++++++++++++++++++++++ runsc/container/fs_test.go | 158 +++++++++++++++++++++++++++++ runsc/fsgofer/BUILD | 4 - runsc/fsgofer/control.go | 204 -------------------------------------- runsc/sandbox/BUILD | 2 - runsc/sandbox/namespace.go | 204 -------------------------------------- runsc/sandbox/network.go | 3 +- runsc/sandbox/sandbox.go | 176 ++++---------------------------- runsc/specutils/BUILD | 6 +- runsc/specutils/namespace.go | 204 ++++++++++++++++++++++++++++++++++++++ runsc/test/testutil/BUILD | 1 + runsc/test/testutil/docker.go | 6 +- runsc/test/testutil/testutil.go | 57 +++++++++++ 23 files changed, 858 insertions(+), 703 deletions(-) create mode 100644 runsc/container/fs.go create mode 100644 runsc/container/fs_test.go delete mode 100644 runsc/fsgofer/control.go delete mode 100644 runsc/sandbox/namespace.go create mode 100644 runsc/specutils/namespace.go (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 2d6b507b3..fdb6be5b1 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -212,11 +212,11 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) } - if len(args.FilePayload.Files) != 1 { - return fmt.Errorf("start arguments must contain one file for the container root") + if len(args.FilePayload.Files) == 0 { + return fmt.Errorf("start arguments must contain at least one file for the container root") } - tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 6f5379a6d..20d0e42ef 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -510,8 +510,6 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) MountSources: make(map[string][]fs.MountArgs), } - mounts := compileMounts(spec) - // Add root mount. fd := fds.remove() opts := p9MountOptions(conf, fd) @@ -528,8 +526,8 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) } renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) - // Add submounts - for _, m := range mounts { + // Add submounts. + for _, m := range compileMounts(spec) { if err := addRestoreMount(conf, renv, m, fds); err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0e94cf215..3963ed55d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -23,6 +23,7 @@ import ( "runtime" "sync" "sync/atomic" + "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -377,7 +378,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. 
-func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -414,11 +415,23 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } + + // Can't take ownership away from os.File. dup them to get a new FDs. + var ioFDs []int + for _, f := range files { + fd, err := syscall.Dup(int(f.Fd())) + if err != nil { + return 0, fmt.Errorf("failed to dup file: %v", err) + } + f.Close() + ioFDs = append(ioFDs, fd) + } + err = setFileSystemForProcess( &procArgs, spec, conf, - []int{int(file.Fd())}, // ioFDs + ioFDs, false, creds, procArgs.Limits, @@ -453,8 +466,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return tgid, nil } -// TODO: Per-container namespaces must be supported -// for -pid. +// TODO: Per-container namespaces must be supported for -pid. // waitContainer waits for the root process of a container to exit. func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index f2f690b5d..2396d52c8 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "io/ioutil" "math/rand" "os" "reflect" @@ -36,6 +35,15 @@ func init() { rand.Seed(time.Now().UnixNano()) } +func testConfig() *Config { + return &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessDirect, + DisableSeccomp: true, + } +} + // testSpec returns a simple spec that can be used in tests. func testSpec() *specs.Spec { return &specs.Spec{ @@ -55,12 +63,7 @@ func createLoader() (*Loader, error) { if err != nil { return nil, err } - conf := &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - } + conf := testConfig() spec := testSpec() return New(spec, conf, fd, nil, false) } @@ -152,18 +155,6 @@ func TestStartSignal(t *testing.T) { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespace(t *testing.T) { - conf := &Config{ - RootDir: "unused_root_dir", - FileAccess: FileAccessDirect, - DisableSeccomp: true, - } - - testFile, err := ioutil.TempFile(os.TempDir(), "create-mount-namespace-") - if err != nil { - t.Fatalf("ioutil.TempFile() failed, err: %v", err) - } - defer os.RemoveAll(testFile.Name()) - testCases := []struct { name string // Spec that will be used to create the mount manager. Note @@ -234,8 +225,7 @@ func TestCreateMountNamespace(t *testing.T) { }, { Destination: "/foo/qux", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, { // File mounts with the same prefix. @@ -284,8 +274,7 @@ func TestCreateMountNamespace(t *testing.T) { { // Mount with the same prefix. Destination: "/dev/fd-foo", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, { // Unsupported fs type. 
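The Dup call in startContainer above exists because an *os.File owns its descriptor: the runtime may close it through a finalizer once the File becomes unreachable, so a raw FD that must outlive the File has to be duplicated first. A small illustrative sketch of that idiom (detachFD is a hypothetical helper, not part of this change):

package main

import (
	"fmt"
	"os"
	"syscall"
)

// detachFD returns a descriptor whose lifetime is independent of f.
func detachFD(f *os.File) (int, error) {
	fd, err := syscall.Dup(int(f.Fd()))
	if err != nil {
		return -1, fmt.Errorf("failed to dup %q: %v", f.Name(), err)
	}
	f.Close() // the original File, and its finalizer, no longer matter
	return fd, nil
}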
@@ -298,8 +287,7 @@ func TestCreateMountNamespace(t *testing.T) { }, { Destination: "/dev/bar", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, }, }, @@ -339,19 +327,22 @@ func TestCreateMountNamespace(t *testing.T) { } for _, tc := range testCases { - ctx := contexttest.Context(t) - mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) - if err != nil { - t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) - } - defer mm.DecRef() - root := mm.Root() - defer root.DecRef() - for _, p := range tc.expectedPaths { - if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { - t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + ctx := contexttest.Context(t) + mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) + if err != nil { + t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } - } + defer mm.DecRef() + root := mm.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } + } + }) } } @@ -361,7 +352,7 @@ func TestRestoreEnvironment(t *testing.T) { testCases := []struct { name string spec *specs.Spec - conf *Config + fileAccess FileAccessType ioFDs []int errorExpected bool expectedRenv fs.RestoreEnvironment @@ -384,12 +375,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -444,12 +430,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0, 1}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -508,12 +489,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -572,12 +548,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - }, + fileAccess: FileAccessDirect, ioFDs: []int{0, 1}, errorExpected: true, }, @@ -596,20 +567,17 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - }, + fileAccess: FileAccessDirect, ioFDs: []int{0}, errorExpected: true, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + conf.FileAccess = tc.fileAccess fds := &fdDispenser{fds: tc.ioFDs} - actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) + actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) } else if tc.errorExpected { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index b9ef4022f..5dee26a5c 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,6 +36,7 @@ go_library( 
"//pkg/p9", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", + "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/console", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index e23f64d12..ab76734fc 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,6 +16,8 @@ package cmd import ( "os" + "path" + "sync" "syscall" "context" @@ -24,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -35,10 +38,6 @@ type Gofer struct { ioFDs intFlags applyCaps bool - // controllerFD is the file descriptor of a stream socket for the - // control server that is donated to this process. - controllerFD int - panicOnWrite bool } @@ -62,26 +61,16 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") - f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 || g.controllerFD == -1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 { f.Usage() return subcommands.ExitUsageError } - // fsgofer should run with a umask of 0, because we want to preserve file - // modes exactly as sent by the sandbox, which will have applied its own umask. - syscall.Umask(0) - - spec, err := specutils.ReadSpec(g.bundleDir) - if err != nil { - Fatalf("error reading spec: %v", err) - } - if g.applyCaps { // Minimal set of capabilities needed by the Gofer to operate on files. caps := []string{ @@ -107,49 +96,84 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("unreachable") } + spec, err := specutils.ReadSpec(g.bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } specutils.LogSpec(spec) - // Start with root mount, then add any other addition mount as needed. + // fsgofer should run with a umask of 0, because we want to preserve file + // modes exactly as sent by the sandbox, which will have applied its own umask. + syscall.Umask(0) + + // Find what path is going to be served by this gofer. + root := absPath(g.bundleDir, spec.Root.Path) + if err := syscall.Chroot(root); err != nil { + Fatalf("failed to chroot to %q: %v", root, err) + } + if err := syscall.Chdir("/"); err != nil { + Fatalf("failed to change working dir: %v", err) + } + log.Infof("Process chroot'd to %q", root) + + // Start with root mount, then add any other additional mount as needed. 
ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) - p := absPath(g.bundleDir, spec.Root.Path) - ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + ats = append(ats, fsgofer.NewAttachPoint("/", fsgofer.Config{ ROMount: spec.Root.Readonly, PanicOnWrite: g.panicOnWrite, // Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when // each file is opened as writable. Thus, we open files lazily to avoid copy-up. LazyOpenForWrite: true, })) - log.Infof("Serving %q mapped to %q on FD %d", "/", p, g.ioFDs[0]) + log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly) mountIdx := 1 // first one is the root for _, m := range spec.Mounts { if specutils.Is9PMount(m) { - p = absPath(g.bundleDir, m.Source) - ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + if !path.IsAbs(m.Destination) { + Fatalf("destination must be absolute path: %v", m.Destination) + } + cfg := fsgofer.Config{ ROMount: isReadonlyMount(m.Options), PanicOnWrite: g.panicOnWrite, LazyOpenForWrite: false, - })) + } + ats = append(ats, fsgofer.NewAttachPoint(m.Destination, cfg)) if mountIdx >= len(g.ioFDs) { Fatalf("No FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) } - log.Infof("Serving %q mapped to %q on FD %d", m.Destination, p, g.ioFDs[mountIdx]) + log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount) mountIdx++ } } if mountIdx != len(g.ioFDs) { - Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) + Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - ctrl, err := fsgofer.NewController(g.controllerFD, g.bundleDir) + runServers(ats, g.ioFDs) + return subcommands.ExitSuccess +} - if err := ctrl.Serve(ats, g.ioFDs); err != nil { - Fatalf("Failed to serve via P9: %v", err) +func runServers(ats []p9.Attacher, ioFDs []int) { + // Run the loops and wait for all to exit. + var wg sync.WaitGroup + for i, ioFD := range ioFDs { + wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + Fatalf("err creating server on FD %d: %v", ioFD, err) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err) + } + wg.Done() + }(ioFD, ats[i]) } - ctrl.Wait() - - return subcommands.ExitSuccess + wg.Wait() + log.Infof("All 9P servers exited.") } func isReadonlyMount(opts []string) bool { diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 28752d95e..265014e1b 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -63,8 +63,11 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } log.Debugf("Returning state for container %+v", c) + state := c.State() + log.Debugf("State: %+v", state) + // Write json-encoded state directly to stdout. 
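The gofer above now chroots into the container rootfs before serving any 9P attach points, so a compromised gofer can only reach files under that root. A minimal sketch of the confinement idiom (confine is a hypothetical helper; the real code inlines these calls):

package main

import (
	"fmt"
	"syscall"
)

// confine restricts the current process's filesystem view to root.
func confine(root string) error {
	if err := syscall.Chroot(root); err != nil {
		return fmt.Errorf("failed to chroot to %q: %v", root, err)
	}
	// Chroot does not move the working directory; without this chdir the
	// old cwd would still point outside the new root.
	if err := syscall.Chdir("/"); err != nil {
		return fmt.Errorf("failed to change working dir: %v", err)
	}
	return nil
}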
- b, err := json.MarshalIndent(c.State(), "", " ") + b, err := json.MarshalIndent(state, "", " ") if err != nil { Fatalf("error marshaling container state: %v", err) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index e40ca4709..cba418d0c 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -13,6 +13,7 @@ go_library( name = "container", srcs = [ "container.go", + "fs.go", "hook.go", "status.go", ], @@ -28,13 +29,17 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) go_test( name = "container_test", size = "medium", - srcs = ["container_test.go"], + srcs = [ + "container_test.go", + "fs_test.go", + ], data = [ ":uds_test_app", "//runsc", diff --git a/runsc/container/container.go b/runsc/container/container.go index 8bd47aac1..16af66d3e 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -21,6 +21,7 @@ import ( "fmt" "io/ioutil" "os" + "os/exec" "path/filepath" "regexp" "strconv" @@ -223,15 +224,19 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // init container in the sandbox. if specutils.ShouldCreateSandbox(spec) || !conf.MultiContainer { log.Debugf("Creating new sandbox for container %q", id) + ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) + if err != nil { + return nil, err + } + // Start a new sandbox for this container. Any errors after this point // must destroy the container. - s, goferPid, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) + s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) if err != nil { c.Destroy() return nil, err } c.Sandbox = s - c.GoferPid = goferPid } else { // This is sort of confusing. For a sandbox with a root // container and a child container in it, runsc sees: @@ -254,13 +259,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox - - // Prepare the gofer to serve the container's filesystem. - err = sb.Sandbox.CreateChild(c.ID, bundleDir) - if err != nil { - c.Destroy() - return nil, err - } } c.Status = Created @@ -304,7 +302,12 @@ func (c *Container) Start(conf *boot.Config) error { return err } } else { - if err := c.Sandbox.Start(c.Spec, conf, c.ID); err != nil { + // Create the gofer process. + ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + if err != nil { + return err + } + if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { c.Destroy() return err } @@ -518,6 +521,8 @@ func (c *Container) Destroy() error { log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) } } + c.Sandbox = nil + if c.GoferPid != 0 { log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { @@ -527,9 +532,7 @@ func (c *Container) Destroy() error { } } - c.Sandbox = nil c.Status = Stopped - return nil } @@ -596,3 +599,72 @@ func (c *Container) waitForStopped() error { } return backoff.Retry(op, b) } + +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { + if conf.FileAccess == boot.FileAccessDirect { + // Don't start a gofer. The sandbox will access host FS directly. 
+ return nil, nil + } + + if err := setupFS(spec, conf, bundleDir); err != nil { + return nil, fmt.Errorf("failed to setup mounts: %v", err) + } + + // Start with the general config flags. + args := conf.ToFlags() + args = append(args, "gofer", "--bundle", bundleDir) + if conf.Overlay { + args = append(args, "--panic-on-write=true") + } + + // Add root mount and then add any other additional mounts. + mountCount := 1 + + // Add additional mounts. + for _, m := range spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + sandEnds := make([]*os.File, 0, mountCount) + goferEnds := make([]*os.File, 0, mountCount) + + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. + nextFD := 3 + for ; nextFD-3 < mountCount; nextFD++ { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + } + + binPath, err := specutils.BinPath() + if err != nil { + return nil, err + } + cmd := exec.Command(binPath, args...) + cmd.ExtraFiles = goferEnds + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + specutils.SetUIDGIDMappings(cmd, spec) + nss := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := specutils.StartInNS(cmd, nss); err != nil { + return nil, err + } + log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + c.GoferPid = cmd.Process.Pid + return sandEnds, nil +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 6d84700ce..25aaf3f86 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1211,9 +1211,6 @@ func TestMountNewDir(t *testing.T) { if err != nil { t.Fatal("ioutil.TempDir() failed:", err) } - if err := os.Chmod(root, 0755); err != nil { - t.Fatalf("os.Chmod(%q) failed: %v", root, err) - } srcDir := path.Join(root, "src", "dir", "anotherdir") if err := os.MkdirAll(srcDir, 0755); err != nil { @@ -1747,3 +1744,7 @@ func TestGoferExits(t *testing.T) { t.Errorf("container shouldn't be running, container: %+v", c) } } + +func TestMain(m *testing.M) { + testutil.RunAsRoot(m) +} diff --git a/runsc/container/fs.go b/runsc/container/fs.go new file mode 100644 index 000000000..652f81bbf --- /dev/null +++ b/runsc/container/fs.go @@ -0,0 +1,198 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
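createGoferProcess above relies on how os/exec numbers donated descriptors: cmd.ExtraFiles[i] shows up in the child as FD 3+i, since 0-2 are taken by stdin/stdout/stderr, which is why the --io-fds list starts at 3. A standalone sketch of that mechanism under stated assumptions (/bin/true stands in for the runsc gofer subcommand so the sketch runs anywhere):

package main

import (
	"fmt"
	"os"
	"os/exec"
	"syscall"
)

func main() {
	// One AF_UNIX socketpair per 9P mount: one end stays with the sandbox,
	// the other is donated to the gofer-like child.
	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
	if err != nil {
		panic(err)
	}
	sandEnd := os.NewFile(uintptr(fds[0]), "sandbox io fd")
	goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd")
	defer sandEnd.Close()

	// In the real flow this would be the runsc binary invoked with the
	// "gofer" subcommand; /bin/true is used here only as a placeholder.
	cmd := exec.Command("/bin/true", "gofer", "--io-fds=3")
	cmd.ExtraFiles = []*os.File{goferEnd} // appears as FD 3 in the child
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	goferEnd.Close() // the parent keeps only its own end of the pair
	fmt.Println("child started, pid:", cmd.Process.Pid)
}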
+ +package container + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +type mapping struct { + set bool + val uint32 +} + +var optionsMap = map[string]mapping{ + "acl": {set: true, val: syscall.MS_POSIXACL}, + "async": {set: false, val: syscall.MS_SYNCHRONOUS}, + "atime": {set: false, val: syscall.MS_NOATIME}, + "bind": {set: true, val: syscall.MS_BIND}, + "defaults": {set: true, val: 0}, + "dev": {set: false, val: syscall.MS_NODEV}, + "diratime": {set: false, val: syscall.MS_NODIRATIME}, + "dirsync": {set: true, val: syscall.MS_DIRSYNC}, + "exec": {set: false, val: syscall.MS_NOEXEC}, + "iversion": {set: true, val: syscall.MS_I_VERSION}, + "loud": {set: false, val: syscall.MS_SILENT}, + "mand": {set: true, val: syscall.MS_MANDLOCK}, + "noacl": {set: false, val: syscall.MS_POSIXACL}, + "noatime": {set: true, val: syscall.MS_NOATIME}, + "nodev": {set: true, val: syscall.MS_NODEV}, + "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, + "noexec": {set: true, val: syscall.MS_NOEXEC}, + "noiversion": {set: false, val: syscall.MS_I_VERSION}, + "nomand": {set: false, val: syscall.MS_MANDLOCK}, + "norelatime": {set: false, val: syscall.MS_RELATIME}, + "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, + "nosuid": {set: true, val: syscall.MS_NOSUID}, + "private": {set: true, val: syscall.MS_PRIVATE}, + "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, + "relatime": {set: true, val: syscall.MS_RELATIME}, + "remount": {set: true, val: syscall.MS_REMOUNT}, + "ro": {set: true, val: syscall.MS_RDONLY}, + "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, + "rw": {set: false, val: syscall.MS_RDONLY}, + "silent": {set: true, val: syscall.MS_SILENT}, + "strictatime": {set: true, val: syscall.MS_STRICTATIME}, + "suid": {set: false, val: syscall.MS_NOSUID}, + "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, +} + +// setupFS creates the container directory structure under 'spec.Root.Path'. +// This allows the gofer serving the containers to be chroot under this +// directory to create an extra layer to security in case the gofer gets +// compromised. +func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { + for _, m := range spec.Mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + continue + } + src := m.Source + if !filepath.IsAbs(src) { + src = filepath.Join(bundleDir, src) + } + srcfi, err := os.Stat(src) + if err != nil { + return err + } + + // It's possible that 'm.Destination' follows symlinks inside the + // container. 
+ dst, err := resolveSymlinks(spec.Root.Path, m.Destination) + if err != nil { + return err + } + + // Create mount point if it doesn't exits + if _, err := os.Stat(dst); os.IsNotExist(err) { + if srcfi.IsDir() { + if err := os.MkdirAll(dst, 0755); err != nil { + return err + } + } else { + if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil { + return err + } + f, err := os.OpenFile(dst, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + + flags := optionsToFlags(m.Options) + flags |= syscall.MS_BIND + log.Infof("Mounting src: %q, dst: %q, flags: %#x", src, dst, flags) + if err := syscall.Mount(src, dst, m.Type, uintptr(flags), ""); err != nil { + return err + } + } + + // Remount root as readonly after setup is done, if requested. + if spec.Root.Readonly { + log.Infof("Remounting root as readonly: %q", spec.Root.Path) + flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) + return unix.Mount(spec.Root.Path, spec.Root.Path, "bind", flags, "") + } + return nil +} + +// resolveSymlinks walks 'rel' having 'root' as the root directory. If there are +// symlinks, they are evaluated relative to 'root' to ensure the end result is +// the same as if the process was running inside the container. +func resolveSymlinks(root, rel string) (string, error) { + return resolveSymlinksImpl(root, root, rel, 255) +} + +func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { + if followCount == 0 { + return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) + } + + rel = filepath.Clean(rel) + for _, name := range strings.Split(rel, string(filepath.Separator)) { + if name == "" { + continue + } + // Note that Join() resolves things like ".." and returns a clean path. + path := filepath.Join(base, name) + if !strings.HasPrefix(path, root) { + // One cannot '..' their way out of root. + path = root + continue + } + fi, err := os.Lstat(path) + if err != nil { + if !os.IsNotExist(err) { + return "", err + } + // Not found means there is no symlink to check. Just keep walking dirs. + base = path + continue + } + if fi.Mode()&os.ModeSymlink != 0 { + link, err := os.Readlink(path) + if err != nil { + return "", err + } + if filepath.IsAbs(link) { + base = root + } + base, err = resolveSymlinksImpl(root, base, link, followCount-1) + if err != nil { + return "", err + } + continue + } + base = path + } + return base, nil +} + +func optionsToFlags(opts []string) uint32 { + var rv uint32 + for _, opt := range opts { + if m, ok := optionsMap[opt]; ok { + if m.set { + rv |= m.val + } else { + rv ^= m.val + } + } else { + log.Warningf("Ignoring mount option %q", opt) + } + } + return rv +} diff --git a/runsc/container/fs_test.go b/runsc/container/fs_test.go new file mode 100644 index 000000000..84bde18fb --- /dev/null +++ b/runsc/container/fs_test.go @@ -0,0 +1,158 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
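One subtlety in optionsToFlags above: entries with set == false are meant to clear a bit, and the idiomatic way to do that in Go is AND NOT (&^), since XOR toggles the bit and would set it when it was not already present. An illustrative alternative, assuming the mapping type defined earlier in this file (applyOption is a hypothetical helper, not part of this change):

// applyOption folds a single mount-option mapping into flags.
func applyOption(flags uint32, m mapping) uint32 {
	if m.set {
		return flags | m.val // set the bit
	}
	return flags &^ m.val // clear the bit
}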
+ +package container + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "testing" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +type dir struct { + rel string + link string +} + +func construct(root string, dirs []dir) error { + for _, d := range dirs { + p := path.Join(root, d.rel) + if d.link == "" { + if err := os.MkdirAll(p, 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + } else { + if err := os.MkdirAll(path.Dir(p), 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + if err := os.Symlink(d.link, p); err != nil { + return fmt.Errorf("error creating symlink: %v", err) + } + } + } + return nil +} + +func TestResolveSymlinks(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"dir1/dir11/dir111/dir1111", ""}, // Just a boring dir + {"dir1/lnk12", "dir11"}, // Link to sibling + {"dir1/lnk13", "./dir11"}, // Link to sibling through self + {"dir1/lnk14", "../dir1/dir11"}, // Link to sibling through parent + {"dir1/dir15/lnk151", ".."}, // Link to parent + {"dir1/lnk16", "dir11/dir111"}, // Link to child + {"dir1/lnk17", "."}, // Link to self + {"dir1/lnk18", "lnk13"}, // Link to link + {"lnk2", "dir1/lnk13"}, // Link to link to link + {"dir3/dir21/lnk211", "../.."}, // Link to root relative + {"dir3/lnk22", "/"}, // Link to root absolute + {"dir3/lnk23", "/dir1"}, // Link to dir absolute + {"dir3/lnk24", "/dir1/lnk12"}, // Link to link absolute + {"lnk5", "../../.."}, // Link outside root + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + + tests := []struct { + name string + rel string + want string + compareHost bool + }{ + {name: "root", rel: "/", want: "/", compareHost: true}, + {name: "basic dir", rel: "/dir1/dir11/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 1", rel: "/dir1/dir11/./dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 2", rel: "/dir1/././dir11/./././././dir111/.", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dotdot 1", rel: "/dir1/dir11/../dir15", want: "/dir1/dir15", compareHost: true}, + {name: "dotdot 2", rel: "/dir1/dir11/dir1111/../..", want: "/dir1", compareHost: true}, + + {name: "link sibling", rel: "/dir1/lnk12", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling + dir", rel: "/dir1/lnk12/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link sibling through self", rel: "/dir1/lnk13", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling through parent", rel: "/dir1/lnk14", want: "/dir1/dir11", compareHost: true}, + + {name: "link parent", rel: "/dir1/dir15/lnk151", want: "/dir1", compareHost: true}, + {name: "link parent + dir", rel: "/dir1/dir15/lnk151/dir11", want: "/dir1/dir11", compareHost: true}, + {name: "link child", rel: "/dir1/lnk16", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link child + dir", rel: "/dir1/lnk16/dir1111", want: "/dir1/dir11/dir111/dir1111", compareHost: true}, + {name: "link self", rel: "/dir1/lnk17", want: "/dir1", compareHost: true}, + {name: "link self + dir", rel: "/dir1/lnk17/dir11", want: "/dir1/dir11", compareHost: true}, + + {name: "link^2", rel: "/dir1/lnk18", want: "/dir1/dir11", compareHost: true}, + {name: "link^2 + dir", rel: "/dir1/lnk18/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link^3", rel: "/lnk2", want: "/dir1/dir11", compareHost: true}, + {name: 
"link^3 + dir", rel: "/lnk2/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + + {name: "link abs", rel: "/dir3/lnk23", want: "/dir1"}, + {name: "link abs + dir", rel: "/dir3/lnk23/dir11", want: "/dir1/dir11"}, + {name: "link^2 abs", rel: "/dir3/lnk24", want: "/dir1/dir11"}, + {name: "link^2 abs + dir", rel: "/dir3/lnk24/dir111", want: "/dir1/dir11/dir111"}, + + {name: "root link rel", rel: "/dir3/dir21/lnk211", want: "/", compareHost: true}, + {name: "root link abs", rel: "/dir3/lnk22", want: "/"}, + {name: "root contain link", rel: "/lnk5/dir1", want: "/dir1"}, + {name: "root contain dotdot", rel: "/dir1/dir11/../../../../../../../..", want: "/"}, + + {name: "crazy", rel: "/dir3/dir21/lnk211/dir3/lnk22/dir1/dir11/../../lnk5/dir3/../dir3/lnk24/dir111/dir1111/..", want: "/dir1/dir11/dir111"}, + } + for _, tst := range tests { + t.Run(tst.name, func(t *testing.T) { + got, err := resolveSymlinks(root, tst.rel) + if err != nil { + t.Errorf("resolveSymlinks(root, %q) failed: %v", tst.rel, err) + } + want := path.Join(root, tst.want) + if got != want { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, got, want) + } + if tst.compareHost { + // Check that host got to the same end result. + host, err := filepath.EvalSymlinks(path.Join(root, tst.rel)) + if err != nil { + t.Errorf("path.EvalSymlinks(root, %q) failed: %v", tst.rel, err) + } + if host != got { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, host, got) + } + } + }) + } +} + +func TestResolveSymlinksLoop(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"loop1", "loop2"}, + {"loop2", "loop1"}, + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + if _, err := resolveSymlinks(root, "loop1"); err == nil { + t.Errorf("resolveSymlinks() should have failed") + } +} diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 0bc682b5f..24e172f48 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "fsgofer", srcs = [ - "control.go", "fsgofer.go", "fsgofer_unsafe.go", ], @@ -15,12 +14,9 @@ go_library( ], deps = [ "//pkg/abi/linux", - "//pkg/control/server", "//pkg/fd", "//pkg/log", "//pkg/p9", - "//pkg/unet", - "//pkg/urpc", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go deleted file mode 100644 index 8cb2f67ac..000000000 --- a/runsc/fsgofer/control.go +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fsgofer - -import ( - "fmt" - "os" - "path/filepath" - "sync" - - "gvisor.googlesource.com/gvisor/pkg/control/server" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/pkg/urpc" -) - -// Controller manages the fsgofer's control server. -type Controller struct { - // api holds the control server's URPC endpoints. - api api - - // srv is the control server. - srv *server.Server -} - -// NewController creates a new Controller and starts it listenting -func NewController(fd int, rootBundleDir string) (*Controller, error) { - if !filepath.IsAbs(rootBundleDir) { - return nil, fmt.Errorf("NewController should receive an absolute bundle dir path, but got %q", rootBundleDir) - } - - srv, err := server.CreateFromFD(fd) - if err != nil { - return nil, err - } - - cr := &Controller{srv: srv} - cr.api.rootBundleDir = rootBundleDir - cr.api.bundleDirs = make(map[string]string) - srv.Register(&cr.api) - - if err := srv.StartServing(); err != nil { - return nil, err - } - - return cr, nil -} - -// Wait waits for all the p9 servers to finish, then shuts down the control -// server. -func (cr *Controller) Wait() { - cr.api.p9wg.Wait() - cr.srv.Stop() - log.Infof("All 9P servers exited.") -} - -// Serve starts serving each Attacher in ats via its corresponding file -// descriptor in ioFDs. This takes ownership of the FDs in ioFDs. -func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { - if len(ats) != len(ioFDs) { - return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) - } - for i, _ := range ats { - cr.api.serve(ats[i], os.NewFile(uintptr(ioFDs[i]), "io fd")) - } - return nil -} - -// api URPC methods. -const ( - // AddBundleDirs readies the gofer to serve from a new bundle - // directory. It should be called during runsc create. - AddBundleDirs = "api.AddBundleDirs" - - // ServeDirectory serves a new directory via the fsgofer. It should be - // called during runsc start. - ServeDirectory = "api.ServeDirectory" -) - -// API defines and implements the URPC endpoints for the gofer. -type api struct { - // p9wg waits for all the goroutines serving the sentry via p9. When its - // counter is 0, the gofer is out of work and exits. - p9wg sync.WaitGroup - - // bundleDirs maps from container ID to bundle directory for each - // container. - bundleDirs map[string]string - - // rootBundleDir is the bundle directory of the root container. - rootBundleDir string -} - -// AddBundleDirsRequest is the URPC argument to AddBundleDirs. -type AddBundleDirsRequest struct { - // BundleDirs is a map of container IDs to bundle directories to add to - // the gofer. - BundleDirs map[string]string -} - -// AddBundleDirsRequest adds bundle directories that for the gofer to serve. -func (api *api) AddBundleDirs(req *AddBundleDirsRequest, _ *struct{}) error { - log.Debugf("fsgofer.AddBundleDirs") - for cid, bd := range req.BundleDirs { - if _, ok := api.bundleDirs[cid]; ok { - return fmt.Errorf("fsgofer already has a bundleDir for container %q", cid) - } - api.bundleDirs[cid] = bd - } - return nil -} - -// ServeDirectoryRequest is the URPC argument to ServeDirectory. -type ServeDirectoryRequest struct { - // Dir is the absolute path to a directory to be served to the sentry. - Dir string - - // IsReadOnly specifies whether the directory should be served in - // read-only mode. 
- IsReadOnly bool - - // CID is the container ID of the container that needs to serve a - // directory. - CID string - - // FilePayload contains the socket over which the sentry will request - // files from Dir. - urpc.FilePayload -} - -// ServeDirectory begins serving a directory via a file descriptor for the -// sentry. Directories must be added via AddBundleDirsRequest before -// ServeDirectory is called. -func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { - log.Debugf("fsgofer.ServeDirectory: %+v", req) - - if req.Dir == "" { - return fmt.Errorf("ServeDirectory should receive a directory argument, but was empty") - } - if req.CID == "" { - return fmt.Errorf("ServeDirectory should receive a CID argument, but was empty") - } - // Prevent CIDs containing ".." from confusing the sentry when creating - // /containers/ directory. - // TODO: Once we have multiple independant roots, this - // check won't be necessary. - if filepath.Clean(req.CID) != req.CID { - return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", req.CID) - } - if nFiles := len(req.FilePayload.Files); nFiles != 1 { - return fmt.Errorf("ServeDirectory should receive 1 file descriptor, but got %d", nFiles) - } - - bd, ok := api.bundleDirs[req.CID] - if !ok { - // If there's no entry in bundleDirs for the container ID, this - // is the root container. - bd = api.rootBundleDir - } - - // Relative paths are served relative to the bundle directory. - absDir := req.Dir - if !filepath.IsAbs(absDir) { - absDir = filepath.Join(bd, req.Dir) - } - - // Create the attach point and start serving. - at := NewAttachPoint(absDir, Config{ - ROMount: req.IsReadOnly, - LazyOpenForWrite: true, - }) - api.serve(at, req.FilePayload.Files[0]) - - return nil -} - -// serve begins serving a directory via a file descriptor. -func (api *api) serve(at p9.Attacher, ioFile *os.File) { - api.p9wg.Add(1) - go func() { - socket, err := unet.NewSocket(int(ioFile.Fd())) - if err != nil { - panic(fmt.Sprintf("err creating server on FD %d: %v", ioFile.Fd(), err)) - } - s := p9.NewServer(at) - if err := s.Handle(socket); err != nil { - panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFile.Fd(), err)) - } - api.p9wg.Done() - }() -} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index e9a39f797..9317b1c14 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", srcs = [ - "namespace.go", "network.go", "sandbox.go", ], @@ -21,7 +20,6 @@ go_library( "//pkg/urpc", "//runsc/boot", "//runsc/console", - "//runsc/fsgofer", "//runsc/specutils", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", diff --git a/runsc/sandbox/namespace.go b/runsc/sandbox/namespace.go deleted file mode 100644 index 1d3bcfbb5..000000000 --- a/runsc/sandbox/namespace.go +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package sandbox - -import ( - "fmt" - "os" - "os/exec" - "path/filepath" - "runtime" - "syscall" - - specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" - "gvisor.googlesource.com/gvisor/pkg/log" -) - -// nsCloneFlag returns the clone flag that can be used to set a namespace of -// the given type. -func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { - switch nst { - case specs.IPCNamespace: - return syscall.CLONE_NEWIPC - case specs.MountNamespace: - return syscall.CLONE_NEWNS - case specs.NetworkNamespace: - return syscall.CLONE_NEWNET - case specs.PIDNamespace: - return syscall.CLONE_NEWPID - case specs.UTSNamespace: - return syscall.CLONE_NEWUTS - case specs.UserNamespace: - return syscall.CLONE_NEWUSER - case specs.CgroupNamespace: - panic("cgroup namespace has no associated clone flag") - default: - panic(fmt.Sprintf("unknown namespace %v", nst)) - } -} - -// nsPath returns the path of the namespace for the current process and the -// given namespace. -func nsPath(nst specs.LinuxNamespaceType) string { - base := "/proc/self/ns" - switch nst { - case specs.CgroupNamespace: - return filepath.Join(base, "cgroup") - case specs.IPCNamespace: - return filepath.Join(base, "ipc") - case specs.MountNamespace: - return filepath.Join(base, "mnt") - case specs.NetworkNamespace: - return filepath.Join(base, "net") - case specs.PIDNamespace: - return filepath.Join(base, "pid") - case specs.UserNamespace: - return filepath.Join(base, "user") - case specs.UTSNamespace: - return filepath.Join(base, "uts") - default: - panic(fmt.Sprintf("unknown namespace %v", nst)) - } -} - -// getNS returns true and the namespace with the given type from the slice of -// namespaces in the spec. It returns false if the slice does not contain a -// namespace with the type. -func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { - if s.Linux == nil { - return specs.LinuxNamespace{}, false - } - for _, ns := range s.Linux.Namespaces { - if ns.Type == nst { - return ns, true - } - } - return specs.LinuxNamespace{}, false -} - -// filterNS returns a slice of namespaces from the spec with types that match -// those in the `filter` slice. -func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { - if s.Linux == nil { - return nil - } - var out []specs.LinuxNamespace - for _, nst := range filter { - if ns, ok := getNS(nst, s); ok { - out = append(out, ns) - } - } - return out -} - -// setNS sets the namespace of the given type. It must be called with -// OSThreadLocked. -func setNS(fd, nsType uintptr) error { - if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { - return err - } - return nil -} - -// applyNS applies the namespace on the current thread and returns a function -// that will restore the namespace to the original value. -// -// Preconditions: Must be called with os thread locked. -func applyNS(ns specs.LinuxNamespace) (func(), error) { - log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) - newNS, err := os.Open(ns.Path) - if err != nil { - return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) - } - defer newNS.Close() - - // Store current netns to restore back after child is started. 
- curPath := nsPath(ns.Type) - oldNS, err := os.Open(curPath) - if err != nil { - return nil, fmt.Errorf("error opening %q: %v", curPath, err) - } - - // Set netns to the one requested and setup function to restore it back. - flag := nsCloneFlag(ns.Type) - if err := setNS(newNS.Fd(), flag); err != nil { - oldNS.Close() - return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) - } - return func() { - log.Infof("restoring namespace %v", ns.Type) - defer oldNS.Close() - if err := setNS(oldNS.Fd(), flag); err != nil { - panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) - } - }, nil -} - -// startInNS joins or creates the given namespaces and calls cmd.Start before -// restoring the namespaces to the original values. -func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { - // We are about to setup namespaces, which requires the os thread being - // locked so that Go doesn't change the thread out from under us. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - if cmd.SysProcAttr == nil { - cmd.SysProcAttr = &syscall.SysProcAttr{} - } - - for _, ns := range nss { - if ns.Path == "" { - // No path. Just set a flag to create a new namespace. - cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) - continue - } - // Join the given namespace, and restore the current namespace - // before exiting. - restoreNS, err := applyNS(ns) - if err != nil { - return err - } - defer restoreNS() - } - - return cmd.Start() -} - -// setUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. -func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { - if s.Linux == nil { - return - } - if cmd.SysProcAttr == nil { - cmd.SysProcAttr = &syscall.SysProcAttr{} - } - for _, idMap := range s.Linux.UIDMappings { - log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) - cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ - ContainerID: int(idMap.ContainerID), - HostID: int(idMap.HostID), - Size: int(idMap.Size), - }) - } - for _, idMap := range s.Linux.GIDMappings { - log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) - cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ - ContainerID: int(idMap.ContainerID), - HostID: int(idMap.HostID), - Size: int(idMap.Size), - }) - } -} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index d0ce6228b..8694ba755 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) const ( @@ -132,7 +133,7 @@ func createDefaultLoopbackInterface(conn *urpc.Client) error { func joinNetNS(nsPath string) (func(), error) { runtime.LockOSThread() - restoreNS, err := applyNS(specs.LinuxNamespace{ + restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ Type: specs.NetworkNamespace, Path: nsPath, }) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e54ba4ba3..f14a2f8c9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -32,7 +32,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/console" - "gvisor.googlesource.com/gvisor/runsc/fsgofer" 
"gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -55,31 +54,20 @@ type Sandbox struct { } // Create creates the sandbox process. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, int, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} - binPath, err := specutils.BinPath() - if err != nil { - return nil, 0, err - } - - // Create the gofer process. - goferPid, ioFiles, err := s.createGoferProcess(spec, conf, bundleDir, binPath) - if err != nil { - return nil, 0, err - } - // Create the sandbox process. - if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles); err != nil { - return nil, 0, err + if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, ioFiles); err != nil { + return nil, err } // Wait for the control server to come up (or timeout). if err := s.waitForCreated(10 * time.Second); err != nil { - return nil, 0, err + return nil, err } - return s, goferPid, nil + return s, nil } // StartRoot starts running the root container process inside the sandbox. @@ -105,70 +93,29 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } -// CreateChild creates a non-root container inside the sandbox. -func (s *Sandbox) CreateChild(cid, bundleDir string) error { - log.Debugf("Create non-root container sandbox %q, pid: %d for container %q with bundle directory %q", s.ID, s.Pid, cid, bundleDir) - - // Connect to the gofer and prepare it to serve from bundleDir for this - // container. - goferConn, err := s.goferConnect() - if err != nil { - return fmt.Errorf("couldn't connect to gofer: %v", err) - } - defer goferConn.Close() - goferReq := fsgofer.AddBundleDirsRequest{BundleDirs: map[string]string{cid: bundleDir}} - if err := goferConn.Call(fsgofer.AddBundleDirs, &goferReq, nil); err != nil { - return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) +// Start starts running a non-root container inside the sandbox. +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles []*os.File) error { + for _, f := range ioFiles { + defer f.Close() } - return nil -} - -// Start starts running a non-root container inside the sandbox. -func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) - sandboxConn, err := s.sandboxConnect() if err != nil { return fmt.Errorf("couldn't connect to sandbox: %v", err) } defer sandboxConn.Close() - goferConn, err := s.goferConnect() - if err != nil { - return fmt.Errorf("couldn't connect to gofer: %v", err) - } - defer goferConn.Close() - - // Create socket that connects the sandbox and gofer. - sandEnd, goferEnd, err := createSocketPair() - if err != nil { - return err - } - defer sandEnd.Close() - defer goferEnd.Close() - - // Tell the Gofer about the new filesystem it needs to serve. - goferReq := fsgofer.ServeDirectoryRequest{ - Dir: spec.Root.Path, - IsReadOnly: spec.Root.Readonly, - CID: cid, - FilePayload: urpc.FilePayload{Files: []*os.File{goferEnd}}, - } - if err := goferConn.Call(fsgofer.ServeDirectory, &goferReq, nil); err != nil { - return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) - } // Start running the container. 
args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}}, + FilePayload: urpc.FilePayload{Files: ioFiles}, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) } - return nil } @@ -275,102 +222,13 @@ func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { return conn, nil } -func (s *Sandbox) goferConnect() (*urpc.Client, error) { - log.Debugf("Connecting to gofer for sandbox %q", s.ID) - conn, err := client.ConnectTo(fsgofer.ControlSocketAddr(s.ID)) - if err != nil { - return nil, s.connError(err) - } - return conn, nil -} - func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } -func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) (int, []*os.File, error) { - if conf.FileAccess == boot.FileAccessDirect { - // Don't start a gofer. The sandbox will access host FS directly. - return 0, nil, nil - } - - // Start with the general config flags. - args := conf.ToFlags() - args = append(args, "gofer", "--bundle", bundleDir) - - // Add root mount and then add any other additional mounts. - mountCount := 1 - - // Add additional mounts. - for _, m := range spec.Mounts { - if specutils.Is9PMount(m) { - mountCount++ - } - } - sandEnds := make([]*os.File, 0, mountCount) - goferEnds := make([]*os.File, 0, mountCount) - // nextFD is the next available file descriptor for the gofer process. - // It starts at 3 because 0-2 are used by stdin/stdout/stderr. - var nextFD int - for nextFD = 3; nextFD-3 < mountCount; nextFD++ { - sandEnd, goferEnd, err := createSocketPair() - if err != nil { - return 0, nil, err - } - defer goferEnd.Close() - sandEnds = append(sandEnds, sandEnd) - goferEnds = append(goferEnds, goferEnd) - args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) - } - - // Create and donate a file descriptor for the control server. - addr := fsgofer.ControlSocketAddr(s.ID) - serverFD, err := server.CreateSocket(addr) - if err != nil { - return 0, nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) - } - - // Add the control server fd. - args = append(args, "--controller-fd="+strconv.Itoa(nextFD)) - nextFD++ - controllerFile := os.NewFile(uintptr(serverFD), "gofer_control_socket_server") - defer controllerFile.Close() - - cmd := exec.Command(binPath, args...) - cmd.ExtraFiles = goferEnds - cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) - - // Setup any uid/gid mappings, and create or join the configured user - // namespace so the gofer's view of the filesystem aligns with the - // users in the sandbox. - setUIDGIDMappings(cmd, spec) - nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) - - if conf.Overlay { - args = append(args, "--panic-on-write=true") - } - - // Start the gofer in the given namespace. - log.Debugf("Starting gofer: %s %v", binPath, args) - if err := startInNS(cmd, nss); err != nil { - return 0, nil, err - } - log.Infof("Gofer started, pid: %d", cmd.Process.Pid) - return cmd.Process.Pid, sandEnds, nil -} - -// createSocketPair creates a pair of files wrapping a socket pair. 
-func createSocketPair() (*os.File, *os.File, error) { - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil -} - // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -387,6 +245,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund consoleEnabled := consoleSocket != "" + binPath, err := specutils.BinPath() + if err != nil { + return err + } cmd := exec.Command(binPath, conf.ToFlags()...) cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.Args = append(cmd.Args, @@ -464,7 +326,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the // namespace. - if ns, ok := getNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { + if ns, ok := specutils.GetNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) } else { @@ -478,10 +340,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // - Gofer: when using a Gofer, the sandbox process can run isolated in an // empty namespace. 
if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { - if userns, ok := getNS(specs.UserNamespace, spec); ok { + if userns, ok := specutils.GetNS(specs.UserNamespace, spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) nss = append(nss, userns) - setUIDGIDMappings(cmd, spec) + specutils.SetUIDGIDMappings(cmd, spec) } else { log.Infof("Sandbox will be started in the current user namespace") } @@ -496,7 +358,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) - if err := startInNS(cmd, nss); err != nil { + if err := specutils.StartInNS(cmd, nss); err != nil { return err } s.Pid = cmd.Process.Pid diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index a22ab789a..97a504b20 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -4,7 +4,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", - srcs = ["specutils.go"], + srcs = [ + "namespace.go", + "specutils.go", + ], importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", visibility = [ "//runsc:__subpackages__", @@ -15,6 +18,7 @@ go_library( "//pkg/sentry/kernel/auth", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go new file mode 100644 index 000000000..80eaad965 --- /dev/null +++ b/runsc/specutils/namespace.go @@ -0,0 +1,204 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// nsCloneFlag returns the clone flag that can be used to set a namespace of +// the given type. +func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { + switch nst { + case specs.IPCNamespace: + return syscall.CLONE_NEWIPC + case specs.MountNamespace: + return syscall.CLONE_NEWNS + case specs.NetworkNamespace: + return syscall.CLONE_NEWNET + case specs.PIDNamespace: + return syscall.CLONE_NEWPID + case specs.UTSNamespace: + return syscall.CLONE_NEWUTS + case specs.UserNamespace: + return syscall.CLONE_NEWUSER + case specs.CgroupNamespace: + panic("cgroup namespace has no associated clone flag") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// nsPath returns the path of the namespace for the current process and the +// given namespace. 
+func nsPath(nst specs.LinuxNamespaceType) string { + base := "/proc/self/ns" + switch nst { + case specs.CgroupNamespace: + return filepath.Join(base, "cgroup") + case specs.IPCNamespace: + return filepath.Join(base, "ipc") + case specs.MountNamespace: + return filepath.Join(base, "mnt") + case specs.NetworkNamespace: + return filepath.Join(base, "net") + case specs.PIDNamespace: + return filepath.Join(base, "pid") + case specs.UserNamespace: + return filepath.Join(base, "user") + case specs.UTSNamespace: + return filepath.Join(base, "uts") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// GetNS returns true and the namespace with the given type from the slice of +// namespaces in the spec. It returns false if the slice does not contain a +// namespace with the type. +func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { + if s.Linux == nil { + return specs.LinuxNamespace{}, false + } + for _, ns := range s.Linux.Namespaces { + if ns.Type == nst { + return ns, true + } + } + return specs.LinuxNamespace{}, false +} + +// FilterNS returns a slice of namespaces from the spec with types that match +// those in the `filter` slice. +func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { + if s.Linux == nil { + return nil + } + var out []specs.LinuxNamespace + for _, nst := range filter { + if ns, ok := GetNS(nst, s); ok { + out = append(out, ns) + } + } + return out +} + +// SetNS sets the namespace of the given type. It must be called with +// OSThreadLocked. +func SetNS(fd, nsType uintptr) error { + if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { + return err + } + return nil +} + +// ApplyNS applies the namespace on the current thread and returns a function +// that will restore the namespace to the original value. +// +// Preconditions: Must be called with os thread locked. +func ApplyNS(ns specs.LinuxNamespace) (func(), error) { + log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) + newNS, err := os.Open(ns.Path) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) + } + defer newNS.Close() + + // Store current netns to restore back after child is started. + curPath := nsPath(ns.Type) + oldNS, err := os.Open(curPath) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", curPath, err) + } + + // Set netns to the one requested and setup function to restore it back. + flag := nsCloneFlag(ns.Type) + if err := SetNS(newNS.Fd(), flag); err != nil { + oldNS.Close() + return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) + } + return func() { + log.Infof("restoring namespace %v", ns.Type) + defer oldNS.Close() + if err := SetNS(oldNS.Fd(), flag); err != nil { + panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) + } + }, nil +} + +// StartInNS joins or creates the given namespaces and calls cmd.Start before +// restoring the namespaces to the original values. +func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { + // We are about to setup namespaces, which requires the os thread being + // locked so that Go doesn't change the thread out from under us. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + + for _, ns := range nss { + if ns.Path == "" { + // No path. Just set a flag to create a new namespace. 
+ cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) + continue + } + // Join the given namespace, and restore the current namespace + // before exiting. + restoreNS, err := ApplyNS(ns) + if err != nil { + return err + } + defer restoreNS() + } + + return cmd.Start() +} + +// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. +func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { + if s.Linux == nil { + return + } + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + for _, idMap := range s.Linux.UIDMappings { + log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } + for _, idMap := range s.Linux.GIDMappings { + log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } +} diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 03ab3c4ac..ca91e07ff 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -18,5 +18,6 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index b7d60e712..fc67c174a 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -32,7 +32,7 @@ func init() { rand.Seed(time.Now().UnixNano()) } -func runtime() string { +func getRuntime() string { r := os.Getenv("RUNSC_RUNTIME") if r == "" { return "runsc-test" @@ -43,7 +43,7 @@ func runtime() string { // IsPauseResumeSupported returns true if Pause/Resume is supported by runtime. func IsPauseResumeSupported() bool { // Native host network stack can't be saved. - return !strings.Contains(runtime(), "hostnet") + return !strings.Contains(getRuntime(), "hostnet") } // EnsureSupportedDockerVersion checks if correct docker is installed. @@ -128,7 +128,7 @@ type Docker struct { // Names of containers will be unique. func MakeDocker(namePrefix string) Docker { suffix := fmt.Sprintf("-%06d", rand.Int())[:7] - return Docker{Name: namePrefix + suffix, Runtime: runtime()} + return Docker{Name: namePrefix + suffix, Runtime: getRuntime()} } // Create calls 'docker create' with the arguments provided. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index fc3d61e52..e90ab5ad5 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -23,11 +23,16 @@ import ( "io/ioutil" "net/http" "os" + "os/exec" "path/filepath" + "runtime" + "syscall" + "testing" "time" "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -227,3 +232,55 @@ func WaitForHTTP(port int, timeout time.Duration) error { } return Poll(cb, timeout) } + +// RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create +// a new user namespace and reexecute the test as root inside of the namespace. 
+func RunAsRoot(m *testing.M) { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + panic(err.Error()) + } + if err := caps.Load(); err != nil { + panic(err.Error()) + } + if caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) { + // Capability: check! Good to run. + os.Exit(m.Run()) + } + + // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run + // as root inside that namespace to get it. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + cmd := exec.Command("/proc/self/exe", os.Args...) + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, + // Set current user/group as root inside the namespace. + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + GidMappingsEnableSetgroups: false, + Credential: &syscall.Credential{ + Uid: 0, + Gid: 0, + }, + } + cmd.Env = os.Environ() + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + if exit, ok := err.(*exec.ExitError); ok { + if ws, ok := exit.Sys().(syscall.WaitStatus); ok { + os.Exit(ws.ExitStatus()) + } + os.Exit(-1) + } + panic(fmt.Sprint("error running child process:", err.Error())) + } + os.Exit(0) +} -- cgit v1.2.3 From b9ded9bf399422d09f2f2bd32cd4960d24b424bf Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 27 Aug 2018 13:35:50 -0700 Subject: Add runsc-race target. PiperOrigin-RevId: 210422178 Change-Id: I984dd348d467908bc3180a20fc79b8387fcca05e --- runsc/BUILD | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index a033c7caf..660cb2a06 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -19,3 +19,32 @@ go_binary( "@com_github_google_subcommands//:go_default_library", ], ) + +# The runsc-race target is a race-compatible BUILD target. This must be built +# via "bazel build --features=race //runsc:runsc-race", since the race feature +# must apply to all dependencies due a bug in gazelle file selection. The pure +# attribute must be off because the race detector requires linking with non-Go +# components, although we still require a static binary. +# +# Note that in the future this might be convertible to a compatible target by +# using the pure and static attributes within a select function, but select is +# not currently compatible with string attributes [1]. +# +# [1] https://github.com/bazelbuild/bazel/issues/1698 +go_binary( + name = "runsc-race", + srcs = [ + "main.go", + ], + static = "on", + visibility = [ + "//visibility:public", + ], + x_defs = {"main.gitRevision": "{GIT_REVISION}"}, + deps = [ + "//pkg/log", + "//runsc/boot", + "//runsc/cmd", + "@com_github_google_subcommands//:go_default_library", + ], +) -- cgit v1.2.3 From 5999767d53d6c00d7e0f1966700e2876879f490e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 27 Aug 2018 13:50:50 -0700 Subject: runsc: fsgofer should return a unique QID.Path for each file. Previously, we were only using the host inode id as the QID path. But the host filesystem can have multiple devices with conflicting inode ids. This resulted in duplicate inode ids in the sentry. This CL generates a unique QID for each pair. 
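A minimal standalone sketch of the scheme described above, for orientation only: each host device id is mapped to a small 8-bit token, the host inode number is truncated to its low 56 bits, and the token is packed into the top byte, so identical inode numbers on different host devices no longer collide. The names deviceMap and virtualInode are illustrative; the real change is the attachPoint/makeQID code in the diff that follows, which also guards against token overflow.

package main

import (
	"fmt"
	"sync"
)

// deviceMap hands out a small 8-bit token per host device id, mirroring the
// devices/nextDevice bookkeeping added to attachPoint in the diff below.
type deviceMap struct {
	mu   sync.Mutex
	next uint8
	ids  map[uint64]uint8
}

// virtualInode packs the device token into the top 8 bits of the host inode
// number, truncating the inode to its low 56 bits.
func (d *deviceMap) virtualInode(dev, ino uint64) uint64 {
	d.mu.Lock()
	defer d.mu.Unlock()
	tok, ok := d.ids[dev]
	if !ok {
		tok = d.next
		d.ids[dev] = tok
		d.next++
	}
	return uint64(tok)<<56 | (ino & 0x00ffffffffffffff)
}

func main() {
	dm := &deviceMap{ids: make(map[uint64]uint8)}
	// The same inode number on two different devices yields two distinct ids.
	fmt.Printf("%#x\n", dm.virtualInode(0x801, 42)) // token 0 -> 0x2a
	fmt.Printf("%#x\n", dm.virtualInode(0x802, 42)) // token 1 -> 0x010000000000002a
}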
PiperOrigin-RevId: 210424813 Change-Id: I16d106f61c7c8f910c0da4ceec562a010ffca2fb --- runsc/fsgofer/fsgofer.go | 128 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 40 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 1316dc618..b325afa63 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -91,14 +91,29 @@ type attachPoint struct { prefix string conf Config - mu sync.Mutex - attached bool + // attachedMu protects attached. + attachedMu sync.Mutex + attached bool + + // deviceMu protects devices and nextDevice. + deviceMu sync.Mutex + + // nextDevice is the next device id that will be allocated. + nextDevice uint8 + + // devices is a map from actual host devices to "small" integers that + // can be combined with host inode to form a unique virtual inode id. + devices map[uint64]uint8 } // NewAttachPoint creates a new attacher that gives local file // access to all files under 'prefix'. func NewAttachPoint(prefix string, c Config) p9.Attacher { - return &attachPoint{prefix: prefix, conf: c} + return &attachPoint{ + prefix: prefix, + conf: c, + devices: make(map[uint64]uint8), + } } // Attach implements p9.Attacher. @@ -131,21 +146,45 @@ func (a *attachPoint) Attach(appPath string) (p9.File, error) { return nil, fmt.Errorf("failed to stat file %q, err: %v", root, err) } - a.mu.Lock() - defer a.mu.Unlock() + a.attachedMu.Lock() + defer a.attachedMu.Unlock() if a.attached { f.Close() return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } a.attached = true - return newLocalFile(a.conf, f, root, stat) + return newLocalFile(a, f, root, stat) } -func makeQID(stat syscall.Stat_t) p9.QID { +// makeQID returns a unique QID for the given stat buffer. +func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { + a.deviceMu.Lock() + defer a.deviceMu.Unlock() + + // First map the host device id to a unique 8-bit integer. + dev, ok := a.devices[stat.Dev] + if !ok { + a.devices[stat.Dev] = a.nextDevice + dev = a.nextDevice + a.nextDevice++ + if a.nextDevice < dev { + panic(fmt.Sprintf("device id overflow! map: %+v", a.devices)) + } + } + + // Construct a "virtual" inode id with the uint8 device number in the + // first 8 bits, and the rest of the bits from the host inode id. + maskedIno := stat.Ino & 0x00ffffffffffffff + if maskedIno != stat.Ino { + log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino) + } + ino := uint64(dev)<<56 | maskedIno + log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino) + return p9.QID{ Type: p9.FileMode(stat.Mode).QIDType(), - Path: stat.Ino, + Path: ino, } } @@ -193,6 +232,9 @@ func isNameValid(name string) bool { type localFile struct { p9.DefaultWalkGetAttr + // attachPoint is the attachPoint that serves this localFile. + attachPoint *attachPoint + // mu protects 'hostPath' when file is renamed. mu sync.Mutex @@ -213,8 +255,6 @@ type localFile struct { ft fileType - conf Config - // readDirMu protects against concurrent Readdir calls. readDirMu sync.Mutex } @@ -228,7 +268,8 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { symlinkIdx := len(modes) - 1 startIdx := 0 - if parent.conf.ROMount || parent.conf.LazyOpenForWrite { + conf := parent.attachPoint.conf + if conf.ROMount || conf.LazyOpenForWrite { // Skip attempt to open in RDWR based on configuration. 
startIdx = 1 } @@ -269,7 +310,7 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { return os.NewFile(uintptr(fd), newPath), newPath, nil } -func newLocalFile(conf Config, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) { +func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) { var ft fileType switch stat.Mode & syscall.S_IFMT { case syscall.S_IFREG: @@ -282,9 +323,9 @@ func newLocalFile(conf Config, file *os.File, path string, stat syscall.Stat_t) return nil, syscall.EINVAL } return &localFile{ + attachPoint: a, hostPath: path, controlFile: file, - conf: conf, mode: invalidMode, ft: ft, }, nil @@ -338,7 +379,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Check if control file can be used or if a new open must be created. var newFile *os.File - if mode == p9.ReadOnly || !l.conf.LazyOpenForWrite { + if mode == p9.ReadOnly || !l.attachPoint.conf.LazyOpenForWrite { log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name()) newFile = l.controlFile } else { @@ -372,13 +413,14 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Set fields on success l.openedFile = newFile l.mode = mode - return fd, makeQID(stat), 0, nil + return fd, l.attachPoint.makeQID(stat), 0, nil } // Create implements p9.File. func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { - if l.conf.ROMount { - if l.conf.PanicOnWrite { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { panic("attempt to write to RO mount") } return nil, nil, p9.QID{}, 0, syscall.EBADF @@ -423,19 +465,20 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid cPath := path.Join(l.hostPath, name) f := os.NewFile(uintptr(fd), cPath) c := &localFile{ + attachPoint: l.attachPoint, hostPath: cPath, controlFile: f, openedFile: f, mode: mode, - conf: l.conf, } - return newFDMaybe(c.openedFile), c, makeQID(stat), 0, nil + return newFDMaybe(c.openedFile), c, l.attachPoint.makeQID(stat), 0, nil } // Mkdir implements p9.File. func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { - if l.conf.ROMount { - if l.conf.PanicOnWrite { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { panic("attempt to write to RO mount") } return p9.QID{}, syscall.EBADF @@ -464,7 +507,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) if err != nil { return p9.QID{}, extractErrno(err) } - return makeQID(stat), nil + return l.attachPoint.makeQID(stat), nil } // Walk implements p9.File. 
@@ -485,12 +528,12 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { defer l.mu.Unlock() c := &localFile{ + attachPoint: l.attachPoint, hostPath: l.hostPath, controlFile: os.NewFile(uintptr(newFd), l.hostPath), mode: invalidMode, - conf: l.conf, } - return []p9.QID{makeQID(stat)}, c, nil + return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil } var qids []p9.QID @@ -508,12 +551,12 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { if err != nil { return nil, nil, extractErrno(err) } - c, err := newLocalFile(last.conf, f, path, stat) + c, err := newLocalFile(last.attachPoint, f, path, stat) if err != nil { return nil, nil, extractErrno(err) } - qids = append(qids, makeQID(stat)) + qids = append(qids, l.attachPoint.makeQID(stat)) last = c } return qids, last, nil @@ -586,15 +629,16 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) CTime: true, } - return makeQID(stat), valid, attr, nil + return l.attachPoint.makeQID(stat), valid, attr, nil } // SetAttr implements p9.File. Due to mismatch in file API, options // cannot be changed atomicaly and user may see partial changes when // an error happens. func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { - if l.conf.ROMount { - if l.conf.PanicOnWrite { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { panic("attempt to write to RO mount") } return syscall.EBADF @@ -624,7 +668,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } fd := l.controlFD() - if l.conf.LazyOpenForWrite && l.ft == regular { + if conf.LazyOpenForWrite && l.ft == regular { // Regular files are opened in RO mode when lazy open is set. // Thus it needs to be reopened here for write. f, err := os.OpenFile(l.hostPath, openFlags|os.O_WRONLY, 0) @@ -733,8 +777,9 @@ func (*localFile) Remove() error { // Rename implements p9.File. func (l *localFile) Rename(directory p9.File, name string) error { - if l.conf.ROMount { - if l.conf.PanicOnWrite { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { panic("attempt to write to RO mount") } return syscall.EBADF @@ -803,8 +848,9 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { // Symlink implements p9.File. func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { - if l.conf.ROMount { - if l.conf.PanicOnWrite { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { panic("attempt to write to RO mount") } return p9.QID{}, syscall.EBADF @@ -831,13 +877,14 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. if err != nil { return p9.QID{}, extractErrno(err) } - return makeQID(stat), nil + return l.attachPoint.makeQID(stat), nil } // Link implements p9.File. func (l *localFile) Link(target p9.File, newName string) error { - if l.conf.ROMount { - if l.conf.PanicOnWrite { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { panic("attempt to write to RO mount") } return syscall.EBADF @@ -862,8 +909,9 @@ func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ // UnlinkAt implements p9.File. 
func (l *localFile) UnlinkAt(name string, flags uint32) error { - if l.conf.ROMount { - if l.conf.PanicOnWrite { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { panic("attempt to write to RO mount") } return syscall.EBADF @@ -906,7 +954,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { if err != nil { continue } - qid := makeQID(stat) + qid := l.attachPoint.makeQID(stat) dirents = append(dirents, p9.Dirent{ QID: qid, Type: qid.Type, -- cgit v1.2.3 From 0b3bfe2ea30d491a6533f8ee74eb6e3cea707f06 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 27 Aug 2018 14:25:21 -0700 Subject: fs: Fix remote-revalidate cache policy. When revalidating a Dirent, if the inode id is the same, then we don't need to throw away the entire Dirent. We can just update the unstable attributes in place. If the inode id has changed, then the remote file has been deleted or moved, and we have no choice but to throw away the dirent we have a look up another. In this case, we may still end up losing a mounted dirent that is a child of the revalidated dirent. However, that seems appropriate here because the entire mount point has been pulled out from underneath us. Because gVisor's overlay is at the Inode level rather than the Dirent level, we must pass the parent Inode and name along with the Inode that is being revalidated. PiperOrigin-RevId: 210431270 Change-Id: I705caef9c68900234972d5aac4ae3a78c61c7d42 --- pkg/sentry/fs/attr.go | 19 -------- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/fsutil/inode_cached.go | 41 ++++++++++++++++ pkg/sentry/fs/gofer/cache_policy.go | 64 ++++++++++++++++++++---- pkg/sentry/fs/gofer/gofer_test.go | 95 ++++++++++++++++++++++++++---------- pkg/sentry/fs/gofer/session.go | 6 +-- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 13 +++-- pkg/sentry/fs/mount_overlay.go | 20 +++++--- pkg/sentry/fs/tty/fs.go | 2 +- runsc/container/container_test.go | 18 +++---- 11 files changed, 201 insertions(+), 81 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 4178f18b2..091f4ac63 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -213,25 +213,6 @@ func (a AttrMask) Empty() bool { return a == AttrMask{} } -// Union returns an AttrMask containing the inclusive disjunction of fields in a and b. -func (a AttrMask) Union(b AttrMask) AttrMask { - return AttrMask{ - Type: a.Type || b.Type, - DeviceID: a.DeviceID || b.DeviceID, - InodeID: a.InodeID || b.InodeID, - BlockSize: a.BlockSize || b.BlockSize, - Size: a.Size || b.Size, - Usage: a.Usage || b.Usage, - Perms: a.Perms || b.Perms, - UID: a.UID || b.UID, - GID: a.GID || b.GID, - AccessTime: a.AccessTime || b.AccessTime, - ModificationTime: a.ModificationTime || b.ModificationTime, - StatusChangeTime: a.StatusChangeTime || b.StatusChangeTime, - Links: a.Links || b.Links, - } -} - // PermMask are file access permissions. // // +stateify savable diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index c1dfa0de7..5587582b5 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -499,7 +499,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // // We never allow the file system to revalidate mounts, that could cause them // to unexpectedly drop out before umount. - if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, cd.Inode) { + if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, name, d.Inode, cd.Inode) { // Good to go. This is the fast-path. 
return cd, nil } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 0a320e2d8..6777c8bf7 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -427,6 +427,47 @@ func (c *CachingInodeOperations) touchStatusChangeTimeLocked(ctx context.Context c.dirtyAttr.StatusChangeTime = true } +// UpdateUnstable updates the cached unstable attributes. Only non-dirty +// attributes are updated. +func (c *CachingInodeOperations) UpdateUnstable(attr fs.UnstableAttr) { + // All attributes are protected by attrMu. + c.attrMu.Lock() + + if !c.dirtyAttr.Usage { + c.attr.Usage = attr.Usage + } + if !c.dirtyAttr.Perms { + c.attr.Perms = attr.Perms + } + if !c.dirtyAttr.UID { + c.attr.Owner.UID = attr.Owner.UID + } + if !c.dirtyAttr.GID { + c.attr.Owner.GID = attr.Owner.GID + } + if !c.dirtyAttr.AccessTime { + c.attr.AccessTime = attr.AccessTime + } + if !c.dirtyAttr.ModificationTime { + c.attr.ModificationTime = attr.ModificationTime + } + if !c.dirtyAttr.StatusChangeTime { + c.attr.StatusChangeTime = attr.StatusChangeTime + } + if !c.dirtyAttr.Links { + c.attr.Links = attr.Links + } + + // Size requires holding attrMu and dataMu. + c.dataMu.Lock() + if !c.dirtyAttr.Size { + c.attr.Size = attr.Size + } + c.dataMu.Unlock() + + c.attrMu.Unlock() +} + // Read reads from frames and otherwise directly from the backing file // into dst starting at offset until dst is full, EOF is reached, or an // error is encountered. diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index fa8abf51c..98f43c578 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -17,6 +17,7 @@ package gofer import ( "fmt" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ) @@ -108,25 +109,68 @@ func (cp cachePolicy) writeThrough(inode *fs.Inode) bool { return cp == cacheNone || cp == cacheAllWritethrough } -// revalidateDirent indicates that a dirent should be revalidated after a -// lookup, because the looked up version may be stale. -func (cp cachePolicy) revalidateDirent() bool { +// revalidate revalidates the child Inode if the cache policy allows it. +// +// Depending on the cache policy, revalidate will walk from the parent to the +// child inode, and if any unstable attributes have changed, will update the +// cached attributes on the child inode. If the walk fails, or the returned +// inode id is different from the one being revalidated, then the entire Dirent +// must be reloaded. +func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child *fs.Inode) bool { if cp == cacheAll || cp == cacheAllWritethrough { return false } - // TODO: The cacheRemoteRevalidating policy should only - // return true if the remote file's attributes have changed. - return true + if cp == cacheNone { + return true + } + + childIops, ok := child.InodeOperations.(*inodeOperations) + if !ok { + panic(fmt.Sprintf("revalidating inode operations of unknown type %T", child.InodeOperations)) + } + parentIops, ok := parent.InodeOperations.(*inodeOperations) + if !ok { + panic(fmt.Sprintf("revalidating inode operations with parent of unknown type %T", parent.InodeOperations)) + } + + // Walk from parent to child again. + // + // TODO: If we have a directory FD in the parent + // inodeOperations, then we can use fstatat(2) to get the inode + // attributes instead of making this RPC. 
+ qids, _, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) + if err != nil { + // Can't look up the name. Trigger reload. + return true + } + + // If the Path has changed, then we are not looking at the file file. + // We must reload. + if qids[0].Path != childIops.fileState.key.Inode { + return true + } + + // If we are not caching unstable attrs, then there is nothing to + // update on this inode. + if !cp.cacheUAttrs(child) { + return false + } + + // Update the inode's cached unstable attrs. + s := childIops.session() + childIops.cachingInodeOps.UpdateUnstable(unstable(ctx, mask, attr, s.mounter, s.client)) + + return false } -// keepDirent indicates that dirents should be kept pinned in the dirent tree -// even if there are no application references on the file. -func (cp cachePolicy) keepDirent(inode *fs.Inode) bool { +// keep indicates that dirents should be kept pinned in the dirent tree even if +// there are no application references on the file. +func (cp cachePolicy) keep(d *fs.Dirent) bool { if cp == cacheNone { return false } - sattr := inode.StableAttr + sattr := d.Inode.StableAttr // NOTE: Only cache files, directories, and symlinks. return fs.IsFile(sattr) || fs.IsDir(sattr) || fs.IsSymlink(sattr) } diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 45fdaacfd..c8d7bd773 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -151,41 +151,60 @@ func TestLookup(t *testing.T) { func TestRevalidation(t *testing.T) { tests := []struct { - cachePolicy cachePolicy - preModificationWantReval bool - postModificationWantReval bool + cachePolicy cachePolicy + + // Whether dirent should be reloaded before any modifications. + preModificationWantReload bool + + // Whether dirent should be reloaded after updating an unstable + // attribute on the remote fs. + postModificationWantReload bool + + // Whether dirent unstable attributes should be updated after + // updating an attribute on the remote fs. + postModificationWantUpdatedAttrs bool + + // Whether dirent should be reloaded after the remote has + // removed the file. + postRemovalWantReload bool }{ { // Policy cacheNone causes Revalidate to always return // true. - cachePolicy: cacheNone, - preModificationWantReval: true, - postModificationWantReval: true, + cachePolicy: cacheNone, + preModificationWantReload: true, + postModificationWantReload: true, + postModificationWantUpdatedAttrs: true, + postRemovalWantReload: true, }, { // Policy cacheAll causes Revalidate to always return // false. - cachePolicy: cacheAll, - preModificationWantReval: false, - postModificationWantReval: false, + cachePolicy: cacheAll, + preModificationWantReload: false, + postModificationWantReload: false, + postModificationWantUpdatedAttrs: false, + postRemovalWantReload: false, }, { // Policy cacheAllWritethrough causes Revalidate to // always return false. - cachePolicy: cacheAllWritethrough, - preModificationWantReval: false, - postModificationWantReval: false, + cachePolicy: cacheAllWritethrough, + preModificationWantReload: false, + postModificationWantReload: false, + postModificationWantUpdatedAttrs: false, + postRemovalWantReload: false, }, { // Policy cacheRemoteRevalidating causes Revalidate to - // always return true. - // - // TODO: The cacheRemoteRevalidating - // policy should only return true if the remote file's - // attributes have changed. 
- cachePolicy: cacheRemoteRevalidating, - preModificationWantReval: true, - postModificationWantReval: true, + // return update cached unstable attrs, and returns + // true only when the remote inode itself has been + // removed or replaced. + cachePolicy: cacheRemoteRevalidating, + preModificationWantReload: false, + postModificationWantReload: false, + postModificationWantUpdatedAttrs: true, + postRemovalWantReload: true, }, } @@ -227,15 +246,17 @@ func TestRevalidation(t *testing.T) { if err != nil { t.Fatalf("Lookup(%q) failed: %v", name, err) } - if test.preModificationWantReval && dirent == newDirent { + if test.preModificationWantReload && dirent == newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) } - if !test.preModificationWantReval && dirent != newDirent { + if !test.preModificationWantReload && dirent != newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) } // Modify the underlying mocked file's modification time. - file.GetAttrMock.Attr.MTimeSeconds = uint64(time.Now().Unix()) + nowSeconds := time.Now().Unix() + rootFile.WalkGetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) + file.GetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) // Walk again. Depending on the cache policy, we may get a new // dirent. @@ -243,12 +264,36 @@ func TestRevalidation(t *testing.T) { if err != nil { t.Fatalf("Lookup(%q) failed: %v", name, err) } - if test.postModificationWantReval && dirent == newDirent { + if test.postModificationWantReload && dirent == newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) } - if !test.postModificationWantReval && dirent != newDirent { + if !test.postModificationWantReload && dirent != newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) } + uattrs, err := newDirent.Inode.UnstableAttr(ctx) + if err != nil { + t.Fatalf("Error getting unstable attrs: %v", err) + } + gotModTimeSeconds := uattrs.ModificationTime.Seconds() + if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { + t.Fatalf("Lookup(%q) with cachePolicy=%s got new modification time %v, wanted %v", name, test.cachePolicy, gotModTimeSeconds, nowSeconds) + } + + // Make WalkGetAttr return ENOENT. This simulates + // removing the file from the remote fs. + rootFile.WalkGetAttrMock = p9test.WalkGetAttrMock{ + Err: syscall.ENOENT, + } + + // Walk again. Depending on the cache policy, we may + // get ENOENT. + newDirent, err = rootDir.Walk(ctx, rootDir, name) + if test.postRemovalWantReload && err == nil { + t.Errorf("Lookup(%q) with cachePolicy=%s got nil error, wanted ENOENT", name, test.cachePolicy) + } + if !test.postRemovalWantReload && (err != nil || dirent != newDirent) { + t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v and error %v, wanted old dirent %v and nil error", name, test.cachePolicy, newDirent, err, dirent) + } }) } } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index eeb9087e9..49d27ee88 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -146,13 +146,13 @@ func (s *session) Destroy() { } // Revalidate implements MountSource.Revalidate. 
-func (s *session) Revalidate(ctx context.Context, i *fs.Inode) bool { - return s.cachePolicy.revalidateDirent() +func (s *session) Revalidate(ctx context.Context, name string, parent, child *fs.Inode) bool { + return s.cachePolicy.revalidate(ctx, name, parent, child) } // Keep implements MountSource.Keep. func (s *session) Keep(d *fs.Dirent) bool { - return s.cachePolicy.keepDirent(d.Inode) + return s.cachePolicy.keep(d) } // ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 89a0103ba..846b6e8bb 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -68,7 +68,7 @@ func NewMockMountSource(cache *DirentCache) *MountSource { } // Revalidate implements fs.MountSourceOperations.Revalidate. -func (n *MockMountSourceOps) Revalidate(context.Context, *Inode) bool { +func (n *MockMountSourceOps) Revalidate(context.Context, string, *Inode, *Inode) bool { return n.revalidate } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 455f5b35c..8345876fc 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -27,10 +27,13 @@ import ( // DirentOperations provide file systems greater control over how long a Dirent stays pinned // in core. Implementations must not take Dirent.mu. type DirentOperations interface { - // Revalidate returns true if the Inode is stale and its - // InodeOperations needs to be reloaded. Revalidate will never be - // called on a Inode that is mounted. - Revalidate(ctx context.Context, inode *Inode) bool + // Revalidate is called during lookup each time we encounter a Dirent + // in the cache. Implementations may update stale properties of the + // child Inode. If Revalidate returns true, then the entire Inode will + // be reloaded. + // + // Revalidate will never be called on a Inode that is mounted. + Revalidate(ctx context.Context, name string, parent, child *Inode) bool // Keep returns true if the Dirent should be kept in memory for as long // as possible beyond any active references. @@ -281,7 +284,7 @@ type SimpleMountSourceOperations struct { } // Revalidate implements MountSourceOperations.Revalidate. -func (smo *SimpleMountSourceOperations) Revalidate(context.Context, *Inode) bool { +func (smo *SimpleMountSourceOperations) Revalidate(context.Context, string, *Inode, *Inode) bool { return smo.revalidate } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 9fa87c10f..dbc608c7e 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -41,23 +41,29 @@ func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *M // delegating to the upper filesystem's Revalidate method. We cannot reload // files from the lower filesystem, so we panic if the lower filesystem's // Revalidate method returns true. -func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, inode *Inode) bool { - if inode.overlay == nil { +func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, name string, parent, child *Inode) bool { + if child.overlay == nil { panic("overlay cannot revalidate inode that is not an overlay") } - // Should we bother checking this, or just ignore? - if inode.overlay.lower != nil && o.lower.Revalidate(ctx, inode.overlay.lower) { + // Revalidate is never called on a mount point, so parent and child + // must be from the same mount, and thus must both be overlay inodes. 
+ if parent.overlay == nil { + panic("trying to revalidate an overlay inode but the parent is not an overlay") + } + + // We can't revalidate from the lower filesystem. + if child.overlay.lower != nil && o.lower.Revalidate(ctx, name, parent.overlay.lower, child.overlay.lower) { panic("an overlay cannot revalidate file objects from the lower fs") } - if inode.overlay.upper == nil { - // Nothing to revalidate. + // Do we have anything to revalidate? + if child.overlay.upper == nil { return false } // Does the upper require revalidation? - return o.upper.Revalidate(ctx, inode.overlay.upper) + return o.upper.Revalidate(ctx, name, parent.overlay.upper, child.overlay.upper) } // Keep implements MountSourceOperations by delegating to the upper diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index fe7da05b5..d9f8f02f3 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -82,7 +82,7 @@ type superOperations struct{} // Slave entries are dropped from dir when their master is closed, so an // existing slave Dirent in the tree is not sufficient to guarantee that it // still exists on the filesystem. -func (superOperations) Revalidate(context.Context, *fs.Inode) bool { +func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool { return true } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 25aaf3f86..4ce3afc91 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -134,7 +134,13 @@ func waitForFile(f *os.File) error { } return nil } - return testutil.Poll(op, 5*time.Second) + + timeout := 5 * time.Second + if testutil.RaceEnabled { + // Race makes slow things even slow, so bump the timeout. + timeout = 3 * timeout + } + return testutil.Poll(op, timeout) } // readOutputNum reads a file at given filepath and returns the int at the @@ -213,10 +219,8 @@ const ( nonExclusiveFS ) -// TODO: nonExclusiveFS was removed because it causes timeout -// with --race. Put it back when bug is fixed. -var all = []configOption{overlay, kvm} -var noOverlay = []configOption{kvm} +var noOverlay = []configOption{kvm, nonExclusiveFS} +var all = append(noOverlay, overlay) // configs generates different configurations to run tests. func configs(opts ...configOption) []*boot.Config { @@ -1572,10 +1576,6 @@ func TestContainerVolumeContentsShared(t *testing.T) { // the filesystem. spec := testutil.NewSpecWithArgs("sleep", "1000") - // TODO: $TEST_TMPDIR mount is mistakenly marked as RO after - // revalidation. Remove when it's fixed. - spec.Root.Readonly = false - dir, err := ioutil.TempDir(testutil.TmpDir(), "root-fs-test") if err != nil { t.Fatalf("TempDir failed: %v", err) -- cgit v1.2.3 From a4529c1b5b485f6283367bfdc0e4228bbbd3e51f Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 27 Aug 2018 20:33:38 -0700 Subject: runsc: Fix readonly filesystem causing failure to create containers. For readonly filesystems specified via relative path, we were forgetting to mount relative to the container's bundle directory. 
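A minimal sketch of the fix described above, assuming a Linux build; the helper name remountRootReadonly is illustrative and error handling is trimmed. The actual change to setupFS follows in the diff: a relative root path must be resolved against the bundle directory before issuing the read-only bind remount, otherwise the mount call targets the wrong path.

package container

import (
	"path/filepath"
	"syscall"
)

// remountRootReadonly resolves a relative root path against the container's
// bundle directory, then bind-remounts it read-only in place.
func remountRootReadonly(rootPath, bundleDir string) error {
	src := rootPath
	if !filepath.IsAbs(src) {
		src = filepath.Join(bundleDir, src)
	}
	flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC)
	return syscall.Mount(src, src, "bind", flags, "")
}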
PiperOrigin-RevId: 210483388 Change-Id: I84809fce4b1f2056d0e225547cb611add5f74177 --- runsc/container/BUILD | 1 - runsc/container/fs.go | 21 +++++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index cba418d0c..b86974d41 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -29,7 +29,6 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/container/fs.go b/runsc/container/fs.go index 652f81bbf..c12f5c331 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -22,7 +22,6 @@ import ( "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -84,29 +83,29 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { } srcfi, err := os.Stat(src) if err != nil { - return err + return fmt.Errorf("failed to stat() mount source: %v", err) } // It's possible that 'm.Destination' follows symlinks inside the // container. dst, err := resolveSymlinks(spec.Root.Path, m.Destination) if err != nil { - return err + return fmt.Errorf("failed to resolve symlinks: %v", err) } // Create mount point if it doesn't exits if _, err := os.Stat(dst); os.IsNotExist(err) { if srcfi.IsDir() { if err := os.MkdirAll(dst, 0755); err != nil { - return err + return fmt.Errorf("failed to make mount directory %q: %v", dst, err) } } else { if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil { - return err + return fmt.Errorf("failed to make mount directory for file %q: %v", filepath.Dir(dst), err) } f, err := os.OpenFile(dst, os.O_CREATE, 0755) if err != nil { - return err + return fmt.Errorf("failed to open mount file %q: %v", dst, err) } f.Close() } @@ -116,7 +115,7 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { flags |= syscall.MS_BIND log.Infof("Mounting src: %q, dst: %q, flags: %#x", src, dst, flags) if err := syscall.Mount(src, dst, m.Type, uintptr(flags), ""); err != nil { - return err + return fmt.Errorf("failed to mount src: %q, dst: %q, flags: %#x, err: %v", src, dst, flags, err) } } @@ -124,7 +123,13 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { if spec.Root.Readonly { log.Infof("Remounting root as readonly: %q", spec.Root.Path) flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) - return unix.Mount(spec.Root.Path, spec.Root.Path, "bind", flags, "") + src := spec.Root.Path + if !filepath.IsAbs(src) { + src = filepath.Join(bundleDir, src) + } + if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { + return fmt.Errorf("failed to remount root as readonly with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) + } } return nil } -- cgit v1.2.3 From ae648bafda2d82a6641e4a28bed34dae40d426ec Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 27 Aug 2018 20:35:00 -0700 Subject: Add command-line parameter to trigger panic on signal This is to troubleshoot problems with a hung process that is not responding to 'runsc debug --stack' command. 
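A standalone sketch of the mechanism described above; the helper name installPanicSignal is illustrative, and the real wiring is the flag, config, and loader changes in the diff below. The idea is simply to register the chosen signal and panic when it arrives, so a hung process aborts with a Go traceback instead of staying silent.

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

// installPanicSignal panics the process when the given signal is received,
// which forces a crash with a Go traceback for troubleshooting hangs.
func installPanicSignal(sig syscall.Signal) {
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, sig)
	go func() {
		<-ch
		panic(fmt.Sprintf("panic requested by signal %v", sig))
	}()
}

func main() {
	installPanicSignal(syscall.SIGUSR2)
	fmt.Println("pid", os.Getpid(), "- send SIGUSR2 to trigger a panic")
	select {} // stand-in for a hung process
}

Sending "kill -USR2 <pid>" to the running sketch makes it abort with a traceback; how many goroutine stacks are printed depends on the GOTRACEBACK setting.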
PiperOrigin-RevId: 210483513 Change-Id: I4377b210b4e51bc8a281ad34fd94f3df13d9187d --- pkg/sentry/sighandling/sighandling.go | 5 ++--- runsc/boot/config.go | 6 ++++++ runsc/boot/loader.go | 14 +++++++++++++- runsc/cmd/debug.go | 10 ++++++++++ runsc/main.go | 2 ++ 5 files changed, 33 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 25295440c..5bac3a4e1 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -103,7 +103,7 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha // PrepareForwarding ensures that synchronous signals are forwarded to k and // returns a callback that starts signal delivery, which itself returns a // callback that stops signal forwarding. -func PrepareForwarding(k *kernel.Kernel, enablePanicSignal bool) func() func() { +func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func() { start := make(chan struct{}) stop := make(chan struct{}) @@ -119,8 +119,7 @@ func PrepareForwarding(k *kernel.Kernel, enablePanicSignal bool) func() func() { sigchan := make(chan os.Signal, 1) sigchans = append(sigchans, sigchan) - // SignalPanic is handled by Run. - if enablePanicSignal && linux.Signal(sig) == kernel.SignalPanic { + if syscall.Signal(sig) == skipSignal { continue } diff --git a/runsc/boot/config.go b/runsc/boot/config.go index bc392deb3..efb8563ea 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -204,7 +204,12 @@ type Config struct { // TODO: Remove this when multiple container is fully supported. MultiContainer bool + // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action + + // PanicSignal register signal handling that panics. Usually set to + // SIGUSR2(12) to troubleshoot hangs. -1 disables it. + PanicSignal int } // ToFlags returns a slice of flags that correspond to the given Config. @@ -225,5 +230,6 @@ func (c *Config) ToFlags() []string { "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), "--watchdog-action=" + c.WatchdogAction.String(), + "--panic-signal=" + strconv.Itoa(c.PanicSignal), } } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 3963ed55d..0ad830a6b 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,6 +20,7 @@ import ( "fmt" "math/rand" "os" + "os/signal" "runtime" "sync" "sync/atomic" @@ -229,7 +230,18 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) } // Ensure that signals received are forwarded to the emulated kernel. - stopSignalForwarding := sighandling.PrepareForwarding(k, false)() + ps := syscall.Signal(conf.PanicSignal) + stopSignalForwarding := sighandling.PrepareForwarding(k, ps)() + if conf.PanicSignal != -1 { + // Panics if the sentry receives 'conf.PanicSignal'. + panicChan := make(chan os.Signal, 1) + signal.Notify(panicChan, ps) + go func() { // S/R-SAFE: causes sentry panic. 
+ <-panicChan + panic("Signal-induced panic") + }() + log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) + } procArgs, err := newProcess(spec, creds, utsns, ipcns, k) if err != nil { diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 7952489de..b20987b2c 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -15,6 +15,8 @@ package cmd import ( + "syscall" + "context" "flag" "github.com/google/subcommands" @@ -27,6 +29,7 @@ import ( type Debug struct { pid int stacks bool + signal int } // Name implements subcommands.Command. @@ -48,6 +51,7 @@ func (*Debug) Usage() string { func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set") f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") + f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") } // Execute implements subcommands.Command.Execute. @@ -96,6 +100,12 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("sandbox %q is not running", c.Sandbox.ID) } + if d.signal > 0 { + log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid) + if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil { + Fatalf("failed to send signal %d to processs %d", d.signal, c.Sandbox.Pid) + } + } if d.stacks { log.Infof("Retrieving sandbox stacks") stacks, err := c.Sandbox.Stacks() diff --git a/runsc/main.go b/runsc/main.go index 0a2cbca6c..773ec6486 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -61,6 +61,7 @@ var ( overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") ) var gitRevision = "" @@ -139,6 +140,7 @@ func main() { StraceLogSize: *straceLogSize, MultiContainer: *multiContainer, WatchdogAction: wa, + PanicSignal: *panicSignal, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") -- cgit v1.2.3 From f7366e4e6465530ecc1641312011fd82a94f55f8 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 28 Aug 2018 11:29:53 -0700 Subject: Consolidate image tests into a single file This is to keep it consistent with other test, and it's easier to maintain them in single file. Also increase python test timeout to deflake it. 
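The consolidation below copies TestPythonHello and TestTomcat verbatim into image_test.go (and raises the python-hello HTTP wait from 10s to 20s to deflake it). Both tests share the same pull → run → FindPort → WaitForHTTP → GET flow; as an illustrative aside only — not what this commit does — that flow could be factored into a single helper. The sketch assumes the testutil API exactly as used in the diff (Pull, MakeDocker, Run, FindPort, WaitForHTTP, CleanUp); the helper itself is hypothetical:

    // Hypothetical helper; the commit keeps the two test functions separate.
    package image

    import (
        "fmt"
        "net/http"
        "strconv"
        "testing"
        "time"

        "gvisor.googlesource.com/gvisor/runsc/test/testutil"
    )

    // testHTTPImage pulls and runs an image that serves HTTP on ctrPort and
    // verifies that the server answers 200 within timeout.
    func testHTTPImage(t *testing.T, image, name string, ctrPort int, timeout time.Duration) {
        if err := testutil.Pull(image); err != nil {
            t.Fatalf("docker pull failed: %v", err)
        }
        d := testutil.MakeDocker(name)
        if _, err := d.Run("-p", strconv.Itoa(ctrPort), image); err != nil {
            t.Fatalf("docker run failed: %v", err)
        }
        defer d.CleanUp()

        port, err := d.FindPort(ctrPort)
        if err != nil {
            t.Fatalf("docker.FindPort(%d) failed: %v", ctrPort, err)
        }
        if err := testutil.WaitForHTTP(port, timeout); err != nil {
            t.Fatalf("WaitForHTTP() timeout: %v", err)
        }
        resp, err := http.Get(fmt.Sprintf("http://localhost:%d", port))
        if err != nil {
            t.Fatalf("error reaching http server: %v", err)
        }
        if want := http.StatusOK; resp.StatusCode != want {
            t.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want)
        }
    }

With such a helper, TestPythonHello would reduce to testHTTPImage(t, "google/python-hello", "python-hello-test", 8080, 20*time.Second) and TestTomcat to testHTTPImage(t, "tomcat:8.0", "tomcat-test", 8080, 10*time.Second).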
PiperOrigin-RevId: 210575042 Change-Id: I2ef5bcd5d97c08549f0c5f645c4b694253ef0b4d --- runsc/test/image/BUILD | 2 -- runsc/test/image/image_test.go | 64 +++++++++++++++++++++++++++++++++++++++++ runsc/test/image/python_test.go | 56 ------------------------------------ runsc/test/image/tomcat_test.go | 56 ------------------------------------ 4 files changed, 64 insertions(+), 114 deletions(-) delete mode 100644 runsc/test/image/python_test.go delete mode 100644 runsc/test/image/tomcat_test.go (limited to 'runsc') diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index fda6f2d9c..5854eec12 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -7,8 +7,6 @@ go_test( size = "large", srcs = [ "image_test.go", - "python_test.go", - "tomcat_test.go", ], data = [ "latin10k.txt", diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 962c31b24..b1e5e726a 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -192,6 +192,70 @@ func TestMysql(t *testing.T) { } } +func TestPythonHello(t *testing.T) { + if err := testutil.Pull("google/python-hello"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("python-hello-test") + if _, err := d.Run("-p", "8080", "google/python-hello"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + // Find where port 8080 is mapped to. + port, err := d.FindPort(8080) + if err != nil { + t.Fatalf("docker.FindPort(8080) failed: %v", err) + } + + // Wait until it's up and running. + if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) + } + + // Ensure that content is being served. + url := fmt.Sprintf("http://localhost:%d", port) + resp, err := http.Get(url) + if err != nil { + t.Errorf("Error reaching http server: %v", err) + } + if want := http.StatusOK; resp.StatusCode != want { + t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } +} + +func TestTomcat(t *testing.T) { + if err := testutil.Pull("tomcat:8.0"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("tomcat-test") + if _, err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + // Find where port 8080 is mapped to. + port, err := d.FindPort(8080) + if err != nil { + t.Fatalf("docker.FindPort(8080) failed: %v", err) + } + + // Wait until it's up and running. + if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) + } + + // Ensure that content is being served. + url := fmt.Sprintf("http://localhost:%d", port) + resp, err := http.Get(url) + if err != nil { + t.Errorf("Error reaching http server: %v", err) + } + if want := http.StatusOK; resp.StatusCode != want { + t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) + } +} + func MainTest(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/image/python_test.go b/runsc/test/image/python_test.go deleted file mode 100644 index a8d28e080..000000000 --- a/runsc/test/image/python_test.go +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package image - -import ( - "fmt" - "net/http" - "testing" - "time" - - "gvisor.googlesource.com/gvisor/runsc/test/testutil" -) - -func TestPythonHello(t *testing.T) { - if err := testutil.Pull("google/python-hello"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := testutil.MakeDocker("python-hello-test") - if _, err := d.Run("-p", "8080", "google/python-hello"); err != nil { - t.Fatalf("docker run failed: %v", err) - } - defer d.CleanUp() - - // Find where port 8080 is mapped to. - port, err := d.FindPort(8080) - if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) - } - - // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) - } - - // Ensure that content is being served. - url := fmt.Sprintf("http://localhost:%d", port) - resp, err := http.Get(url) - if err != nil { - t.Errorf("Error reaching http server: %v", err) - } - if want := http.StatusOK; resp.StatusCode != want { - t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) - } -} diff --git a/runsc/test/image/tomcat_test.go b/runsc/test/image/tomcat_test.go deleted file mode 100644 index 97cf95834..000000000 --- a/runsc/test/image/tomcat_test.go +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package image - -import ( - "fmt" - "net/http" - "testing" - "time" - - "gvisor.googlesource.com/gvisor/runsc/test/testutil" -) - -func TestTomcat(t *testing.T) { - if err := testutil.Pull("tomcat:8.0"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := testutil.MakeDocker("tomcat-test") - if _, err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { - t.Fatalf("docker run failed: %v", err) - } - defer d.CleanUp() - - // Find where port 8080 is mapped to. - port, err := d.FindPort(8080) - if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) - } - - // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) - } - - // Ensure that content is being served. 
- url := fmt.Sprintf("http://localhost:%d", port) - resp, err := http.Get(url) - if err != nil { - t.Errorf("Error reaching http server: %v", err) - } - if want := http.StatusOK; resp.StatusCode != want { - t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) - } -} -- cgit v1.2.3 From d8f0db9bcf2ecfaf7fb1b09d7d4cace3a8e40cc7 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Tue, 28 Aug 2018 11:52:56 -0700 Subject: runsc: unmount volume mounts when destroy container. PiperOrigin-RevId: 210579178 Change-Id: Iae20639c5186b1a976cbff6d05bda134cd00d0da --- runsc/container/container.go | 12 ++++++++---- runsc/container/fs.go | 29 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 16af66d3e..725b4d347 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -510,10 +510,6 @@ func (c *Container) Destroy() error { executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } - if err := os.RemoveAll(c.Root); err != nil { - log.Warningf("Failed to delete container root directory %q, err: %v", c.Root, err) - } - // If we are the first container in the sandbox, take the sandbox down // as well. if c.Sandbox != nil && c.Sandbox.IsRootContainer(c.ID) { @@ -532,6 +528,14 @@ func (c *Container) Destroy() error { } } + if err := destroyFS(c.Spec); err != nil { + return fmt.Errorf("error destroying container fs: %v", err) + } + + if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) + } + c.Status = Stopped return nil } diff --git a/runsc/container/fs.go b/runsc/container/fs.go index c12f5c331..dd8bdf120 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -134,6 +134,35 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { return nil } +// destroyFS unmounts mounts done by runsc under `spec.Root.Path`. This +// recovers the container rootfs into the original state. +func destroyFS(spec *specs.Spec) error { + for _, m := range spec.Mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + continue + } + + // It's possible that 'm.Destination' follows symlinks inside the + // container. + dst, err := resolveSymlinks(spec.Root.Path, m.Destination) + if err != nil { + return err + } + + flags := syscall.MNT_DETACH + log.Infof("Unmounting dst: %q, flags: %#x", dst, flags) + // Do not return error if dst is not a mountpoint. + // Based on http://man7.org/linux/man-pages/man2/umount.2.html + // For kernel version 2.6+ and MNT_DETACH flag, EINVAL means + // the dst is not a mount point. + if err := syscall.Unmount(dst, flags); err != nil && + !os.IsNotExist(err) && err != syscall.EINVAL { + return err + } + } + return nil +} + // resolveSymlinks walks 'rel' having 'root' as the root directory. If there are // symlinks, they are evaluated relative to 'root' to ensure the end result is // the same as if the process was running inside the container. 
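The new destroyFS above walks the spec's bind mounts and lazily unmounts each resolved destination with MNT_DETACH, treating a missing target or EINVAL (on Linux 2.6+ with MNT_DETACH, the target is simply not a mount point) as success so that container destruction stays idempotent. A condensed, standalone sketch of just that unmount idiom — hypothetical names and path; the real code iterates spec.Mounts and resolves symlinks under the container rootfs first:

    // Minimal sketch of the tolerant lazy-unmount idiom used by destroyFS.
    // Running it for real requires CAP_SYS_ADMIN.
    package main

    import (
        "fmt"
        "os"
        "syscall"
    )

    // lazyUnmount detaches the mount at target, ignoring the cases where the
    // target does not exist or is not a mount point (EINVAL for MNT_DETACH).
    func lazyUnmount(target string) error {
        flags := syscall.MNT_DETACH
        if err := syscall.Unmount(target, flags); err != nil &&
            !os.IsNotExist(err) && err != syscall.EINVAL {
            return fmt.Errorf("unmounting %q: %v", target, err)
        }
        return nil
    }

    func main() {
        // Illustrative path only.
        if err := lazyUnmount("/tmp/example-rootfs/tmp/volume"); err != nil {
            fmt.Fprintln(os.Stderr, err)
            os.Exit(1)
        }
    }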
-- cgit v1.2.3 From ea113a4380543080f7ad92f536e71706e71d9285 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 28 Aug 2018 12:55:11 -0700 Subject: Drop support for Go 1.10 PiperOrigin-RevId: 210589588 Change-Id: Iba898bc3eb8f13e17c668ceea6dc820fc8180a70 --- runsc/boot/filter/BUILD | 2 -- runsc/boot/filter/config.go | 1 + runsc/boot/filter/config_go110.go | 30 ------------------------------ runsc/boot/filter/config_go111.go | 27 --------------------------- 4 files changed, 1 insertion(+), 59 deletions(-) delete mode 100644 runsc/boot/filter/config_go110.go delete mode 100644 runsc/boot/filter/config_go111.go (limited to 'runsc') diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 96be051fe..d20605a91 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -6,8 +6,6 @@ go_library( name = "filter", srcs = [ "config.go", - "config_go110.go", - "config_go111.go", "extra_filters.go", "extra_filters_msan.go", "extra_filters_race.go", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index db2e3f9d8..1bec89900 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -60,6 +60,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MMAP: {}, syscall.SYS_MPROTECT: {}, syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, syscall.SYS_POLL: {}, syscall.SYS_PREAD64: {}, syscall.SYS_PWRITE64: {}, diff --git a/runsc/boot/filter/config_go110.go b/runsc/boot/filter/config_go110.go deleted file mode 100644 index f4feb4ce4..000000000 --- a/runsc/boot/filter/config_go110.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build !go1.11 - -package filter - -import ( - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/seccomp" -) - -// TODO: Remove this file and merge config_go111.go back into -// config.go once we no longer build with Go 1.10. - -func init() { - allowedSyscalls[syscall.SYS_PSELECT6] = []seccomp.Rule{} -} diff --git a/runsc/boot/filter/config_go111.go b/runsc/boot/filter/config_go111.go deleted file mode 100644 index f5eb2c3c8..000000000 --- a/runsc/boot/filter/config_go111.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build go1.11 - -package filter - -import ( - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/seccomp" -) - -func init() { - allowedSyscalls[syscall.SYS_NANOSLEEP] = []seccomp.Rule{} -} -- cgit v1.2.3 From 30c025f3efdf5b599d8fbd4172bb5b856cc269af Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 28 Aug 2018 17:08:49 -0700 Subject: Add argument checks to seccomp This is required to increase protection when running in GKE. PiperOrigin-RevId: 210635123 Change-Id: Iaaa8be49e73f7a3a90805313885e75894416f0b5 --- runsc/boot/filter/BUILD | 1 + runsc/boot/filter/config.go | 447 +++++++++++++++++++++++++++++++++++++------- runsc/boot/filter/filter.go | 17 +- runsc/boot/loader.go | 10 +- 4 files changed, 400 insertions(+), 75 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index d20605a91..48f2c8024 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -22,6 +22,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", "//pkg/sentry/platform/ptrace", + "//pkg/tcpip/link/fdbased", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 1bec89900..7227ea2b7 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -15,72 +15,127 @@ package filter import ( + "os" "syscall" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/seccomp" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased" ) -// allowedSyscalls is the set of syscalls executed by the Sentry -// to the host OS. +// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. var allowedSyscalls = seccomp.SyscallRules{ - syscall.SYS_ACCEPT: {}, - syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_GET_FS)}, + {seccomp.AllowValue(linux.ARCH_SET_FS)}, + }, syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: {}, + syscall.SYS_CLONE: []seccomp.Rule{ + { + seccomp.AllowValue( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + }, + }, syscall.SYS_CLOSE: {}, syscall.SYS_DUP: {}, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, - syscall.SYS_EPOLL_PWAIT: {}, - syscall.SYS_EPOLL_WAIT: {}, - syscall.SYS_EVENTFD2: {}, - syscall.SYS_EXIT: {}, - syscall.SYS_EXIT_GROUP: {}, - syscall.SYS_FALLOCATE: {}, - syscall.SYS_FCNTL: {}, - syscall.SYS_FSTAT: {}, - syscall.SYS_FSYNC: {}, - syscall.SYS_FTRUNCATE: {}, - syscall.SYS_FUTEX: {}, - syscall.SYS_GETDENTS64: {}, - syscall.SYS_GETPID: {}, - unix.SYS_GETRANDOM: {}, - syscall.SYS_GETSOCKOPT: {}, - syscall.SYS_GETTID: {}, - syscall.SYS_GETTIMEOFDAY: {}, - syscall.SYS_LISTEN: {}, - syscall.SYS_LSEEK: {}, - // TODO: Remove SYS_LSTAT when executable lookup moves - // into the gofer. 
- syscall.SYS_LSTAT: {}, - syscall.SYS_MADVISE: {}, - syscall.SYS_MINCORE: {}, - syscall.SYS_MMAP: {}, - syscall.SYS_MPROTECT: {}, - syscall.SYS_MUNMAP: {}, - syscall.SYS_NANOSLEEP: {}, - syscall.SYS_POLL: {}, - syscall.SYS_PREAD64: {}, - syscall.SYS_PWRITE64: {}, - syscall.SYS_READ: {}, - syscall.SYS_READV: {}, - syscall.SYS_RECVMSG: {}, - syscall.SYS_RESTART_SYSCALL: {}, - syscall.SYS_RT_SIGACTION: {}, - syscall.SYS_RT_SIGPROCMASK: {}, - syscall.SYS_RT_SIGRETURN: {}, - syscall.SYS_SCHED_YIELD: {}, - syscall.SYS_SENDMSG: {}, - syscall.SYS_SETITIMER: {}, - syscall.SYS_SHUTDOWN: {}, - syscall.SYS_SIGALTSTACK: {}, - syscall.SYS_SYNC_FILE_RANGE: {}, - syscall.SYS_TGKILL: {}, - syscall.SYS_WRITE: {}, - syscall.SYS_WRITEV: {}, - + syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EPOLL_WAIT: {}, + syscall.SYS_EVENTFD2: []seccomp.Rule{ + { + seccomp.AllowValue(0), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FALLOCATE: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCNTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_SETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFD), + }, + }, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_GETDENTS64: {}, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_DOMAIN), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_TYPE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_ERROR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_REUSEADDR), + }, + }, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, // SYS_IOCTL is needed for terminal support, but we only allow // setting/getting termios and winsize. syscall.SYS_IOCTL: []seccomp.Rule{ @@ -110,6 +165,107 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowAny{}, /* winsize struct */ }, }, + syscall.SYS_LSEEK: {}, + // TODO: Remove SYS_LSTAT when executable lookup moves + // into the gofer. 
+ syscall.SYS_LSTAT: {}, + syscall.SYS_MADVISE: {}, + syscall.SYS_MINCORE: {}, + syscall.SYS_MMAP: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_SHARED), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ), + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + }, + }, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_POLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, + syscall.SYS_READV: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(len(fdbased.BufConfig)), + }, + }, + syscall.SYS_RECVMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + }, + }, + syscall.SYS_RESTART_SYSCALL: {}, + syscall.SYS_RT_SIGACTION: {}, + syscall.SYS_RT_SIGPROCMASK: {}, + syscall.SYS_RT_SIGRETURN: {}, + syscall.SYS_SCHED_YIELD: {}, + syscall.SYS_SENDMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + }, + }, + syscall.SYS_SETITIMER: {}, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + }, + syscall.SYS_SIGALTSTACK: {}, + syscall.SYS_SYNC_FILE_RANGE: {}, + syscall.SYS_TGKILL: []seccomp.Rule{ + { + seccomp.AllowValue(uint64(os.Getpid())), + }, + }, + syscall.SYS_WRITE: {}, + syscall.SYS_WRITEV: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(2), + }, + }, } // whitelistFSFilters returns syscalls made by whitelistFS. Using WhitelistFS @@ -154,42 +310,197 @@ func whitelistFSFilters() seccomp.SyscallRules { // hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. 
func hostInetFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - syscall.SYS_ACCEPT4: {}, + syscall.SYS_ACCEPT4: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + }, + }, syscall.SYS_BIND: {}, syscall.SYS_CONNECT: {}, syscall.SYS_GETPEERNAME: {}, syscall.SYS_GETSOCKNAME: {}, - syscall.SYS_GETSOCKOPT: {}, - syscall.SYS_IOCTL: {}, - syscall.SYS_LISTEN: {}, - syscall.SYS_READV: {}, - syscall.SYS_RECVFROM: {}, - syscall.SYS_RECVMSG: {}, - syscall.SYS_SENDMSG: {}, - syscall.SYS_SENDTO: {}, - syscall.SYS_SETSOCKOPT: {}, - syscall.SYS_SHUTDOWN: {}, - syscall.SYS_SOCKET: {}, - syscall.SYS_WRITEV: {}, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_V6ONLY), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_ERROR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_KEEPALIVE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_REUSEADDR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_TYPE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_LINGER), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_NODELAY), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_INFO), + }, + }, + syscall.SYS_IOCTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.TIOCOUTQ), + }, + }, + syscall.SYS_LISTEN: {}, + syscall.SYS_READV: {}, + syscall.SYS_RECVFROM: {}, + syscall.SYS_RECVMSG: {}, + syscall.SYS_SENDMSG: {}, + syscall.SYS_SENDTO: {}, + syscall.SYS_SETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_V6ONLY), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_REUSEADDR), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_NODELAY), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + }, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_RD), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_WR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_RDWR), + }, + }, + syscall.SYS_SOCKET: []seccomp.Rule{ + { + seccomp.AllowValue(syscall.AF_INET), + seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET), + seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + 
seccomp.AllowValue(syscall.AF_INET6), + seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET6), + seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_WRITEV: {}, } } // ptraceFilters returns syscalls made exclusively by the ptrace platform. func ptraceFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - syscall.SYS_PTRACE: {}, - syscall.SYS_WAIT4: {}, unix.SYS_GETCPU: {}, unix.SYS_SCHED_SETAFFINITY: {}, + syscall.SYS_PTRACE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_WAIT4: {}, } } // kvmFilters returns syscalls made exclusively by the KVM platform. func kvmFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_FUTEX: {}, syscall.SYS_IOCTL: {}, + syscall.SYS_MMAP: {}, syscall.SYS_RT_SIGSUSPEND: {}, syscall.SYS_RT_SIGTIMEDWAIT: {}, 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. } } + +func controlServerFilters(fd int) seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ACCEPT: []seccomp.Rule{ + { + seccomp.AllowValue(fd), + }, + }, + syscall.SYS_LISTEN: []seccomp.Rule{ + { + seccomp.AllowValue(fd), + seccomp.AllowValue(16 /* unet.backlog */), + }, + }, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_PEERCRED), + }, + }, + } +} diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index c57bbd2e5..56d30f2a0 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -27,24 +27,33 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" ) +// Options are seccomp filter related options. +type Options struct { + Platform platform.Platform + WhitelistFS bool + HostNetwork bool + ControllerFD int +} + // Install installs seccomp filters for based on the given platform. -func Install(p platform.Platform, whitelistFS, hostNetwork bool) error { +func Install(opt Options) error { s := allowedSyscalls + s.Merge(controlServerFilters(opt.ControllerFD)) // Set of additional filters used by -race and -msan. Returns empty // when not enabled. s.Merge(instrumentationFilters()) - if whitelistFS { + if opt.WhitelistFS { Report("direct file access allows unrestricted file access!") s.Merge(whitelistFSFilters()) } - if hostNetwork { + if opt.HostNetwork { Report("host networking enabled: syscall filters less restrictive!") s.Merge(hostInetFilters()) } - switch p := p.(type) { + switch p := opt.Platform.(type) { case *ptrace.PTrace: s.Merge(ptraceFilters()) case *kvm.KVM: diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0ad830a6b..74d0c2534 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -351,9 +351,13 @@ func (l *Loader) run() error { if l.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. 
Running in less secure mode.") } else { - whitelistFS := l.conf.FileAccess == FileAccessDirect - hostNet := l.conf.Network == NetworkHost - if err := filter.Install(l.k.Platform, whitelistFS, hostNet); err != nil { + opts := filter.Options{ + Platform: l.k.Platform, + WhitelistFS: l.conf.FileAccess == FileAccessDirect, + HostNetwork: l.conf.Network == NetworkHost, + ControllerFD: l.ctrl.srv.FD(), + } + if err := filter.Install(opts); err != nil { return fmt.Errorf("Failed to install seccomp filters: %v", err) } } -- cgit v1.2.3 From 5ade9350ad18476a2cddbd3a0b36778d1c6ec376 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 30 Aug 2018 15:46:12 -0700 Subject: runsc: Pass log and config files to sandbox process by FD. This is a prereq for running the sandbox process as user "nobody", when it may not have permissions to open these files. Instead, we must open then before starting the sandbox process, and pass them by FD. PiperOrigin-RevId: 210995199 Change-Id: I715875a9553290b4a49394a8fcd93be78b1933dd --- runsc/boot/config.go | 3 ++ runsc/cmd/boot.go | 15 ++++++--- runsc/cmd/create.go | 3 ++ runsc/cmd/run.go | 3 ++ runsc/main.go | 25 ++++++++------ runsc/sandbox/sandbox.go | 74 +++++++++++++++++++++++++++++++---------- runsc/specutils/specutils.go | 26 ++++++++++++--- runsc/test/testutil/testutil.go | 1 + 8 files changed, 113 insertions(+), 37 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index efb8563ea..212f5b003 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -204,6 +204,9 @@ type Config struct { // TODO: Remove this when multiple container is fully supported. MultiContainer bool + // SpecFile is the file containing the OCI spec. + SpecFile string + // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 4e08dafc8..4bd6fa12a 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -32,9 +32,12 @@ import ( // Boot implements subcommands.Command for the "boot" command which starts a // new sandbox. It should not be called directly. type Boot struct { - // bundleDir is the path to the bundle directory. + // bundleDir is the directory containing the OCI spec. bundleDir string + // specFD is the file descriptor that the spec will be read from. + specFD int + // controllerFD is the file descriptor of a stream socket for the // control server that is donated to this process. controllerFD int @@ -68,7 +71,7 @@ func (*Boot) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (b *Boot) SetFlags(f *flag.FlagSet) { - f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") + f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") @@ -78,7 +81,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. It starts a sandbox in a // waiting state. 
func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if b.bundleDir == "" || b.controllerFD == -1 || f.NArg() != 0 { + if b.specFD == -1 || b.controllerFD == -1 || f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } @@ -86,8 +89,10 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Ensure that if there is a panic, all goroutine stacks are printed. debug.SetTraceback("all") - // Get the spec from the bundleDir. - spec, err := specutils.ReadSpec(b.bundleDir) + // Get the spec from the specFD. + specFile := os.NewFile(uintptr(b.specFD), "spec file") + defer specFile.Close() + spec, err := specutils.ReadSpecFromFile(specFile) if err != nil { Fatalf("error reading spec: %v", err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 94a889077..38ae03e7a 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -15,6 +15,8 @@ package cmd import ( + "path/filepath" + "context" "flag" "github.com/google/subcommands" @@ -83,6 +85,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} Fatalf("error reading spec: %v", err) } specutils.LogSpec(spec) + conf.SpecFile = filepath.Join(bundleDir, "config.json") // Create the container. A new sandbox will be created for the // container unless the metadata specifies that it should be run in an diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 681112f30..92aa6bc40 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -15,6 +15,7 @@ package cmd import ( + "path/filepath" "syscall" "context" @@ -71,6 +72,8 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s if err != nil { Fatalf("error reading spec: %v", err) } + specutils.LogSpec(spec) + conf.SpecFile = filepath.Join(bundleDir, "config.json") ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile) if err != nil { diff --git a/runsc/main.go b/runsc/main.go index 773ec6486..0c9b9af78 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -17,13 +17,11 @@ package main import ( - "fmt" "io" "os" "path/filepath" "strings" "syscall" - "time" "context" "flag" @@ -32,6 +30,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/cmd" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) var ( @@ -48,6 +47,8 @@ var ( // Debugging flags. debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command") logPackets = flag.Bool("log-packets", false, "enable network packet logging") + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") // Debugging flags: strace related strace = flag.Bool("strace", false, "enable strace") @@ -64,6 +65,7 @@ var ( panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") ) +// gitRevision is set during linking. 
var gitRevision = "" func main() { @@ -152,7 +154,9 @@ func main() { } var logFile io.Writer = os.Stderr - if *logFilename != "" { + if *logFD > -1 { + logFile = os.NewFile(uintptr(*logFD), "log file") + } else if *logFilename != "" { // We must set O_APPEND and not O_TRUNC because Docker passes // the same log file for all commands (and also parses these // log files), so we can't destroy them on each command. @@ -173,18 +177,17 @@ func main() { cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat) } - if *debugLogDir != "" { + if *debugLogFD > -1 { + f := os.NewFile(uintptr(*debugLogFD), "debug log file") + e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} + } else if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err) } - - // Format: /runsc.log.. - scmd := flag.CommandLine.Arg(0) - filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), scmd) - path := filepath.Join(*debugLogDir, filename) - f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) + subcommand := flag.CommandLine.Arg(0) + f, err := specutils.DebugLogFile(*debugLogDir, subcommand) if err != nil { - cmd.Fatalf("error opening log file %q: %v", filename, err) + cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f14a2f8c9..f58916574 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -233,16 +233,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 - // Create control server socket here and donate FD to child process because - // it may be in a different network namespace and won't be reachable from - // outside. - addr := boot.ControlSocketAddr(s.ID) - fd, err := server.CreateSocket(addr) - log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". - if err != nil { - return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) - } - consoleEnabled := consoleSocket != "" binPath, err := specutils.BinPath() @@ -251,16 +241,61 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } cmd := exec.Command(binPath, conf.ToFlags()...) cmd.SysProcAttr = &syscall.SysProcAttr{} - cmd.Args = append(cmd.Args, - "boot", - "--bundle", bundleDir, - "--controller-fd="+strconv.Itoa(nextFD), - "--console="+strconv.FormatBool(consoleEnabled)) - nextFD++ - controllerFile := os.NewFile(uintptr(fd), "control_server_socket") + // Open the log files to pass to the sandbox as FDs. + // + // These flags must come BEFORE the "boot" command in cmd.Args. 
+ if conf.LogFilename != "" { + logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("error opening log file %q: %v", conf.LogFilename, err) + } + defer logFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, logFile) + cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + if conf.DebugLogDir != "" { + debugLogFile, err := specutils.DebugLogFile(conf.DebugLogDir, "boot") + if err != nil { + return fmt.Errorf("error opening debug log file in %q: %v", conf.DebugLogDir, err) + } + defer debugLogFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile) + cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + // Add the "boot" command to the args. + // + // All flags after this must be for the boot command + cmd.Args = append(cmd.Args, "boot", "--console="+strconv.FormatBool(consoleEnabled)) + + // Create a socket for the control server and donate it to the sandbox. + addr := boot.ControlSocketAddr(s.ID) + sockFD, err := server.CreateSocket(addr) + log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". + if err != nil { + return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + } + controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket") defer controllerFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD)) + nextFD++ + + // Open the spec file to donate to the sandbox. + if conf.SpecFile == "" { + return fmt.Errorf("conf.SpecFile must be set") + } + specFile, err := os.Open(conf.SpecFile) + if err != nil { + return fmt.Errorf("error opening spec file %q: %v", conf.SpecFile, err) + } + defer specFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, specFile) + cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD)) + nextFD++ // If there is a gofer, sends all socket ends to the sandbox. for _, f := range ioFiles { @@ -357,6 +392,11 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) } + // Log the fds we are donating to the sandbox process. + for i, f := range cmd.ExtraFiles { + log.Debugf("Donating FD %d: %q", i+3, f.Name()) + } + log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) if err := specutils.StartInNS(cmd, nss); err != nil { return err diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 5fb53edb2..477409112 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -108,14 +108,24 @@ func ValidateSpec(spec *specs.Spec) error { // ReadSpec reads an OCI runtime spec from the given bundle directory. func ReadSpec(bundleDir string) (*specs.Spec, error) { // The spec file must be in "config.json" inside the bundle directory. - specFile := filepath.Join(bundleDir, "config.json") - specBytes, err := ioutil.ReadFile(specFile) + specPath := filepath.Join(bundleDir, "config.json") + specFile, err := os.Open(specPath) if err != nil { - return nil, fmt.Errorf("error reading spec from file %q: %v", specFile, err) + return nil, fmt.Errorf("error opening spec file %q: %v", specPath, err) + } + defer specFile.Close() + return ReadSpecFromFile(specFile) +} + +// ReadSpecFromFile reads an OCI runtime spec from the given File. 
+func ReadSpecFromFile(specFile *os.File) (*specs.Spec, error) { + specBytes, err := ioutil.ReadAll(specFile) + if err != nil { + return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) } var spec specs.Spec if err := json.Unmarshal(specBytes, &spec); err != nil { - return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes)) + return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) } if err := ValidateSpec(&spec); err != nil { return nil, err @@ -346,3 +356,11 @@ func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) er } return backoff.Retry(op, b) } + +// DebugLogFile opens a file in logDir based on the timestamp and subcommand +// for writing. +func DebugLogFile(logDir, subcommand string) (*os.File, error) { + // Format: /runsc.log.. + filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), subcommand) + return os.OpenFile(filepath.Join(logDir, filename), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) +} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index e90ab5ad5..4a1c37105 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -176,6 +176,7 @@ func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) ( } conf.RootDir = rootDir + conf.SpecFile = filepath.Join(bundleDir, "config.json") return bundleDir, nil } -- cgit v1.2.3 From 3e493adf7adb6c8b920ae224fb68e2c317a16a56 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 30 Aug 2018 17:29:14 -0700 Subject: Add seccomp filter to fsgofer PiperOrigin-RevId: 211011542 Change-Id: Ib5a83a00f8eb6401603c6fb5b59afc93bac52558 --- runsc/cmd/BUILD | 1 + runsc/cmd/gofer.go | 5 + runsc/fsgofer/filter/BUILD | 24 ++++ runsc/fsgofer/filter/config.go | 175 +++++++++++++++++++++++++++++ runsc/fsgofer/filter/extra_filters.go | 28 +++++ runsc/fsgofer/filter/extra_filters_msan.go | 33 ++++++ runsc/fsgofer/filter/extra_filters_race.go | 42 +++++++ runsc/fsgofer/filter/filter.go | 34 ++++++ 8 files changed, 342 insertions(+) create mode 100644 runsc/fsgofer/filter/BUILD create mode 100644 runsc/fsgofer/filter/config.go create mode 100644 runsc/fsgofer/filter/extra_filters.go create mode 100644 runsc/fsgofer/filter/extra_filters_msan.go create mode 100644 runsc/fsgofer/filter/extra_filters_race.go create mode 100644 runsc/fsgofer/filter/filter.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 5dee26a5c..f9c091ba2 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -42,6 +42,7 @@ go_library( "//runsc/console", "//runsc/container", "//runsc/fsgofer", + "//runsc/fsgofer/filter", "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index ab76734fc..f28e02798 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -28,6 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" + "gvisor.googlesource.com/gvisor/runsc/fsgofer/filter" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -151,6 +152,10 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("too many FDs passed for mounts. 
mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } + if err := filter.Install(); err != nil { + Fatalf("Failed to install seccomp filters: %v", err) + } + runServers(ats, g.ioFDs) return subcommands.ExitSuccess } diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD new file mode 100644 index 000000000..40f4f2205 --- /dev/null +++ b/runsc/fsgofer/filter/BUILD @@ -0,0 +1,24 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "filter", + srcs = [ + "config.go", + "extra_filters.go", + "extra_filters_msan.go", + "extra_filters_race.go", + "filter.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/fsgofer/filter", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/seccomp", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go new file mode 100644 index 000000000..97e556ceb --- /dev/null +++ b/runsc/fsgofer/filter/config.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "os" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + +// allowedSyscalls is the set of syscalls executed by the gofer. 
+var allowedSyscalls = seccomp.SyscallRules{ + syscall.SYS_ACCEPT: {}, + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_GET_FS)}, + {seccomp.AllowValue(linux.ARCH_SET_FS)}, + }, + syscall.SYS_CLOCK_GETTIME: {}, + syscall.SYS_CLONE: []seccomp.Rule{ + { + seccomp.AllowValue( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + }, + }, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EVENTFD2: []seccomp.Rule{ + { + seccomp.AllowValue(0), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCHOWNAT: {}, + syscall.SYS_FCNTL: []seccomp.Rule{ + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFL), + }, + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_SETFL), + }, + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFD), + }, + }, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSTATFS: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: { + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_GETDENTS64: {}, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_LINKAT: {}, + syscall.SYS_LSEEK: {}, + syscall.SYS_MKDIRAT: {}, + syscall.SYS_MMAP: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_SHARED), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + }, + }, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_NEWFSTATAT: {}, + syscall.SYS_OPENAT: {}, + syscall.SYS_POLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, + syscall.SYS_READLINKAT: {}, + syscall.SYS_RECVMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + }, + }, + syscall.SYS_RENAMEAT: {}, + syscall.SYS_RESTART_SYSCALL: {}, + syscall.SYS_RT_SIGPROCMASK: {}, + syscall.SYS_SCHED_YIELD: {}, + syscall.SYS_SENDMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + }, + }, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + }, + syscall.SYS_SIGALTSTACK: {}, + syscall.SYS_SYMLINKAT: {}, + syscall.SYS_TGKILL: []seccomp.Rule{ + { + seccomp.AllowValue(uint64(os.Getpid())), + }, + }, + syscall.SYS_UNLINKAT: {}, + syscall.SYS_UTIMENSAT: {}, + 
syscall.SYS_WRITE: {}, +} diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go new file mode 100644 index 000000000..82cf00dfb --- /dev/null +++ b/runsc/fsgofer/filter/extra_filters.go @@ -0,0 +1,28 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !msan,!race + +package filter + +import ( + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by +// Go intrumentation tools, e.g. -race, -msan. +// Returns empty when disabled. +func instrumentationFilters() seccomp.SyscallRules { + return nil +} diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go new file mode 100644 index 000000000..169a79ed8 --- /dev/null +++ b/runsc/fsgofer/filter/extra_filters_msan.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build msan + +package filter + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by MSAN. +func instrumentationFilters() seccomp.SyscallRules { + log.Warningf("*** SECCOMP WARNING: MSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_SCHED_GETAFFINITY: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + } +} diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go new file mode 100644 index 000000000..9e6512d8c --- /dev/null +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -0,0 +1,42 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package filter + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by TSAN. 
+func instrumentationFilters() seccomp.SyscallRules { + log.Warningf("*** SECCOMP WARNING: TSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_BRK: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_MADVISE: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_MUNLOCK: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_OPEN: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + // Used within glibc's malloc. + syscall.SYS_TIME: {}, + } +} diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go new file mode 100644 index 000000000..6f341f688 --- /dev/null +++ b/runsc/fsgofer/filter/filter.go @@ -0,0 +1,34 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filter defines all syscalls the gofer is allowed to make, and +// installs seccomp filters to prevent prohibited syscalls in case it's +// compromised. +package filter + +import ( + "gvisor.googlesource.com/gvisor/pkg/seccomp" +) + +// Install installs seccomp filters. +func Install() error { + s := allowedSyscalls + + // Set of additional filters used by -race and -msan. Returns empty + // when not enabled. + s.Merge(instrumentationFilters()) + + // TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported. + return seccomp.Install(s, false) +} -- cgit v1.2.3 From 3f04bd68b24c939b2342047dd022837c3b1b6085 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Fri, 31 Aug 2018 08:06:58 -0700 Subject: Add missing import GoCompile: missing strict dependencies: /tmpfs/tmp/bazel/sandbox/linux-sandbox/1744/execroot/__main__/runsc/main.go: import of "gvisor.googlesource.com/gvisor/runsc/specutils" This was broken in 210995199. PiperOrigin-RevId: 211086595 Change-Id: I166b9a2ed8e4d6e624def944b720190940d7537c --- runsc/BUILD | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index 660cb2a06..cc6bfef47 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -45,6 +45,7 @@ go_binary( "//pkg/log", "//runsc/boot", "//runsc/cmd", + "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", ], ) -- cgit v1.2.3 From e669697241e9774f1a1e4ab609dde933a0563ba6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 31 Aug 2018 09:44:31 -0700 Subject: Fix RunAsRoot arguments forwarding It was including the path to the executable twice in the arguments. PiperOrigin-RevId: 211098311 Change-Id: I5357c51c63f38dfab551b17bb0e04011a0575010 --- runsc/test/testutil/testutil.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4a1c37105..4429b981b 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -254,7 +254,7 @@ func RunAsRoot(m *testing.M) { runtime.LockOSThread() defer runtime.UnlockOSThread() - cmd := exec.Command("/proc/self/exe", os.Args...) + cmd := exec.Command("/proc/self/exe", os.Args[1:]...) 
cmd.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, // Set current user/group as root inside the namespace. -- cgit v1.2.3 From 08bfb5643c4d4755d0e982e69d2da99449e25c57 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 31 Aug 2018 10:21:02 -0700 Subject: Add other missing dep runsc and runsc-race need the same deps. PiperOrigin-RevId: 211103766 Change-Id: Ib0c97078a469656c1e5b019648589a1d07915625 --- runsc/BUILD | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index cc6bfef47..e390b7bae 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -16,6 +16,7 @@ go_binary( "//pkg/log", "//runsc/boot", "//runsc/cmd", + "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", ], ) -- cgit v1.2.3 From be9f454eb6e456fb1acf084612f363aa959ef9d9 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Fri, 31 Aug 2018 11:01:57 -0700 Subject: runsc: Set volume mount rslave. PiperOrigin-RevId: 211111376 Change-Id: I27b8cb4e070d476fa4781ed6ecfa0cf1dcaf85f5 --- runsc/container/fs.go | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'runsc') diff --git a/runsc/container/fs.go b/runsc/container/fs.go index dd8bdf120..b93c866ea 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -117,6 +117,12 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { if err := syscall.Mount(src, dst, m.Type, uintptr(flags), ""); err != nil { return fmt.Errorf("failed to mount src: %q, dst: %q, flags: %#x, err: %v", src, dst, flags, err) } + // Make the mount a slave, so that for recursive bind mount, umount won't + // propagate to the source. + flags = syscall.MS_SLAVE | syscall.MS_REC + if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("failed to rslave mount dst: %q, flags: %#x, err: %v", dst, flags, err) + } } // Remount root as readonly after setup is done, if requested. -- cgit v1.2.3 From 7e18f158b2ea87b7f06f8e0b91e10558b4f52722 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 31 Aug 2018 11:29:36 -0700 Subject: Automated rollback of changelist 210995199 PiperOrigin-RevId: 211116429 Change-Id: I446d149c822177dc9fc3c64ce5e455f7f029aa82 --- runsc/boot/config.go | 3 -- runsc/cmd/boot.go | 15 +++------ runsc/cmd/create.go | 3 -- runsc/cmd/run.go | 3 -- runsc/main.go | 25 ++++++-------- runsc/sandbox/sandbox.go | 74 ++++++++++------------------------------- runsc/specutils/specutils.go | 26 +++------------ runsc/test/testutil/testutil.go | 1 - 8 files changed, 37 insertions(+), 113 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 212f5b003..efb8563ea 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -204,9 +204,6 @@ type Config struct { // TODO: Remove this when multiple container is fully supported. MultiContainer bool - // SpecFile is the file containing the OCI spec. - SpecFile string - // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 4bd6fa12a..4e08dafc8 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -32,12 +32,9 @@ import ( // Boot implements subcommands.Command for the "boot" command which starts a // new sandbox. It should not be called directly. type Boot struct { - // bundleDir is the directory containing the OCI spec. + // bundleDir is the path to the bundle directory. 
bundleDir string - // specFD is the file descriptor that the spec will be read from. - specFD int - // controllerFD is the file descriptor of a stream socket for the // control server that is donated to this process. controllerFD int @@ -71,7 +68,7 @@ func (*Boot) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (b *Boot) SetFlags(f *flag.FlagSet) { - f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") + f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") @@ -81,7 +78,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. It starts a sandbox in a // waiting state. func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if b.specFD == -1 || b.controllerFD == -1 || f.NArg() != 0 { + if b.bundleDir == "" || b.controllerFD == -1 || f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } @@ -89,10 +86,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Ensure that if there is a panic, all goroutine stacks are printed. debug.SetTraceback("all") - // Get the spec from the specFD. - specFile := os.NewFile(uintptr(b.specFD), "spec file") - defer specFile.Close() - spec, err := specutils.ReadSpecFromFile(specFile) + // Get the spec from the bundleDir. + spec, err := specutils.ReadSpec(b.bundleDir) if err != nil { Fatalf("error reading spec: %v", err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 38ae03e7a..94a889077 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -15,8 +15,6 @@ package cmd import ( - "path/filepath" - "context" "flag" "github.com/google/subcommands" @@ -85,7 +83,6 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} Fatalf("error reading spec: %v", err) } specutils.LogSpec(spec) - conf.SpecFile = filepath.Join(bundleDir, "config.json") // Create the container. A new sandbox will be created for the // container unless the metadata specifies that it should be run in an diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 92aa6bc40..681112f30 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -15,7 +15,6 @@ package cmd import ( - "path/filepath" "syscall" "context" @@ -72,8 +71,6 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s if err != nil { Fatalf("error reading spec: %v", err) } - specutils.LogSpec(spec) - conf.SpecFile = filepath.Join(bundleDir, "config.json") ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile) if err != nil { diff --git a/runsc/main.go b/runsc/main.go index 0c9b9af78..773ec6486 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -17,11 +17,13 @@ package main import ( + "fmt" "io" "os" "path/filepath" "strings" "syscall" + "time" "context" "flag" @@ -30,7 +32,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/cmd" - "gvisor.googlesource.com/gvisor/runsc/specutils" ) var ( @@ -47,8 +48,6 @@ var ( // Debugging flags. 
debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command") logPackets = flag.Bool("log-packets", false, "enable network packet logging") - logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") - debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") // Debugging flags: strace related strace = flag.Bool("strace", false, "enable strace") @@ -65,7 +64,6 @@ var ( panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") ) -// gitRevision is set during linking. var gitRevision = "" func main() { @@ -154,9 +152,7 @@ func main() { } var logFile io.Writer = os.Stderr - if *logFD > -1 { - logFile = os.NewFile(uintptr(*logFD), "log file") - } else if *logFilename != "" { + if *logFilename != "" { // We must set O_APPEND and not O_TRUNC because Docker passes // the same log file for all commands (and also parses these // log files), so we can't destroy them on each command. @@ -177,17 +173,18 @@ func main() { cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat) } - if *debugLogFD > -1 { - f := os.NewFile(uintptr(*debugLogFD), "debug log file") - e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} - } else if *debugLogDir != "" { + if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err) } - subcommand := flag.CommandLine.Arg(0) - f, err := specutils.DebugLogFile(*debugLogDir, subcommand) + + // Format: /runsc.log.. + scmd := flag.CommandLine.Arg(0) + filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), scmd) + path := filepath.Join(*debugLogDir, filename) + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) if err != nil { - cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) + cmd.Fatalf("error opening log file %q: %v", filename, err) } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f58916574..f14a2f8c9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -233,6 +233,16 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 + // Create control server socket here and donate FD to child process because + // it may be in a different network namespace and won't be reachable from + // outside. + addr := boot.ControlSocketAddr(s.ID) + fd, err := server.CreateSocket(addr) + log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". + if err != nil { + return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + } + consoleEnabled := consoleSocket != "" binPath, err := specutils.BinPath() @@ -241,61 +251,16 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } cmd := exec.Command(binPath, conf.ToFlags()...) cmd.SysProcAttr = &syscall.SysProcAttr{} + cmd.Args = append(cmd.Args, + "boot", + "--bundle", bundleDir, + "--controller-fd="+strconv.Itoa(nextFD), + "--console="+strconv.FormatBool(consoleEnabled)) + nextFD++ - // Open the log files to pass to the sandbox as FDs. - // - // These flags must come BEFORE the "boot" command in cmd.Args. 
- if conf.LogFilename != "" { - logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - return fmt.Errorf("error opening log file %q: %v", conf.LogFilename, err) - } - defer logFile.Close() - cmd.ExtraFiles = append(cmd.ExtraFiles, logFile) - cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD)) - nextFD++ - } - if conf.DebugLogDir != "" { - debugLogFile, err := specutils.DebugLogFile(conf.DebugLogDir, "boot") - if err != nil { - return fmt.Errorf("error opening debug log file in %q: %v", conf.DebugLogDir, err) - } - defer debugLogFile.Close() - cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile) - cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD)) - nextFD++ - } - - // Add the "boot" command to the args. - // - // All flags after this must be for the boot command - cmd.Args = append(cmd.Args, "boot", "--console="+strconv.FormatBool(consoleEnabled)) - - // Create a socket for the control server and donate it to the sandbox. - addr := boot.ControlSocketAddr(s.ID) - sockFD, err := server.CreateSocket(addr) - log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". - if err != nil { - return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) - } - controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket") + controllerFile := os.NewFile(uintptr(fd), "control_server_socket") defer controllerFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) - cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD)) - nextFD++ - - // Open the spec file to donate to the sandbox. - if conf.SpecFile == "" { - return fmt.Errorf("conf.SpecFile must be set") - } - specFile, err := os.Open(conf.SpecFile) - if err != nil { - return fmt.Errorf("error opening spec file %q: %v", conf.SpecFile, err) - } - defer specFile.Close() - cmd.ExtraFiles = append(cmd.ExtraFiles, specFile) - cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD)) - nextFD++ // If there is a gofer, sends all socket ends to the sandbox. for _, f := range ioFiles { @@ -392,11 +357,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) } - // Log the fds we are donating to the sandbox process. - for i, f := range cmd.ExtraFiles { - log.Debugf("Donating FD %d: %q", i+3, f.Name()) - } - log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) if err := specutils.StartInNS(cmd, nss); err != nil { return err diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 477409112..5fb53edb2 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -108,24 +108,14 @@ func ValidateSpec(spec *specs.Spec) error { // ReadSpec reads an OCI runtime spec from the given bundle directory. func ReadSpec(bundleDir string) (*specs.Spec, error) { // The spec file must be in "config.json" inside the bundle directory. - specPath := filepath.Join(bundleDir, "config.json") - specFile, err := os.Open(specPath) + specFile := filepath.Join(bundleDir, "config.json") + specBytes, err := ioutil.ReadFile(specFile) if err != nil { - return nil, fmt.Errorf("error opening spec file %q: %v", specPath, err) - } - defer specFile.Close() - return ReadSpecFromFile(specFile) -} - -// ReadSpecFromFile reads an OCI runtime spec from the given File. 
-func ReadSpecFromFile(specFile *os.File) (*specs.Spec, error) { - specBytes, err := ioutil.ReadAll(specFile) - if err != nil { - return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) + return nil, fmt.Errorf("error reading spec from file %q: %v", specFile, err) } var spec specs.Spec if err := json.Unmarshal(specBytes, &spec); err != nil { - return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) + return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes)) } if err := ValidateSpec(&spec); err != nil { return nil, err @@ -356,11 +346,3 @@ func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) er } return backoff.Retry(op, b) } - -// DebugLogFile opens a file in logDir based on the timestamp and subcommand -// for writing. -func DebugLogFile(logDir, subcommand string) (*os.File, error) { - // Format: /runsc.log.. - filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), subcommand) - return os.OpenFile(filepath.Join(logDir, filename), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) -} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4429b981b..25987d040 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -176,7 +176,6 @@ func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) ( } conf.RootDir = rootDir - conf.SpecFile = filepath.Join(bundleDir, "config.json") return bundleDir, nil } -- cgit v1.2.3 From 7713e2cb75a5d21c1a9c62ae2f332e76ea536867 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 31 Aug 2018 14:46:29 -0700 Subject: Remove not used deps PiperOrigin-RevId: 211147521 Change-Id: I9b8b67df50a3ba084c07a48c72a874d7e2007f23 --- runsc/BUILD | 2 -- 1 file changed, 2 deletions(-) (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index e390b7bae..660cb2a06 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -16,7 +16,6 @@ go_binary( "//pkg/log", "//runsc/boot", "//runsc/cmd", - "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", ], ) @@ -46,7 +45,6 @@ go_binary( "//pkg/log", "//runsc/boot", "//runsc/cmd", - "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", ], ) -- cgit v1.2.3 From 66c03b3dd79c45014da19f36973a85290e9a4458 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 31 Aug 2018 16:11:07 -0700 Subject: Mounting over '/tmp' may fail PiperOrigin-RevId: 211160120 Change-Id: Ie5f280bdac17afd01cb16562ffff6222b3184c34 --- runsc/container/container_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 4ce3afc91..e7e53c492 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -744,8 +744,8 @@ func TestUnixDomainSockets(t *testing.T) { spec.Mounts = []specs.Mount{ specs.Mount{ Type: "bind", - Destination: "/tmp", - Source: "/tmp", + Destination: dir, + Source: dir, }, } -- cgit v1.2.3 From ab7174611c948c44ce509292063b95813183828d Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 4 Sep 2018 13:08:59 -0700 Subject: Remove epoll_wait from filters Go 1.11 replaced it with epoll_pwait. 
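(Aside: the Go 1.11 runtime's network poller blocks in epoll_pwait(2) instead of epoll_wait(2), so a sentry built with Go 1.11 never reaches the old rule and it can simply be dropped from the allow-list. For context, a minimal sketch of the kind of entries that presumably remain for the poller, written in the same seccomp.SyscallRules style as the config below; the surviving rules are not part of this hunk and are only an assumption here:

    syscall.SYS_EPOLL_CREATE1: {},
    syscall.SYS_EPOLL_CTL:     {},
    syscall.SYS_EPOLL_PWAIT:   {},
)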
PiperOrigin-RevId: 211510006 Change-Id: I48a6cae95ed3d57a4633895358ad05ad8bf2f633 --- runsc/boot/filter/config.go | 1 - 1 file changed, 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 7227ea2b7..113023bdd 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -55,7 +55,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(0), }, }, - syscall.SYS_EPOLL_WAIT: {}, syscall.SYS_EVENTFD2: []seccomp.Rule{ { seccomp.AllowValue(0), -- cgit v1.2.3 From 9ae4e28f75979905a6396962a232e217323499f9 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Tue, 4 Sep 2018 13:36:26 -0700 Subject: runsc: fix container rootfs path. PiperOrigin-RevId: 211515350 Change-Id: Ia495af57447c799909aa97bb873a50b87bee2625 --- runsc/cmd/boot.go | 8 -------- runsc/cmd/gofer.go | 6 +----- runsc/cmd/path.go | 10 ---------- runsc/container/fs.go | 6 ------ runsc/specutils/specutils.go | 26 ++++++++++++++++++++++++++ runsc/specutils/specutils_test.go | 22 ++++++++++++++++++++++ 6 files changed, 49 insertions(+), 29 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 4e08dafc8..666be902a 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -93,14 +93,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } specutils.LogSpec(spec) - // Turn any relative paths in the spec to absolute by prepending the bundleDir. - spec.Root.Path = absPath(b.bundleDir, spec.Root.Path) - for _, m := range spec.Mounts { - if m.Source != "" { - m.Source = absPath(b.bundleDir, m.Source) - } - } - conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index f28e02798..95926f5f9 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,7 +16,6 @@ package cmd import ( "os" - "path" "sync" "syscall" @@ -108,7 +107,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) syscall.Umask(0) // Find what path is going to be served by this gofer. - root := absPath(g.bundleDir, spec.Root.Path) + root := spec.Root.Path if err := syscall.Chroot(root); err != nil { Fatalf("failed to chroot to %q: %v", root, err) } @@ -131,9 +130,6 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) mountIdx := 1 // first one is the root for _, m := range spec.Mounts { if specutils.Is9PMount(m) { - if !path.IsAbs(m.Destination) { - Fatalf("destination must be absolute path: %v", m.Destination) - } cfg := fsgofer.Config{ ROMount: isReadonlyMount(m.Options), PanicOnWrite: g.panicOnWrite, diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go index 4bb1dbb4f..c207b80da 100644 --- a/runsc/cmd/path.go +++ b/runsc/cmd/path.go @@ -16,18 +16,8 @@ package cmd import ( "os" - "path/filepath" ) -// absPath turns the given path into an absolute path (if it is not already -// absolute) by prepending the base path. -func absPath(base, rel string) string { - if filepath.IsAbs(rel) { - return rel - } - return filepath.Join(base, rel) -} - // getwdOrDie returns the current working directory and dies if it cannot. 
func getwdOrDie() string { wd, err := os.Getwd() diff --git a/runsc/container/fs.go b/runsc/container/fs.go index b93c866ea..fb352fc7c 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -78,9 +78,6 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { continue } src := m.Source - if !filepath.IsAbs(src) { - src = filepath.Join(bundleDir, src) - } srcfi, err := os.Stat(src) if err != nil { return fmt.Errorf("failed to stat() mount source: %v", err) @@ -130,9 +127,6 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { log.Infof("Remounting root as readonly: %q", spec.Root.Path) flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) src := spec.Root.Path - if !filepath.IsAbs(src) { - src = filepath.Join(bundleDir, src) - } if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { return fmt.Errorf("failed to remount root as readonly with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 5fb53edb2..6c1ac56c3 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -21,6 +21,7 @@ import ( "fmt" "io/ioutil" "os" + "path" "path/filepath" "strings" "syscall" @@ -83,6 +84,12 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("Seccomp spec is being ignored") } + for i, m := range spec.Mounts { + if !path.IsAbs(m.Destination) { + return fmt.Errorf("Spec.Mounts[%d] Mount.Destination must be an absolute path: %v", i, m) + } + } + // Two annotations are use by containerd to support multi-container pods. // "io.kubernetes.cri.container-type" // "io.kubernetes.cri.sandbox-id" @@ -105,7 +112,18 @@ func ValidateSpec(spec *specs.Spec) error { return nil } +// absPath turns the given path into an absolute path (if it is not already +// absolute) by prepending the base path. +func absPath(base, rel string) string { + if filepath.IsAbs(rel) { + return rel + } + return filepath.Join(base, rel) +} + // ReadSpec reads an OCI runtime spec from the given bundle directory. +// ReadSpec also normalizes all potential relative paths into absolute +// path, e.g. spec.Root.Path, mount.Source. func ReadSpec(bundleDir string) (*specs.Spec, error) { // The spec file must be in "config.json" inside the bundle directory. specFile := filepath.Join(bundleDir, "config.json") @@ -120,6 +138,14 @@ func ReadSpec(bundleDir string) (*specs.Spec, error) { if err := ValidateSpec(&spec); err != nil { return nil, err } + // Turn any relative paths in the spec to absolute by prepending the bundleDir. 
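// For example, using the absPath helper added above: with a bundle at
// /path/to/bundle (placeholder path), a relative root of "rootfs" becomes
// /path/to/bundle/rootfs, while an already-absolute root such as "/rootfs"
// is returned unchanged; mount sources are normalized the same way below.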
+ spec.Root.Path = absPath(bundleDir, spec.Root.Path) + for i := range spec.Mounts { + m := &spec.Mounts[i] + if m.Source != "" { + m.Source = absPath(bundleDir, m.Source) + } + } return &spec, nil } diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 2c4e3e729..64e2172c8 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -113,6 +113,12 @@ func TestSpecInvalid(t *testing.T) { Process: &specs.Process{ Args: []string{"/bin/true"}, }, + Mounts: []specs.Mount{ + { + Source: "src", + Destination: "/dst", + }, + }, }, error: "", }, @@ -197,6 +203,22 @@ func TestSpecInvalid(t *testing.T) { }, error: "is not supported", }, + { + name: "relative mount destination", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Mounts: []specs.Mount{ + { + Source: "src", + Destination: "dst", + }, + }, + }, + error: "must be an absolute path", + }, } { err := ValidateSpec(&test.spec) if len(test.error) == 0 { -- cgit v1.2.3 From ad8648c6343cf2cf3e51a0f58cb053ee303f6ffb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 4 Sep 2018 20:08:41 -0700 Subject: runsc: Pass log and config files to sandbox process by FD. This is a prereq for running the sandbox process as user "nobody", when it may not have permissions to open these files. Instead, we must open then before starting the sandbox process, and pass them by FD. The specutils.ReadSpecFromFile method was fixed to always seek to the beginning of the file before reading. This allows Files from the same FD to be read multiple times, as we do in the boot command when the apply-caps flag is set. Tested with --network=host. PiperOrigin-RevId: 211570647 Change-Id: I685be0a290aa7f70731ebdce82ebc0ebcc9d475c --- runsc/BUILD | 2 ++ runsc/boot/config.go | 3 ++ runsc/cmd/boot.go | 19 +++++++--- runsc/cmd/create.go | 3 ++ runsc/cmd/run.go | 3 ++ runsc/main.go | 25 +++++++------ runsc/sandbox/sandbox.go | 79 +++++++++++++++++++++++++++++++---------- runsc/specutils/specutils.go | 30 +++++++++++++--- runsc/test/testutil/testutil.go | 1 + 9 files changed, 127 insertions(+), 38 deletions(-) (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index 660cb2a06..e390b7bae 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -16,6 +16,7 @@ go_binary( "//pkg/log", "//runsc/boot", "//runsc/cmd", + "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", ], ) @@ -45,6 +46,7 @@ go_binary( "//pkg/log", "//runsc/boot", "//runsc/cmd", + "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", ], ) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index efb8563ea..212f5b003 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -204,6 +204,9 @@ type Config struct { // TODO: Remove this when multiple container is fully supported. MultiContainer bool + // SpecFile is the file containing the OCI spec. + SpecFile string + // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 666be902a..784baf23b 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -32,9 +32,12 @@ import ( // Boot implements subcommands.Command for the "boot" command which starts a // new sandbox. It should not be called directly. type Boot struct { - // bundleDir is the path to the bundle directory. + // bundleDir is the directory containing the OCI spec. 
bundleDir string + // specFD is the file descriptor that the spec will be read from. + specFD int + // controllerFD is the file descriptor of a stream socket for the // control server that is donated to this process. controllerFD int @@ -69,6 +72,7 @@ func (*Boot) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (b *Boot) SetFlags(f *flag.FlagSet) { f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") + f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") @@ -78,7 +82,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. It starts a sandbox in a // waiting state. func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if b.bundleDir == "" || b.controllerFD == -1 || f.NArg() != 0 { + if b.specFD == -1 || b.controllerFD == -1 || f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } @@ -86,8 +90,10 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Ensure that if there is a panic, all goroutine stacks are printed. debug.SetTraceback("all") - // Get the spec from the bundleDir. - spec, err := specutils.ReadSpec(b.bundleDir) + // Get the spec from the specFD. + specFile := os.NewFile(uintptr(b.specFD), "spec file") + defer specFile.Close() + spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile) if err != nil { Fatalf("error reading spec: %v", err) } @@ -123,6 +129,11 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) args = append(args, arg) } } + + // Note that we've already read the spec from the spec FD, and + // we will read it again after the exec call. This works + // because the ReadSpecFromFile function seeks to the beginning + // of the file before reading. if err := setCapsAndCallSelf(args, caps); err != nil { Fatalf("%v", err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 94a889077..38ae03e7a 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -15,6 +15,8 @@ package cmd import ( + "path/filepath" + "context" "flag" "github.com/google/subcommands" @@ -83,6 +85,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} Fatalf("error reading spec: %v", err) } specutils.LogSpec(spec) + conf.SpecFile = filepath.Join(bundleDir, "config.json") // Create the container. 
A new sandbox will be created for the // container unless the metadata specifies that it should be run in an diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 681112f30..92aa6bc40 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -15,6 +15,7 @@ package cmd import ( + "path/filepath" "syscall" "context" @@ -71,6 +72,8 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s if err != nil { Fatalf("error reading spec: %v", err) } + specutils.LogSpec(spec) + conf.SpecFile = filepath.Join(bundleDir, "config.json") ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile) if err != nil { diff --git a/runsc/main.go b/runsc/main.go index 773ec6486..0c9b9af78 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -17,13 +17,11 @@ package main import ( - "fmt" "io" "os" "path/filepath" "strings" "syscall" - "time" "context" "flag" @@ -32,6 +30,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/cmd" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) var ( @@ -48,6 +47,8 @@ var ( // Debugging flags. debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command") logPackets = flag.Bool("log-packets", false, "enable network packet logging") + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") // Debugging flags: strace related strace = flag.Bool("strace", false, "enable strace") @@ -64,6 +65,7 @@ var ( panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") ) +// gitRevision is set during linking. var gitRevision = "" func main() { @@ -152,7 +154,9 @@ func main() { } var logFile io.Writer = os.Stderr - if *logFilename != "" { + if *logFD > -1 { + logFile = os.NewFile(uintptr(*logFD), "log file") + } else if *logFilename != "" { // We must set O_APPEND and not O_TRUNC because Docker passes // the same log file for all commands (and also parses these // log files), so we can't destroy them on each command. @@ -173,18 +177,17 @@ func main() { cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat) } - if *debugLogDir != "" { + if *debugLogFD > -1 { + f := os.NewFile(uintptr(*debugLogFD), "debug log file") + e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} + } else if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err) } - - // Format: /runsc.log.. 
- scmd := flag.CommandLine.Arg(0) - filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), scmd) - path := filepath.Join(*debugLogDir, filename) - f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) + subcommand := flag.CommandLine.Arg(0) + f, err := specutils.DebugLogFile(*debugLogDir, subcommand) if err != nil { - cmd.Fatalf("error opening log file %q: %v", filename, err) + cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f14a2f8c9..e0fadefcd 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -233,34 +233,70 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 - // Create control server socket here and donate FD to child process because - // it may be in a different network namespace and won't be reachable from - // outside. - addr := boot.ControlSocketAddr(s.ID) - fd, err := server.CreateSocket(addr) - log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". - if err != nil { - return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) - } - - consoleEnabled := consoleSocket != "" - binPath, err := specutils.BinPath() if err != nil { return err } cmd := exec.Command(binPath, conf.ToFlags()...) cmd.SysProcAttr = &syscall.SysProcAttr{} - cmd.Args = append(cmd.Args, - "boot", - "--bundle", bundleDir, - "--controller-fd="+strconv.Itoa(nextFD), - "--console="+strconv.FormatBool(consoleEnabled)) - nextFD++ - controllerFile := os.NewFile(uintptr(fd), "control_server_socket") + // Open the log files to pass to the sandbox as FDs. + // + // These flags must come BEFORE the "boot" command in cmd.Args. + if conf.LogFilename != "" { + logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("error opening log file %q: %v", conf.LogFilename, err) + } + defer logFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, logFile) + cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + if conf.DebugLogDir != "" { + debugLogFile, err := specutils.DebugLogFile(conf.DebugLogDir, "boot") + if err != nil { + return fmt.Errorf("error opening debug log file in %q: %v", conf.DebugLogDir, err) + } + defer debugLogFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile) + cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + // Add the "boot" command to the args. + // + // All flags after this must be for the boot command + cmd.Args = append(cmd.Args, "boot", "--bundle="+bundleDir) + + consoleEnabled := consoleSocket != "" + cmd.Args = append(cmd.Args, "--console="+strconv.FormatBool(consoleEnabled)) + + // Create a socket for the control server and donate it to the sandbox. + addr := boot.ControlSocketAddr(s.ID) + sockFD, err := server.CreateSocket(addr) + log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". 
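// Each file appended to cmd.ExtraFiles is inherited by the child as file
// descriptor 3+i (os/exec never counts stdin/stdout/stderr), which is why
// nextFD starts at 3 and is incremented once per donated file in this
// function.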
+ if err != nil { + return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + } + controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket") defer controllerFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD)) + nextFD++ + + // Open the spec file to donate to the sandbox. + if conf.SpecFile == "" { + return fmt.Errorf("conf.SpecFile must be set") + } + specFile, err := os.Open(conf.SpecFile) + if err != nil { + return fmt.Errorf("error opening spec file %q: %v", conf.SpecFile, err) + } + defer specFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, specFile) + cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD)) + nextFD++ // If there is a gofer, sends all socket ends to the sandbox. for _, f := range ioFiles { @@ -357,6 +393,11 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) } + // Log the fds we are donating to the sandbox process. + for i, f := range cmd.ExtraFiles { + log.Debugf("Donating FD %d: %q", i+3, f.Name()) + } + log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) if err := specutils.StartInNS(cmd, nss); err != nil { return err diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 6c1ac56c3..3234cc088 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -126,14 +126,28 @@ func absPath(base, rel string) string { // path, e.g. spec.Root.Path, mount.Source. func ReadSpec(bundleDir string) (*specs.Spec, error) { // The spec file must be in "config.json" inside the bundle directory. - specFile := filepath.Join(bundleDir, "config.json") - specBytes, err := ioutil.ReadFile(specFile) + specPath := filepath.Join(bundleDir, "config.json") + specFile, err := os.Open(specPath) if err != nil { - return nil, fmt.Errorf("error reading spec from file %q: %v", specFile, err) + return nil, fmt.Errorf("error opening spec file %q: %v", specPath, err) + } + defer specFile.Close() + return ReadSpecFromFile(bundleDir, specFile) +} + +// ReadSpecFromFile reads an OCI runtime spec from the given File, and +// normalizes all relative paths into absolute by prepending the bundle dir. +func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) { + if _, err := specFile.Seek(0, os.SEEK_SET); err != nil { + return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) + } + specBytes, err := ioutil.ReadAll(specFile) + if err != nil { + return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) } var spec specs.Spec if err := json.Unmarshal(specBytes, &spec); err != nil { - return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes)) + return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) } if err := ValidateSpec(&spec); err != nil { return nil, err @@ -372,3 +386,11 @@ func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) er } return backoff.Retry(op, b) } + +// DebugLogFile opens a file in logDir based on the timestamp and subcommand +// for writing. +func DebugLogFile(logDir, subcommand string) (*os.File, error) { + // Format: /runsc.log.. 
+ filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), subcommand) + return os.OpenFile(filepath.Join(logDir, filename), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) +} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 25987d040..4429b981b 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -176,6 +176,7 @@ func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) ( } conf.RootDir = rootDir + conf.SpecFile = filepath.Join(bundleDir, "config.json") return bundleDir, nil } -- cgit v1.2.3 From 0a9a40abcda602dc3403e2108e1348bf4e04051a Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 4 Sep 2018 20:31:52 -0700 Subject: runsc: Run sandbox as user nobody. When starting a sandbox without direct file or network access, we create an empty user namespace and run the sandbox in there. However, the root user in that namespace is still mapped to the root user in the parent namespace. This CL maps the "nobody" user from the parent namespace into the child namespace, and runs the sandbox process as user "nobody" inside the new namespace. PiperOrigin-RevId: 211572223 Change-Id: I1b1f9b1a86c0b4e7e5ca7bc93be7d4887678bab6 --- README.md | 7 ++++--- runsc/boot/config.go | 5 +++++ runsc/sandbox/sandbox.go | 33 +++++++++++++++++++++++++++++++-- runsc/specutils/BUILD | 1 + runsc/specutils/namespace.go | 14 ++++++++++++++ runsc/test/testutil/testutil.go | 15 ++++++++------- 6 files changed, 63 insertions(+), 12 deletions(-) (limited to 'runsc') diff --git a/README.md b/README.md index 4ec83ab0c..d85948ce5 100644 --- a/README.md +++ b/README.md @@ -179,14 +179,15 @@ here: `https://storage.googleapis.com/gvisor/releases/nightly/${yyyy-mm-dd}/runsc.sha512` **It is important to copy this binary to some place that is accessible to all -users**, since `runsc` executes itself as user `nobody` to avoid unnecessary -privileges. The `/usr/local/bin` directory is a good choice. +users, and make is executable to all users**, since `runsc` executes itself as +user `nobody` to avoid unnecessary privileges. The `/usr/local/bin` directory is +a good place to put the `runsc` binary. ``` wget https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc wget https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc.sha512 sha512sum -c runsc.sha512 -chmod +x runsc +chmod a+x runsc sudo mv runsc /usr/local/bin ``` diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 212f5b003..87a47dd0b 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -213,6 +213,11 @@ type Config struct { // PanicSignal register signal handling that panics. Usually set to // SIGUSR2(12) to troubleshoot hangs. -1 disables it. PanicSignal int + + // TestOnlyAllowRunAsCurrentUser should only be used in tests. It + // allows runsc to start the sandbox process as the current user if we + // do not have capability to set uid/gid to another user. + TestOnlyAllowRunAsCurrentUser bool } // ToFlags returns a slice of flags that correspond to the given Config. 
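(Aside: the sandbox.go hunk below maps only "nobody" into a fresh user namespace and then runs the sandbox with those credentials. As a standalone illustration, here is a minimal sketch using only the standard library; it assumes a Linux host, that the caller holds CAP_SETUID and CAP_SETGID (e.g. runs as root, mirroring the CanSetUIDGID check added below), and that /usr/bin/id exists as a convenient payload:

    package main

    import (
        "fmt"
        "os"
        "os/exec"
        "syscall"
    )

    func main() {
        const nobody = 65534
        cmd := exec.Command("/usr/bin/id")
        cmd.Stdout = os.Stdout
        cmd.Stderr = os.Stderr
        cmd.SysProcAttr = &syscall.SysProcAttr{
            // New user namespace for the child.
            Cloneflags: syscall.CLONE_NEWUSER,
            // Map nobody in the child namespace to nobody in the parent,
            // and nothing else; root is left unmapped.
            UidMappings: []syscall.SysProcIDMap{{ContainerID: nobody, HostID: nobody, Size: 1}},
            GidMappings: []syscall.SysProcIDMap{{ContainerID: nobody, HostID: nobody, Size: 1}},
            // Switch the child to the only mapped user/group before exec.
            Credential: &syscall.Credential{Uid: nobody, Gid: nobody},
        }
        if err := cmd.Run(); err != nil {
            fmt.Fprintln(os.Stderr, "run:", err)
        }
    }

Run as root, id should report uid=65534 and gid=65534, which is the same confinement the sandbox process gets when it does not need host network or direct file access.)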
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e0fadefcd..dd5a0aa56 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -373,8 +373,8 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // User namespace depends on the following options: // - Host network/filesystem: requires to run inside the user namespace // specified in the spec or the current namespace if none is configured. - // - Gofer: when using a Gofer, the sandbox process can run isolated in an - // empty namespace. + // - Gofer: when using a Gofer, the sandbox process can run isolated in a + // new user namespace with only the "nobody" user and group. if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { if userns, ok := specutils.GetNS(specs.UserNamespace, spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) @@ -391,6 +391,34 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } else { log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + + if conf.TestOnlyAllowRunAsCurrentUser { + log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) + } else if specutils.CanSetUIDGID() { + // If we have CAP_SETUID and CAP_SETGID, then we can also run + // as user nobody. + + // Map nobody in the new namespace to nobody in the parent namespace. + const nobody = 65534 + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{{ + ContainerID: int(nobody), + HostID: int(nobody), + Size: int(1), + }} + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{{ + ContainerID: int(nobody), + HostID: int(nobody), + Size: int(1), + }} + + // Set credentials to run as user and group nobody. + cmd.SysProcAttr.Credential = &syscall.Credential{ + Uid: nobody, + Gid: nobody, + } + } else { + return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") + } } // Log the fds we are donating to the sandbox process. @@ -399,6 +427,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) + log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr) if err := specutils.StartInNS(cmd, nss); err != nil { return err } diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 97a504b20..e73b2293f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -18,6 +18,7 @@ go_library( "//pkg/sentry/kernel/auth", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 80eaad965..356943a65 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -23,6 +23,7 @@ import ( "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/log" ) @@ -202,3 +203,16 @@ func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { }) } } + +// CanSetUIDGID returns true if the user has SETUID and SETGID capabilities. 
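// Note that the effective capability set (not the permitted set) is what is
// consulted: if runsc lacks CAP_SETUID or CAP_SETGID in its effective set and
// TestOnlyAllowRunAsCurrentUser is unset, createSandboxProcess above returns
// an error instead of silently leaving root mapped into the sandbox's user
// namespace.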
+func CanSetUIDGID() bool { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + return false + } + if err := caps.Load(); err != nil { + return false + } + return caps.Get(capability.EFFECTIVE, capability.CAP_SETUID) && + caps.Get(capability.EFFECTIVE, capability.CAP_SETGID) +} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4429b981b..77bd56912 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -104,13 +104,14 @@ func FindFile(path string) (string, error) { // TestConfig return the default configuration to use in tests. func TestConfig() *boot.Config { return &boot.Config{ - Debug: true, - LogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - Strace: true, - MultiContainer: true, - FileAccess: boot.FileAccessProxyExclusive, + Debug: true, + LogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + Strace: true, + MultiContainer: true, + FileAccess: boot.FileAccessProxyExclusive, + TestOnlyAllowRunAsCurrentUser: true, } } -- cgit v1.2.3 From f96b33c73c2150632a8a1ba22b1a420ec1f1214d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 5 Sep 2018 13:00:08 -0700 Subject: runsc: Promote getExecutablePathInternal to getExecutablePath. Remove GetExecutablePath (the non-internal version). This makes path handling more consistent between exec, root, and child containers. The new getExecutablePath now uses MountNamespace.FindInode, which is more robust than Walking the Dirent tree ourselves. This also removes the last use of lstat(2) in the sentry, so that can be removed from the filters. PiperOrigin-RevId: 211683110 Change-Id: Ic8ec960fc1c267aa7d310b8efe6e900c88a9207a --- runsc/boot/controller.go | 10 ++++++ runsc/boot/filter/config.go | 5 +-- runsc/boot/fs.go | 65 ++++++++++++++++++++++++++------------- runsc/boot/loader.go | 31 +++++++++---------- runsc/cmd/exec.go | 10 ------ runsc/container/container_test.go | 3 -- runsc/specutils/specutils.go | 31 ------------------- 7 files changed, 69 insertions(+), 86 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fdb6be5b1..ec1110059 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -228,6 +228,16 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // Execute runs a command on a created or running sandbox. func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { log.Debugf("containerManager.Execute: %+v", *e) + + if e.Filename == "" { + rootCtx := cm.l.rootProcArgs.NewContext(cm.l.k) + rootMns := cm.l.k.RootMountNamespace() + var err error + if e.Filename, err = getExecutablePath(rootCtx, rootMns, e.Argv[0], e.Envv); err != nil { + return fmt.Errorf("error getting executable path for %q: %v", e.Argv[0], err) + } + } + proc := control.Proc{Kernel: cm.l.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 113023bdd..f864b1f45 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -164,10 +164,7 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowAny{}, /* winsize struct */ }, }, - syscall.SYS_LSEEK: {}, - // TODO: Remove SYS_LSTAT when executable lookup moves - // into the gofer. 
- syscall.SYS_LSTAT: {}, + syscall.SYS_LSEEK: {}, syscall.SYS_MADVISE: {}, syscall.SYS_MINCORE: {}, syscall.SYS_MMAP: []seccomp.Rule{ diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 20d0e42ef..4a11b30f1 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -701,13 +701,13 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe return nil } -// GetExecutablePathInternal traverses the *container's* filesystem to resolve -// exec's absolute path. For example, if the container is being served files by -// the fsgofer serving /foo/bar as the container root, it will search within +// getExecutablePath traverses the *container's* filesystem to resolve exec's +// absolute path. For example, if the container is being served files by the +// fsgofer serving /foo/bar as the container root, it will search within // /foo/bar, not the host root. // TODO: Unit test this. -func GetExecutablePathInternal(ctx context.Context, procArgs *kernel.CreateProcessArgs) (string, error) { - exec := filepath.Clean(procArgs.Filename) +func getExecutablePath(ctx context.Context, mns *fs.MountNamespace, filename string, env []string) (string, error) { + exec := filepath.Clean(filename) // Don't search PATH if exec is a path to a file (absolute or relative). if strings.IndexByte(exec, '/') >= 0 { @@ -716,31 +716,52 @@ func GetExecutablePathInternal(ctx context.Context, procArgs *kernel.CreateProce // Search the PATH for a file whose name matches the one we are looking // for. - pathDirs := specutils.GetPath(procArgs.Envv) + pathDirs := specutils.GetPath(env) for _, p := range pathDirs { - // Walk to the end of the path. - curDir := procArgs.Root - for _, pc := range strings.Split(p, "/") { - var err error - if curDir, err = curDir.Walk(ctx, curDir, pc); err != nil { - break - } - } - if curDir == nil { + // Try to find the binary inside path p. + binPath := path.Join(p, filename) + root := fs.RootFromContext(ctx) + defer root.DecRef() + d, err := mns.FindInode(ctx, root, nil, binPath, linux.MaxSymlinkTraversals) + if err == syserror.ENOENT || err == syserror.EACCES { continue } - // Check for the executable in the path directory. - dirent, err := curDir.Walk(ctx, curDir, exec) if err != nil { - continue + return "", fmt.Errorf("FindInode(%q) failed: %v", binPath, err) } - // Check whether we can read and execute the file in question. - if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil { - log.Infof("Found executable at %q, but user cannot execute it: %v", path.Join(p, exec), err) + defer d.DecRef() + + // Check whether we can read and execute the found file. + if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil { + log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) continue } return path.Join("/", p, exec), nil } - return "", fmt.Errorf("could not find executable %s in path %v", exec, pathDirs) + return "", fmt.Errorf("could not find executable %q in path %v", exec, pathDirs) +} + +// setExecutablePath sets the procArgs.Filename by searching the PATH for an +// executable matching the procArgs.Argv[0]. +func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { + if procArgs.Filename != "" { + // Sanity check. + if !path.IsAbs(procArgs.Filename) { + return fmt.Errorf("filename must be absolute: %q", procArgs.Filename) + } + // Nothing to set. 
+ return nil + } + + if len(procArgs.Argv) == 0 { + return fmt.Errorf("Argv must not be empty") + } + + f, err := getExecutablePath(ctx, mns, procArgs.Argv[0], procArgs.Envv) + if err != nil { + return err + } + procArgs.Filename = f + return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 74d0c2534..2733c4d69 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -271,16 +271,8 @@ func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSName return kernel.CreateProcessArgs{}, fmt.Errorf("error creating limits: %v", err) } - // Get the executable path, which is a bit tricky because we have to - // inspect the environment PATH which is relative to the root path. - exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env) - if err != nil { - return kernel.CreateProcessArgs{}, fmt.Errorf("error getting executable path: %v", err) - } - // Create the process arguments. procArgs := kernel.CreateProcessArgs{ - Filename: exec, Argv: spec.Process.Args, Envv: spec.Process.Env, WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. @@ -365,7 +357,7 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { - err := setFileSystemForProcess( + if err := setFileSystemForProcess( &l.rootProcArgs, l.spec, l.conf, @@ -374,10 +366,16 @@ func (l *Loader) run() error { l.rootProcArgs.Credentials, l.rootProcArgs.Limits, l.k, - "" /* CID, which isn't needed for the root container */) - if err != nil { + "" /* CID, which isn't needed for the root container */); err != nil { return err } + + rootCtx := l.rootProcArgs.NewContext(l.k) + rootMns := l.k.RootMountNamespace() + if err := setExecutablePath(rootCtx, rootMns, &l.rootProcArgs); err != nil { + return fmt.Errorf("error setting executable path for %+v: %v", l.rootProcArgs, err) + } + // Create the root container init task. if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("failed to create init process: %v", err) @@ -443,7 +441,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config ioFDs = append(ioFDs, fd) } - err = setFileSystemForProcess( + if err := setFileSystemForProcess( &procArgs, spec, conf, @@ -452,13 +450,14 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config creds, procArgs.Limits, k, - cid) - if err != nil { + cid); err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } - if procArgs.Filename, err = GetExecutablePathInternal(procArgs.NewContext(k), &procArgs); err != nil { - return 0, err + ctx := procArgs.NewContext(l.k) + mns := k.RootMountNamespace() + if err := setExecutablePath(ctx, mns, &procArgs); err != nil { + return 0, fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) } tg, err := l.k.CreateProcess(procArgs) diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index b84a80119..966d2e258 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -140,16 +140,6 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } - // Get the executable path, which is a bit tricky because we have to - // inspect the environment PATH which is relative to the root path. - // If the user is overriding environment variables, PATH may have been - // overwritten. 
- rootPath := c.Spec.Root.Path - e.Filename, err = specutils.GetExecutablePath(e.Argv[0], rootPath, e.Envv) - if err != nil { - Fatalf("error getting executable path: %v", err) - } - ws, err := c.Execute(e) if err != nil { Fatalf("error getting processes for container: %v", err) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index e7e53c492..5f452acbf 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -521,7 +521,6 @@ func TestExec(t *testing.T) { execArgs := control.ExecArgs{ Filename: "/bin/sleep", Argv: []string{"sleep", "5"}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, WorkingDirectory: "/", KUID: uid, } @@ -889,7 +888,6 @@ func TestPauseResume(t *testing.T) { execArgs := control.ExecArgs{ Filename: "/bin/bash", Argv: []string{"bash", "-c", script}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, WorkingDirectory: "/", KUID: uid, } @@ -1070,7 +1068,6 @@ func TestCapabilities(t *testing.T) { execArgs := control.ExecArgs{ Filename: exePath, Argv: []string{exePath}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, WorkingDirectory: "/", KUID: uid, KGID: gid, diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 3234cc088..551718e9a 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -163,37 +163,6 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) return &spec, nil } -// GetExecutablePath returns the absolute path to the executable, relative to -// the root. It searches the environment PATH for the first file that exists -// with the given name. -// TODO: Remove this in favor of finding executables via -// boot.GetExecutablePathInternal. -func GetExecutablePath(exec, root string, env []string) (string, error) { - exec = filepath.Clean(exec) - - // Don't search PATH if exec is a path to a file (absolute or relative). - if strings.IndexByte(exec, '/') >= 0 { - return exec, nil - } - - // Search the PATH for a file whose name matches the one we are looking - // for. - path := GetPath(env) - for _, p := range path { - abs := filepath.Join(root, p, exec) - // Do not follow symlink link because the target is in the container - // root filesystem. - if _, err := os.Lstat(abs); err == nil { - // We found it! Return the path relative to the root. - return filepath.Join("/", p, exec), nil - } - } - - // Could not find a suitable path, just return the original string. - log.Warningf("could not find executable %s in path %s", exec, path) - return exec, nil -} - // GetPath returns the PATH as a slice of strings given the environemnt // variables. 
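// GetPath presumably splits the PATH entry of the environment on ':' (its
// body is unchanged by this diff and not shown); e.g.
// "PATH=/usr/local/bin:/usr/bin:/bin" would yield ["/usr/local/bin", "/usr/bin", "/bin"],
// which getExecutablePath earlier in this patch probes one directory at a
// time via MountNamespace.FindInode inside the container's mount namespace.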
func GetPath(env []string) []string { -- cgit v1.2.3 From 1d22d87fdc464b0641eca69f730777c27984c2ff Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 5 Sep 2018 13:12:15 -0700 Subject: Move multi-container test to a single file PiperOrigin-RevId: 211685288 Change-Id: I7872f2a83fcaaa54f385e6e567af6e72320c5aa0 --- runsc/container/BUILD | 1 + runsc/container/container_test.go | 195 ---------------------------- runsc/container/multi_container_test.go | 220 ++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+), 195 deletions(-) create mode 100644 runsc/container/multi_container_test.go (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index b86974d41..efdf43175 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -38,6 +38,7 @@ go_test( srcs = [ "container_test.go", "fs_test.go", + "multi_container_test.go", ], data = [ ":uds_test_app", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 5f452acbf..c00db3e91 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -38,7 +38,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1366,200 +1365,6 @@ func TestAbbreviatedIDs(t *testing.T) { } } -// TestMultiContainerSanity checks that it is possible to run 2 dead-simple -// containers in the same sandbox. -func TestMultiContainerSanity(t *testing.T) { - for _, conf := range configs(all...) { - t.Logf("Running test with conf: %+v", conf) - - containerIDs := []string{ - testutil.UniqueContainerID(), - testutil.UniqueContainerID(), - } - containerAnnotations := []map[string]string{ - // The first container creates a sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, - }, - // The second container creates a container within the first - // container's sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, - specutils.ContainerdSandboxIDAnnotation: containerIDs[0], - }, - } - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - // Setup the containers. - containers := make([]*Container, 0, len(containerIDs)) - for i, annotations := range containerAnnotations { - spec := testutil.NewSpecWithArgs("sleep", "100") - spec.Annotations = annotations - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) - } - - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: 0, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } - - // Check via ps that multiple processes are running. 
- if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - } -} - -func TestMultiContainerWait(t *testing.T) { - t.Skip("Test is flakey.") // TODO: Remove. - containerIDs := []string{ - testutil.UniqueContainerID(), - testutil.UniqueContainerID(), - } - containerAnnotations := []map[string]string{ - // The first container creates a sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, - }, - // The second container creates a container within the first - // container's sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, - specutils.ContainerdSandboxIDAnnotation: containerIDs[0], - }, - } - args := [][]string{ - // The first container should run the entire duration of the - // test. - {"sleep", "100"}, - // We'll wait on the second container, which is much shorter - // lived. - {"sleep", "1"}, - } - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - // Setup the containers. - containers := make([]*Container, 0, len(containerIDs)) - for i, annotations := range containerAnnotations { - spec := testutil.NewSpecWithArgs(args[i][0], args[i][1]) - spec.Annotations = annotations - conf := testutil.TestConfig() - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) - } - - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: 0, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } - - // Check via ps that multiple processes are running. - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - - // Wait on the short lived container from multiple goroutines. - wg := sync.WaitGroup{} - for i := 0; i < 3; i++ { - wg.Add(1) - go func() { - defer wg.Done() - if ws, err := containers[1].Wait(); err != nil { - t.Errorf("failed to wait for process %q: %v", strings.Join(containers[1].Spec.Process.Args, " "), err) - } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("process %q exited with non-zero status %d", strings.Join(containers[1].Spec.Process.Args, " "), es) - } - if _, err := containers[1].Wait(); err == nil { - t.Errorf("wait for stopped process %q should fail", strings.Join(containers[1].Spec.Process.Args, " ")) - } - - // After Wait returns, ensure that the root container is running and - // the child has finished. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { - t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) - } - }() - } - - // Also wait via PID. 
- for i := 0; i < 3; i++ { - wg.Add(1) - go func() { - defer wg.Done() - const pid = 2 - if ws, err := containers[0].WaitPID(pid); err != nil { - t.Errorf("failed to wait for PID %d: %v", pid, err) - } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("PID %d exited with non-zero status %d", pid, es) - } - if _, err := containers[0].WaitPID(pid); err == nil { - t.Errorf("wait for stopped PID %d should fail", pid) - } - }() - } - - wg.Wait() -} - // Check that modifications to a volume mount are propigated into and out of // the sandbox. func TestContainerVolumeContentsShared(t *testing.T) { diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go new file mode 100644 index 000000000..34e27383e --- /dev/null +++ b/runsc/container/multi_container_test.go @@ -0,0 +1,220 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "os" + "strings" + "sync" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// TestMultiContainerSanity checks that it is possible to run 2 dead-simple +// containers in the same sandbox. +func TestMultiContainerSanity(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + containerIDs := []string{ + testutil.UniqueContainerID(), + testutil.UniqueContainerID(), + } + containerAnnotations := []map[string]string{ + // The first container creates a sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + }, + // The second container creates a container within the first + // container's sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: containerIDs[0], + }, + } + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Setup the containers. + containers := make([]*Container, 0, len(containerIDs)) + for i, annotations := range containerAnnotations { + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Annotations = annotations + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: 0, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Check via ps that multiple processes are running. 
+ if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + } +} + +func TestMultiContainerWait(t *testing.T) { + t.Skip("Test is flakey.") // TODO: Remove. + containerIDs := []string{ + testutil.UniqueContainerID(), + testutil.UniqueContainerID(), + } + containerAnnotations := []map[string]string{ + // The first container creates a sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + }, + // The second container creates a container within the first + // container's sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: containerIDs[0], + }, + } + args := [][]string{ + // The first container should run the entire duration of the + // test. + {"sleep", "100"}, + // We'll wait on the second container, which is much shorter + // lived. + {"sleep", "1"}, + } + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Setup the containers. + containers := make([]*Container, 0, len(containerIDs)) + for i, annotations := range containerAnnotations { + spec := testutil.NewSpecWithArgs(args[i][0], args[i][1]) + spec.Annotations = annotations + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: 0, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Check via ps that multiple processes are running. + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Wait on the short lived container from multiple goroutines. + wg := sync.WaitGroup{} + for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + defer wg.Done() + if ws, err := containers[1].Wait(); err != nil { + t.Errorf("failed to wait for process %q: %v", strings.Join(containers[1].Spec.Process.Args, " "), err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %q exited with non-zero status %d", strings.Join(containers[1].Spec.Process.Args, " "), es) + } + if _, err := containers[1].Wait(); err == nil { + t.Errorf("wait for stopped process %q should fail", strings.Join(containers[1].Spec.Process.Args, " ")) + } + + // After Wait returns, ensure that the root container is running and + // the child has finished. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) + } + }() + } + + // Also wait via PID. 
+ for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + defer wg.Done() + const pid = 2 + if ws, err := containers[0].WaitPID(pid); err != nil { + t.Errorf("failed to wait for PID %d: %v", pid, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("PID %d exited with non-zero status %d", pid, es) + } + if _, err := containers[0].WaitPID(pid); err == nil { + t.Errorf("wait for stopped PID %d should fail", pid) + } + }() + } + + wg.Wait() +} -- cgit v1.2.3 From 4b57fd920d2d9fe3c8351d5b73b496902c928d95 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 5 Sep 2018 13:16:49 -0700 Subject: Add MADVISE to fsgofer seccomp profile PiperOrigin-RevId: 211686037 Change-Id: I0e776ca760b65ba100e495f471b6e811dbd6590a --- runsc/fsgofer/filter/config.go | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 97e556ceb..0a1c63753 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -105,6 +105,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_GETTIMEOFDAY: {}, syscall.SYS_LINKAT: {}, syscall.SYS_LSEEK: {}, + syscall.SYS_MADVISE: {}, syscall.SYS_MKDIRAT: {}, syscall.SYS_MMAP: []seccomp.Rule{ { -- cgit v1.2.3 From 0c7cfca0da234ae34497c420a23fea91a47a566c Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 5 Sep 2018 14:01:25 -0700 Subject: Running container should have a valid sandbox PiperOrigin-RevId: 211693868 Change-Id: Iea340dd78bf26ae6409c310b63c17cc611c2055f --- runsc/container/container.go | 5 +- runsc/container/multi_container_test.go | 165 ++++++++++++-------------------- 2 files changed, 64 insertions(+), 106 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 725b4d347..a3454eb8f 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -517,6 +517,7 @@ func (c *Container) Destroy() error { log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) } } + c.Status = Stopped c.Sandbox = nil if c.GoferPid != 0 { @@ -536,15 +537,11 @@ func (c *Container) Destroy() error { return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) } - c.Status = Stopped return nil } // IsRunning returns true if the sandbox or gofer process is running. func (c *Container) IsRunning() bool { - if c.Status == Stopped { - return false - } if c.Sandbox != nil && c.Sandbox.IsRunning() { return true } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 34e27383e..cf5140b4e 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -20,34 +20,42 @@ import ( "sync" "testing" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) +func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { + var specs []*specs.Spec + var ids []string + rootID := testutil.UniqueContainerID() + + for i, cmd := range cmds { + spec := testutil.NewSpecWithArgs(cmd...) 
+ if i == 0 { + spec.Annotations = map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + } + ids = append(ids, rootID) + } else { + spec.Annotations = map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: rootID, + } + ids = append(ids, testutil.UniqueContainerID()) + } + specs = append(specs, spec) + } + return specs, ids +} + // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) - containerIDs := []string{ - testutil.UniqueContainerID(), - testutil.UniqueContainerID(), - } - containerAnnotations := []map[string]string{ - // The first container creates a sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, - }, - // The second container creates a container within the first - // container's sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, - specutils.ContainerdSandboxIDAnnotation: containerIDs[0], - }, - } - rootDir, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) @@ -55,16 +63,16 @@ func TestMultiContainerSanity(t *testing.T) { defer os.RemoveAll(rootDir) // Setup the containers. - containers := make([]*Container, 0, len(containerIDs)) - for i, annotations := range containerAnnotations { - spec := testutil.NewSpecWithArgs("sleep", "100") - spec.Annotations = annotations + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep) + var containers []*Container + for i, spec := range specs { bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(bundleDir) - cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -75,24 +83,11 @@ func TestMultiContainerSanity(t *testing.T) { containers = append(containers, cont) } + // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: 0, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, + {PID: 1, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep"}, } - - // Check via ps that multiple processes are running. if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -100,50 +95,28 @@ func TestMultiContainerSanity(t *testing.T) { } func TestMultiContainerWait(t *testing.T) { - t.Skip("Test is flakey.") // TODO: Remove. - containerIDs := []string{ - testutil.UniqueContainerID(), - testutil.UniqueContainerID(), - } - containerAnnotations := []map[string]string{ - // The first container creates a sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, - }, - // The second container creates a container within the first - // container's sandbox. 
- map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, - specutils.ContainerdSandboxIDAnnotation: containerIDs[0], - }, - } - args := [][]string{ - // The first container should run the entire duration of the - // test. - {"sleep", "100"}, - // We'll wait on the second container, which is much shorter - // lived. - {"sleep", "1"}, - } - rootDir, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } defer os.RemoveAll(rootDir) + // The first container should run the entire duration of the test. + cmd1 := []string{"sleep", "100"} + // We'll wait on the second container, which is much shorter lived. + cmd2 := []string{"sleep", "1"} + specs, ids := createSpecs(cmd1, cmd2) + // Setup the containers. - containers := make([]*Container, 0, len(containerIDs)) - for i, annotations := range containerAnnotations { - spec := testutil.NewSpecWithArgs(args[i][0], args[i][1]) - spec.Annotations = annotations + var containers []*Container + for i, spec := range specs { conf := testutil.TestConfig() bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } defer os.RemoveAll(bundleDir) - cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -154,24 +127,11 @@ func TestMultiContainerWait(t *testing.T) { containers = append(containers, cont) } + // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: 0, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, + {PID: 1, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep"}, } - - // Check via ps that multiple processes are running. if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -180,41 +140,42 @@ func TestMultiContainerWait(t *testing.T) { wg := sync.WaitGroup{} for i := 0; i < 3; i++ { wg.Add(1) - go func() { + go func(c *Container) { defer wg.Done() - if ws, err := containers[1].Wait(); err != nil { - t.Errorf("failed to wait for process %q: %v", strings.Join(containers[1].Spec.Process.Args, " "), err) + if ws, err := c.Wait(); err != nil { + t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("process %q exited with non-zero status %d", strings.Join(containers[1].Spec.Process.Args, " "), es) + t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) } - if _, err := containers[1].Wait(); err == nil { - t.Errorf("wait for stopped process %q should fail", strings.Join(containers[1].Spec.Process.Args, " ")) + if _, err := c.Wait(); err == nil { + t.Errorf("wait for stopped process %s should fail", c.Spec.Process.Args) } - - // After Wait returns, ensure that the root container is running and - // the child has finished. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { - t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) - } - }() + }(containers[1]) } // Also wait via PID. 
for i := 0; i < 3; i++ { wg.Add(1) - go func() { + go func(c *Container) { defer wg.Done() const pid = 2 - if ws, err := containers[0].WaitPID(pid); err != nil { + if ws, err := c.WaitPID(pid); err != nil { t.Errorf("failed to wait for PID %d: %v", pid, err) } else if es := ws.ExitStatus(); es != 0 { t.Errorf("PID %d exited with non-zero status %d", pid, es) } - if _, err := containers[0].WaitPID(pid); err == nil { + if _, err := c.WaitPID(pid); err == nil { t.Errorf("wait for stopped PID %d should fail", pid) } - }() + // TODO: use 'container[1]' when PID namespace is supported. + }(containers[0]) } wg.Wait() + + // After Wait returns, ensure that the root container is running and + // the child has finished. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) + } } -- cgit v1.2.3 From 12aef686af3f37029e619602286f00a40144c52d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 5 Sep 2018 14:28:52 -0700 Subject: Enabled bind mounts in sub-containers With multi-gofers, bind mounts in sub-containers should just work. Removed restrictions and added test. There are also a few cleanups along the way, e.g. retry unmounting in case cleanup races with gofer teardown. PiperOrigin-RevId: 211699569 Change-Id: Ic0a69c29d7c31cd7e038909cc686c6ac98703374 --- runsc/boot/fds.go | 5 --- runsc/boot/fs.go | 5 --- runsc/container/container.go | 36 +++++++++++++------- runsc/container/multi_container_test.go | 58 +++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 22 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 0449e243d..9de5a78b1 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -28,11 +28,6 @@ import ( // createFDMap creates an fd map that contains stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host fd. -// -// TODO: We currently arn't passing any FDs in to the sandbox, so -// there's not much else for this function to do. It will get more complicated -// when gofers enter the picture. Also the LISTEN_FDS environment variable -// allows passing arbitrary FDs to the sandbox, which we do not yet support. func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) { fdm := k.NewFDMap() defer fdm.DecRef() diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 4a11b30f1..772df40fe 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -685,11 +685,6 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // Mount all submounts. mounts := compileMounts(spec) for _, m := range mounts { - // TODO: Enable bind mounts in child containers. 
- if m.Type == bind { - log.Infof("Bind mounts in child containers are not yet supported: %+v", m) - continue - } dest := filepath.Join(containerRoot, m.Destination) if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil { return fmt.Errorf("error mounting filesystem for container: %v", err) diff --git a/runsc/container/container.go b/runsc/container/container.go index a3454eb8f..a4a3ed56d 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -520,24 +520,35 @@ func (c *Container) Destroy() error { c.Status = Stopped c.Sandbox = nil + if err := c.destroyGofer(); err != nil { + return fmt.Errorf("error destroying gofer: %v", err) + } + + if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) + } + + return nil +} + +func (c *Container) destroyGofer() error { if c.GoferPid != 0 { log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { log.Warningf("error sending signal %d to pid %d: %v", syscall.SIGKILL, c.GoferPid, err) - } else { - c.GoferPid = 0 } } - if err := destroyFS(c.Spec); err != nil { - return fmt.Errorf("error destroying container fs: %v", err) - } - - if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) + // Gofer process may take some time to teardown. Retry in case of failure. + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + err := backoff.Retry(func() error { return destroyFS(c.Spec) }, b) + if err == nil { + // Success! + c.GoferPid = 0 } - - return nil + return err } // IsRunning returns true if the sandbox or gofer process is running. @@ -549,8 +560,9 @@ func (c *Container) IsRunning() bool { // Send a signal 0 to the gofer process. if err := syscall.Kill(c.GoferPid, 0); err == nil { log.Warningf("Found orphan gofer process, pid: %d", c.GoferPid) - // Attempt to kill gofer if it's orphan. - syscall.Kill(c.GoferPid, syscall.SIGKILL) + if err := c.destroyGofer(); err != nil { + log.Warningf("Error destroying gofer: %v", err) + } // Don't wait for gofer to die. Return 'running' and hope gofer is dead // next time around. diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index cf5140b4e..3bdfbaca3 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -15,7 +15,9 @@ package container import ( + "io/ioutil" "os" + "path/filepath" "strings" "sync" "testing" @@ -179,3 +181,59 @@ func TestMultiContainerWait(t *testing.T) { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) } } + +// TestMultiContainerMount tests that bind mounts can be used with multiple +// containers. +func TestMultiContainerMount(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + cmd1 := []string{"sleep", "100"} + + // 'src != dst' ensures that 'dst' doesn't exist in the host and must be + // properly mapped inside the container to work. 
+ src, err := ioutil.TempDir(testutil.TmpDir(), "container") + if err != nil { + t.Fatal("ioutil.TempDir failed:", err) + } + dst := src + ".dst" + cmd2 := []string{"touch", filepath.Join(dst, "file")} + + sps, ids := createSpecs(cmd1, cmd2) + sps[1].Mounts = append(sps[1].Mounts, specs.Mount{ + Source: src, + Destination: dst, + Type: "bind", + }) + + // Setup the containers. + var containers []*Container + for i, spec := range sps { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + ws, err := containers[1].Wait() + if err != nil { + t.Error("error waiting on container:", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Error("container failed, waitStatus:", ws) + } +} -- cgit v1.2.3 From 5f0002fc83a77a39d9a2ef1443bc6c18e22ea779 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 5 Sep 2018 18:31:37 -0700 Subject: Use container's capabilities in exec When no capabilities are specified in exec, use the container's capabilities to match runc's behavior. PiperOrigin-RevId: 211735186 Change-Id: Icd372ed64410c81144eae94f432dffc9fe3a86ce --- runsc/cmd/exec.go | 28 ++++++++++---- runsc/test/image/image_test.go | 8 ++-- runsc/test/integration/BUILD | 1 + runsc/test/integration/exec_test.go | 62 ++++++++++++++++++++++++++++++ runsc/test/integration/integration_test.go | 2 +- runsc/test/testutil/docker.go | 10 ++--- 6 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 runsc/test/integration/exec_test.go (limited to 'runsc') diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 966d2e258..da1642c08 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -115,16 +115,22 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("error loading sandbox: %v", err) } + // Replace empty settings with defaults from container. if e.WorkingDirectory == "" { e.WorkingDirectory = c.Spec.Process.Cwd } - if e.Envv == nil { e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env) if err != nil { Fatalf("error getting environment variables: %v", err) } } + if e.Capabilities == nil { + e.Capabilities, err = specutils.Capabilities(c.Spec.Process.Capabilities) + if err != nil { + Fatalf("error creating capabilities: %v", err) + } + } // containerd expects an actual process to represent the container being // executed. If detach was specified, starts a child in non-detach mode, @@ -265,9 +271,13 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { extraKGIDs = append(extraKGIDs, auth.KGID(kgid)) } - caps, err := capabilities(ex.caps) - if err != nil { - return nil, fmt.Errorf("capabilities error: %v", err) + var caps *auth.TaskCapabilities + if len(ex.caps) > 0 { + var err error + caps, err = capabilities(ex.caps) + if err != nil { + return nil, fmt.Errorf("capabilities error: %v", err) + } } return &control.ExecArgs{ @@ -299,9 +309,13 @@ func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) { // to ExecArgs. func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { // Create capabilities. 
- caps, err := specutils.Capabilities(p.Capabilities) - if err != nil { - return nil, fmt.Errorf("error creating capabilities: %v", err) + var caps *auth.TaskCapabilities + if p.Capabilities != nil { + var err error + caps, err = specutils.Capabilities(p.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } } // Convert the spec's additional GIDs to KGIDs. diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index b1e5e726a..13fd8f1ee 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -44,7 +44,7 @@ func TestHelloWorld(t *testing.T) { } defer d.CleanUp() - if err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { + if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { t.Fatalf("docker didn't say hello: %v", err) } } @@ -160,7 +160,7 @@ func TestMysql(t *testing.T) { defer d.CleanUp() // Wait until it's up and running. - if err := d.WaitForOutput("port: 3306 MySQL Community Server", 3*time.Minute); err != nil { + if _, err := d.WaitForOutput("port: 3306 MySQL Community Server", 3*time.Minute); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } @@ -184,10 +184,10 @@ func TestMysql(t *testing.T) { defer client.CleanUp() // Ensure file executed to the end and shutdown mysql. - if err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil { + if _, err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } - if err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil { + if _, err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil { t.Fatalf("docker.WaitForOutput() timeout: %v", err) } } diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index b366fe936..4407016ad 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -6,6 +6,7 @@ go_test( name = "integration_test", size = "large", srcs = [ + "exec_test.go", "integration_test.go", ], embed = [":integration"], diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go new file mode 100644 index 000000000..6053ecd1c --- /dev/null +++ b/runsc/test/integration/exec_test.go @@ -0,0 +1,62 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package image provides end-to-end integration tests for runsc. These tests require +// docker and runsc to be installed on the machine. To set it up, run: +// +// ./runsc/test/install.sh [--runtime ] +// +// The tests expect the runtime name to be provided in the RUNSC_RUNTIME +// environment variable (default: runsc-test). +// +// Each test calls docker commands to start up a container, and tests that it is +// behaving properly, with various runsc commands. The container is killed and deleted +// at the end. 
+ +package integration + +import ( + "testing" + "time" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +func TestExecCapabilities(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("exec-test") + + // Start the container. + if _, err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + want, err := d.WaitForOutput("CapEff:\t[0-9a-f]+\n", 5*time.Second) + if err != nil { + t.Fatalf("WaitForOutput() timeout: %v", err) + } + t.Log("Root capabilities:", want) + + // Now check that exec'd process capabilities match the root. + got, err := d.Exec("grep", "CapEff:", "/proc/self/status") + if err != nil { + t.Fatalf("docker exec failed: %v", err) + } + if got != want { + t.Errorf("wrong capabilities, got: %q, want: %q", got, want) + } +} diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index c6b546a56..59df5bd7c 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -179,7 +179,7 @@ func TestConnectToSelf(t *testing.T) { if want := "server\n"; reply != want { t.Errorf("Error on server, want: %q, got: %q", want, reply) } - if err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil { + if _, err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil { t.Fatal("docker.WaitForOutput(client) timeout:", err) } } diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index fc67c174a..c73bb0406 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -218,20 +218,20 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) { // WaitForOutput calls 'docker logs' to retrieve containers output and searches // for the given pattern. -func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) error { +func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { re := regexp.MustCompile(pattern) var out string for exp := time.Now().Add(timeout); time.Now().Before(exp); { var err error out, err = do("logs", d.Name) if err != nil { - return err + return "", err } - if re.MatchString(out) { + if match := re.FindString(out); match != "" { // Success! - return nil + return match, nil } time.Sleep(100 * time.Millisecond) } - return fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) + return "", fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) } -- cgit v1.2.3 From 8f0b6e7fc02919df034dea9e9c9dbab1b80de2be Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 5 Sep 2018 21:13:46 -0700 Subject: runsc: Support runsc kill multi-container. Now, we can kill individual containers rather than the entire sandbox. 
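For illustration, a minimal, self-contained sketch of the per-container signal
lookup this change introduces. The types below are simplified stand-ins for the
loader's real bookkeeping (kernel.ThreadID, TaskSet, containerRootTGIDs), so
treat it as an outline of the idea rather than the actual gVisor code.

// signal_sketch.go: kill one container's init process without touching
// the rest of the sandbox.
package main

import (
	"fmt"
	"syscall"
)

// sandbox records the root thread group ID of every container it hosts,
// keyed by container ID.
type sandbox struct {
	containerRootTGIDs map[string]int
}

// signal delivers signo to the init process of a single container,
// instead of unconditionally signalling PID 1 of the whole sandbox.
func (s *sandbox) signal(cid string, signo syscall.Signal) error {
	tgid, ok := s.containerRootTGIDs[cid]
	if !ok {
		return fmt.Errorf("failed to signal container %q: no such container", cid)
	}
	fmt.Printf("sending signal %d to thread group %d (container %q)\n", signo, tgid, cid)
	return nil
}

func main() {
	s := &sandbox{containerRootTGIDs: map[string]int{"root": 1, "child": 2}}
	// Killing the child container leaves the root container untouched.
	if err := s.signal("child", syscall.SIGKILL); err != nil {
		fmt.Println(err)
	}
}
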
PiperOrigin-RevId: 211748106 Change-Id: Ic97e91db33d53782f838338c4a6d0aab7a313ead --- runsc/boot/controller.go | 11 +-- runsc/boot/loader.go | 17 +++++ runsc/cmd/kill.go | 2 + runsc/container/container.go | 1 + runsc/container/container_test.go | 142 ++++++++++++++++++++++++++++++++++++-- 5 files changed, 156 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ec1110059..45aa255c4 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,7 +22,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -387,13 +386,5 @@ type SignalArgs struct { // Signal sends a signal to the init process of the container. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal") - // TODO: Use the cid and send the signal to the init - // process in theat container. Currently we just signal PID 1 in the - // sandbox. - si := arch.SignalInfo{Signo: args.Signo} - t := cm.l.k.TaskSet().Root.TaskWithID(1) - if t == nil { - return fmt.Errorf("cannot signal: no task with id 1") - } - return t.SendSignal(&si) + return cm.l.signal(args.CID, args.Signo) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 2733c4d69..ae2226e12 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -576,3 +577,19 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) } } + +func (l *Loader) signal(cid string, signo int32) error { + l.mu.Lock() + tgid, ok := l.containerRootTGIDs[cid] + l.mu.Unlock() + if !ok { + return fmt.Errorf("failed to signal container %q: no such container", cid) + } + + // The thread group ID of a process is the leading task's thread ID. + t := l.k.TaskSet().Root.TaskWithID(tgid) + if t == nil { + return fmt.Errorf("cannot signal: no task with ID %d", tgid) + } + return t.SendSignal(&arch.SignalInfo{Signo: signo}) +} diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 0979b002b..6fa5674f1 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -81,6 +81,8 @@ func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su if err != nil { Fatalf("%v", err) } + // TODO: Distinguish between already-exited containers and + // genuine errors. if err := c.Signal(sig); err != nil { Fatalf("%v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index a4a3ed56d..5977fbd21 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -427,6 +427,7 @@ func (c *Container) Signal(sig syscall.Signal) error { log.Warningf("container %q not running, not sending signal %v", c.ID, sig) return nil } + // TODO: Query the container for its state, then save it. 
return c.Sandbox.Signal(c.ID, sig) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index c00db3e91..00e38e12c 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -38,6 +38,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -336,8 +337,8 @@ func TestLifecycle(t *testing.T) { wg.Done() }() - // Wait a bit to ensure that we've started waiting on the container - // before we signal. + // Wait a bit to ensure that we've started waiting on the + // container before we signal. <-ch time.Sleep(100 * time.Millisecond) // Send the container a SIGTERM which will cause it to stop. @@ -347,11 +348,11 @@ func TestLifecycle(t *testing.T) { // Wait for it to die. wg.Wait() - // The sandbox process should have exited by now, but it is a zombie. - // In normal runsc usage, it will be parented to init, and init will - // reap the sandbox. However, in this case the test runner is the - // parent and will not reap the sandbox process, so we must do it - // ourselves. + // The sandbox process should have exited by now, but it is a + // zombie. In normal runsc usage, it will be parented to init, + // and init will reap the sandbox. However, in this case the + // test runner is the parent and will not reap the sandbox + // process, so we must do it ourselves. p, _ := os.FindProcess(s.Sandbox.Pid) p.Wait() g, _ := os.FindProcess(s.GoferPid) @@ -1547,6 +1548,133 @@ func TestGoferExits(t *testing.T) { } } +// TestMultiContainerSignal checks that it is possible to signal individual +// containers without killing the entire sandbox. +func TestMultiContainerSignal(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + containerIDs := []string{ + testutil.UniqueContainerID(), + testutil.UniqueContainerID(), + } + containerAnnotations := []map[string]string{ + // The first container creates a sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + }, + // The second container creates a container within the first + // container's sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: containerIDs[0], + }, + } + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Setup the containers. 
+ containers := make([]*Container, 0, len(containerIDs)) + for i, annotations := range containerAnnotations { + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Annotations = annotations + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: 0, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Check via ps that multiple processes are running. + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Kill process 2. + if err := containers[1].Signal(syscall.SIGKILL); err != nil { + t.Errorf("failed to kill process 2: %v", err) + } + + // Make sure process 1 is still running. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Now that process 2 is gone, ensure we get an error trying to + // signal it again. + if err := containers[1].Signal(syscall.SIGKILL); err == nil { + t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) + } + + // Kill process 1. + if err := containers[0].Signal(syscall.SIGKILL); err != nil { + t.Errorf("failed to kill process 1: %v", err) + } + + if err := waitForSandboxExit(containers[0]); err != nil { + t.Errorf("failed to exit sandbox: %v", err) + } + + // The sentry should be gone, so signaling should yield an + // error. + if err := containers[0].Signal(syscall.SIGKILL); err == nil { + t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) + } + } +} + +// waitForSandboxExit waits until both the sandbox and gofer processes of the +// container have exited. +func waitForSandboxExit(container *Container) error { + goferProc, _ := os.FindProcess(container.GoferPid) + state, err := goferProc.Wait() + if err != nil { + return err + } + if !state.Exited() { + return fmt.Errorf("gofer with PID %d failed to exit", container.GoferPid) + } + sandboxProc, _ := os.FindProcess(container.Sandbox.Pid) + state, err = sandboxProc.Wait() + if err != nil { + return err + } + if !state.Exited() { + return fmt.Errorf("sandbox with PID %d failed to exit", container.Sandbox.Pid) + } + return nil +} + func TestMain(m *testing.M) { testutil.RunAsRoot(m) } -- cgit v1.2.3 From d95663a6b9831b56602c09f33a9679fa15175b97 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Thu, 6 Sep 2018 10:40:53 -0700 Subject: runsc testing: Move TestMultiContainerSignal to multi_container_test. 
PiperOrigin-RevId: 211831396 Change-Id: Id67f182cb43dccb696180ec967f5b96176f252e0 --- runsc/container/container_test.go | 128 -------------------------------- runsc/container/multi_container_test.go | 100 +++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 128 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 00e38e12c..9a94347b6 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -38,7 +38,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1548,133 +1547,6 @@ func TestGoferExits(t *testing.T) { } } -// TestMultiContainerSignal checks that it is possible to signal individual -// containers without killing the entire sandbox. -func TestMultiContainerSignal(t *testing.T) { - for _, conf := range configs(all...) { - t.Logf("Running test with conf: %+v", conf) - - containerIDs := []string{ - testutil.UniqueContainerID(), - testutil.UniqueContainerID(), - } - containerAnnotations := []map[string]string{ - // The first container creates a sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, - }, - // The second container creates a container within the first - // container's sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, - specutils.ContainerdSandboxIDAnnotation: containerIDs[0], - }, - } - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - // Setup the containers. - containers := make([]*Container, 0, len(containerIDs)) - for i, annotations := range containerAnnotations { - spec := testutil.NewSpecWithArgs("sleep", "100") - spec.Annotations = annotations - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(containerIDs[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) - } - - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: 0, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } - - // Check via ps that multiple processes are running. - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - - // Kill process 2. - if err := containers[1].Signal(syscall.SIGKILL); err != nil { - t.Errorf("failed to kill process 2: %v", err) - } - - // Make sure process 1 is still running. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - - // Now that process 2 is gone, ensure we get an error trying to - // signal it again. - if err := containers[1].Signal(syscall.SIGKILL); err == nil { - t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) - } - - // Kill process 1. 
- if err := containers[0].Signal(syscall.SIGKILL); err != nil { - t.Errorf("failed to kill process 1: %v", err) - } - - if err := waitForSandboxExit(containers[0]); err != nil { - t.Errorf("failed to exit sandbox: %v", err) - } - - // The sentry should be gone, so signaling should yield an - // error. - if err := containers[0].Signal(syscall.SIGKILL); err == nil { - t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) - } - } -} - -// waitForSandboxExit waits until both the sandbox and gofer processes of the -// container have exited. -func waitForSandboxExit(container *Container) error { - goferProc, _ := os.FindProcess(container.GoferPid) - state, err := goferProc.Wait() - if err != nil { - return err - } - if !state.Exited() { - return fmt.Errorf("gofer with PID %d failed to exit", container.GoferPid) - } - sandboxProc, _ := os.FindProcess(container.Sandbox.Pid) - state, err = sandboxProc.Wait() - if err != nil { - return err - } - if !state.Exited() { - return fmt.Errorf("sandbox with PID %d failed to exit", container.Sandbox.Pid) - } - return nil -} - func TestMain(m *testing.M) { testutil.RunAsRoot(m) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 3bdfbaca3..84e0ec080 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -15,11 +15,13 @@ package container import ( + "fmt" "io/ioutil" "os" "path/filepath" "strings" "sync" + "syscall" "testing" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -237,3 +239,101 @@ func TestMultiContainerMount(t *testing.T) { t.Error("container failed, waitStatus:", ws) } } + +// TestMultiContainerSignal checks that it is possible to signal individual +// containers without killing the entire sandbox. +func TestMultiContainerSignal(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Setup the containers. + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep) + var containers []*Container + for i, spec := range specs { + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep"}, + } + + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Kill process 2. + if err := containers[1].Signal(syscall.SIGKILL); err != nil { + t.Errorf("failed to kill process 2: %v", err) + } + + // Make sure process 1 is still running. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Now that process 2 is gone, ensure we get an error trying to + // signal it again. 
+ if err := containers[1].Signal(syscall.SIGKILL); err == nil { + t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) + } + + // Kill process 1. + if err := containers[0].Signal(syscall.SIGKILL); err != nil { + t.Errorf("failed to kill process 1: %v", err) + } + + if err := waitForSandboxExit(containers[0]); err != nil { + t.Errorf("failed to exit sandbox: %v", err) + } + + // The sentry should be gone, so signaling should yield an + // error. + if err := containers[0].Signal(syscall.SIGKILL); err == nil { + t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) + } + } +} + +// waitForSandboxExit waits until both the sandbox and gofer processes of the +// container have exited. +func waitForSandboxExit(container *Container) error { + goferProc, _ := os.FindProcess(container.GoferPid) + state, err := goferProc.Wait() + if err != nil { + return err + } + if !state.Exited() { + return fmt.Errorf("gofer with PID %d failed to exit", container.GoferPid) + } + sandboxProc, _ := os.FindProcess(container.Sandbox.Pid) + state, err = sandboxProc.Wait() + if err != nil { + return err + } + if !state.Exited() { + return fmt.Errorf("sandbox with PID %d failed to exit", container.Sandbox.Pid) + } + return nil +} -- cgit v1.2.3 From efac28976c6dbf40627d02753fee1467c8272b45 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 6 Sep 2018 10:58:58 -0700 Subject: Enable network for multi-container PiperOrigin-RevId: 211834411 Change-Id: I52311a6c5407f984e5069359d9444027084e4d2a --- runsc/sandbox/network.go | 52 +++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 8694ba755..60cbbfcdb 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -57,31 +57,33 @@ const ( func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { log.Infof("Setting up network") - // HACK! - // - // When kubernetes starts a pod, it first creates a sandbox with an - // application that just pauses forever. Later, when a container is - // added to the pod, kubernetes will create another sandbox with a - // config that corresponds to the containerized application, and add it - // to the same namespaces as the pause sandbox. - // - // Running a second sandbox currently breaks because the two sandboxes - // have the same network namespace and configuration, and try to create - // a tap device on the same host device which fails. - // - // Runsc will eventually need to detect that this container is meant to - // be run in the same sandbox as the pausing application, and somehow - // make that happen. - // - // For now the following HACK disables networking for the "pause" - // sandbox, allowing the second sandbox to start up successfully. - // - // TODO: Remove this once multiple containers per sandbox - // is properly supported. - if spec.Annotations[crioContainerTypeAnnotation] == "sandbox" || - spec.Annotations[containerdContainerTypeAnnotation] == "sandbox" { - log.Warningf("HACK: Disabling network") - conf.Network = boot.NetworkNone + if !conf.MultiContainer { + // HACK! + // + // When kubernetes starts a pod, it first creates a sandbox with an + // application that just pauses forever. 
Later, when a container is + // added to the pod, kubernetes will create another sandbox with a + // config that corresponds to the containerized application, and add it + // to the same namespaces as the pause sandbox. + // + // Running a second sandbox currently breaks because the two sandboxes + // have the same network namespace and configuration, and try to create + // a tap device on the same host device which fails. + // + // Runsc will eventually need to detect that this container is meant to + // be run in the same sandbox as the pausing application, and somehow + // make that happen. + // + // For now the following HACK disables networking for the "pause" + // sandbox, allowing the second sandbox to start up successfully. + // + // TODO: Remove this once multiple containers per sandbox + // is properly supported. + if spec.Annotations[crioContainerTypeAnnotation] == "sandbox" || + spec.Annotations[containerdContainerTypeAnnotation] == "sandbox" { + log.Warningf("HACK: Disabling network") + conf.Network = boot.NetworkNone + } } switch conf.Network { -- cgit v1.2.3 From 4f3053cb4e4ec408efdce6c7174e847ae71f2f88 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 6 Sep 2018 11:04:58 -0700 Subject: runsc: do not delete in paused state. PiperOrigin-RevId: 211835570 Change-Id: Ied7933732cad5bc60b762e9c964986cb49a8d9b9 --- runsc/cmd/delete.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 46de5f348..92b609c3c 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -76,8 +76,8 @@ func (d *Delete) execute(ids []string, conf *boot.Config) error { } return fmt.Errorf("error loading container %q: %v", id, err) } - if !d.force && (c.Status == container.Running) { - return fmt.Errorf("cannot stop running container without --force flag") + if !d.force && c.Status != container.Created && c.Status != container.Stopped { + return fmt.Errorf("cannot delete container that is not stopped without --force flag") } if err := c.Destroy(); err != nil { return fmt.Errorf("error destroying container: %v", err) -- cgit v1.2.3 From 590d8320992d74e54e2c095c68c49abc2b23dcbe Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 7 Sep 2018 10:04:11 -0700 Subject: runsc: Dup debug log file to stderr, so sentry panics don't get lost. Docker and containerd do not expose runsc's stderr, so tracking down sentry panics can be painful. If we have a debug log file, we should send panics (and all stderr data) to the log file. PiperOrigin-RevId: 211992321 Change-Id: I5f0d2f45f35c110a38dab86bafc695aaba42f7a3 --- runsc/main.go | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'runsc') diff --git a/runsc/main.go b/runsc/main.go index 0c9b9af78..c51b199aa 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -179,6 +179,10 @@ func main() { if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") + // Dup f to stderr so we capture stack traces on panic. + if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { + cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) + } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } else if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { @@ -189,6 +193,10 @@ func main() { if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) } + // Dup f to stderr so we capture stack traces on panic. 
+ if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { + cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) + } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } -- cgit v1.2.3 From 210c2520890ea48d551c0c9fffe890a7c60fb802 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 7 Sep 2018 10:15:34 -0700 Subject: runsc: Run sandbox process inside minimal chroot. We construct a dir with the executable bind-mounted at /exe, and proc mounted at /proc. Runsc now executes the sandbox process inside this chroot, thus limiting access to the host filesystem. The mounts and chroot dir are removed when the sandbox is destroyed. Because this requires bind-mounts, we can only do the chroot if we have CAP_SYS_ADMIN. PiperOrigin-RevId: 211994001 Change-Id: Ia71c515e26085e0b69b833e71691830148bc70d1 --- runsc/boot/config.go | 9 +-- runsc/container/fs.go | 30 ++-------- runsc/sandbox/BUILD | 1 + runsc/sandbox/chroot.go | 120 ++++++++++++++++++++++++++++++++++++++++ runsc/sandbox/sandbox.go | 33 +++++++++-- runsc/specutils/namespace.go | 12 ++++ runsc/specutils/specutils.go | 41 ++++++++++++++ runsc/test/testutil/BUILD | 1 - runsc/test/testutil/testutil.go | 26 +++------ 9 files changed, 221 insertions(+), 52 deletions(-) create mode 100644 runsc/sandbox/chroot.go (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 87a47dd0b..28a1600cd 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -214,10 +214,11 @@ type Config struct { // SIGUSR2(12) to troubleshoot hangs. -1 disables it. PanicSignal int - // TestOnlyAllowRunAsCurrentUser should only be used in tests. It - // allows runsc to start the sandbox process as the current user if we - // do not have capability to set uid/gid to another user. - TestOnlyAllowRunAsCurrentUser bool + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in + // tests. It allows runsc to start the sandbox process as the current + // user, and without chrooting the sandbox process. This can be + // necessary in test environments that have limited capabilities. + TestOnlyAllowRunAsCurrentUserWithoutChroot bool } // ToFlags returns a slice of flags that correspond to the given Config. diff --git a/runsc/container/fs.go b/runsc/container/fs.go index fb352fc7c..a3c5772ba 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -77,11 +77,6 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { continue } - src := m.Source - srcfi, err := os.Stat(src) - if err != nil { - return fmt.Errorf("failed to stat() mount source: %v", err) - } // It's possible that 'm.Destination' follows symlinks inside the // container. 
@@ -90,30 +85,13 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { return fmt.Errorf("failed to resolve symlinks: %v", err) } - // Create mount point if it doesn't exits - if _, err := os.Stat(dst); os.IsNotExist(err) { - if srcfi.IsDir() { - if err := os.MkdirAll(dst, 0755); err != nil { - return fmt.Errorf("failed to make mount directory %q: %v", dst, err) - } - } else { - if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil { - return fmt.Errorf("failed to make mount directory for file %q: %v", filepath.Dir(dst), err) - } - f, err := os.OpenFile(dst, os.O_CREATE, 0755) - if err != nil { - return fmt.Errorf("failed to open mount file %q: %v", dst, err) - } - f.Close() - } - } - flags := optionsToFlags(m.Options) flags |= syscall.MS_BIND - log.Infof("Mounting src: %q, dst: %q, flags: %#x", src, dst, flags) - if err := syscall.Mount(src, dst, m.Type, uintptr(flags), ""); err != nil { - return fmt.Errorf("failed to mount src: %q, dst: %q, flags: %#x, err: %v", src, dst, flags, err) + log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) + if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { + return fmt.Errorf("failed to mount %v: %v", m, err) } + // Make the mount a slave, so that for recursive bind mount, umount won't // propagate to the source. flags = syscall.MS_SLAVE | syscall.MS_REC diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 9317b1c14..8ebd14c4e 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", srcs = [ + "chroot.go", "network.go", "sandbox.go", ], diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go new file mode 100644 index 000000000..a77a186c2 --- /dev/null +++ b/runsc/sandbox/chroot.go @@ -0,0 +1,120 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// chrootBinPath is the location inside the chroot where the runsc binary will +// be mounted. +const chrootBinPath = "/runsc" + +// mountInChroot creates the destination mount point in the given chroot and +// mounts the source. +func mountInChroot(chroot, src, dst, typ string, flags uint32) error { + chrootDst := filepath.Join(chroot, dst) + log.Infof("Mounting %q at %q", src, chrootDst) + + return specutils.Mount(src, chrootDst, typ, flags) +} + +// setUpChroot creates an empty directory with runsc mounted at /runsc, proc +// mounted at /proc, and any dev files needed for the platform. +func setUpChroot(platform boot.PlatformType) (string, error) { + // Create the chroot directory and make it accessible to all users. 
+ chroot, err := ioutil.TempDir("", "runsc-sandbox-chroot-") + if err != nil { + return "", fmt.Errorf("TempDir() failed: %v", err) + } + if err := os.Chmod(chroot, 0777); err != nil { + return "", fmt.Errorf("Chmod(%q) failed: %v", chroot, err) + } + log.Infof("Setting up sandbox chroot in %q", chroot) + + // Mount /proc. + if err := mountInChroot(chroot, "proc", "/proc", "proc", 0); err != nil { + return "", fmt.Errorf("error mounting proc in chroot: %v", err) + } + + // Mount runsc at /runsc in the chroot. + binPath, err := specutils.BinPath() + if err != nil { + return "", err + } + if err := mountInChroot(chroot, binPath, chrootBinPath, "bind", syscall.MS_BIND|syscall.MS_RDONLY); err != nil { + return "", fmt.Errorf("error mounting runsc in chroot: %v", err) + } + + // Mount dev files needed for platform. + var devMount string + switch platform { + case boot.PlatformKVM: + devMount = "/dev/kvm" + } + if devMount != "" { + if err := mountInChroot(chroot, devMount, devMount, "bind", syscall.MS_BIND); err != nil { + return "", fmt.Errorf("error mounting platform device in chroot: %v", err) + } + } + + return chroot, nil +} + +// tearDownChroot unmounts /proc and /runsc from the chroot before deleting the +// directory. +func tearDownChroot(chroot string) error { + // Unmount /proc. + proc := filepath.Join(chroot, "proc") + if err := syscall.Unmount(proc, 0); err != nil { + return fmt.Errorf("error unmounting %q: %v", proc, err) + } + + // Unmount /runsc. + exe := filepath.Join(chroot, chrootBinPath) + if err := syscall.Unmount(exe, 0); err != nil { + return fmt.Errorf("error unmounting %q: %v", exe, err) + } + + // Unmount platform dev files. + devFiles := []string{"dev/kvm"} + for _, f := range devFiles { + devPath := filepath.Join(chroot, f) + if _, err := os.Stat(devPath); err != nil { + if os.IsNotExist(err) { + continue + } + return fmt.Errorf("Stat(%q) failed: %v", devPath, err) + } + if err := syscall.Unmount(devPath, 0); err != nil { + return fmt.Errorf("error unmounting %q: %v", devPath, err) + } + } + + // Remove chroot directory. + if err := os.RemoveAll(chroot); err != nil { + return fmt.Errorf("error removing %q: %v", chroot, err) + } + + return nil +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index dd5a0aa56..f6264d5b2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -51,6 +51,10 @@ type Sandbox struct { // Pid is the pid of the running sandbox (immutable). May be 0 is the sandbox // is not running. Pid int `json:"pid"` + + // Chroot is the path to the chroot directory that the sandbox process + // is running in. + Chroot string `json:"chroot"` } // Create creates the sandbox process. @@ -392,12 +396,11 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) - if conf.TestOnlyAllowRunAsCurrentUser { + // If we have CAP_SETUID and CAP_SETGID, then we can also run + // as user nobody. + if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) } else if specutils.CanSetUIDGID() { - // If we have CAP_SETUID and CAP_SETGID, then we can also run - // as user nobody. - // Map nobody in the new namespace to nobody in the parent namespace. 
const nobody = 65534 cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{{ @@ -419,6 +422,23 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } else { return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") } + + // If we have CAP_SYS_ADMIN, we can create an empty chroot and + // bind-mount the executable inside it. + if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") + } else if specutils.HasCapSysAdmin() { + log.Infof("Sandbox will be started in minimal chroot") + chroot, err := setUpChroot(conf.Platform) + if err != nil { + return fmt.Errorf("error setting up chroot: %v", err) + } + cmd.SysProcAttr.Chroot = chroot + cmd.Args[0] = "/runsc" + cmd.Path = "/runsc" + } else { + return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") + } } // Log the fds we are donating to the sandbox process. @@ -525,6 +545,11 @@ func (s *Sandbox) Destroy() error { log.Debugf("Killing sandbox %q", s.ID) signalProcess(s.Pid, unix.SIGKILL) } + + if s.Chroot != "" { + return tearDownChroot(s.Chroot) + } + return nil } diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 356943a65..48a199a77 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -216,3 +216,15 @@ func CanSetUIDGID() bool { return caps.Get(capability.EFFECTIVE, capability.CAP_SETUID) && caps.Get(capability.EFFECTIVE, capability.CAP_SETGID) } + +// HasCapSysAdmin returns true if the user has CAP_SYS_ADMIN capability. +func HasCapSysAdmin() bool { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + return false + } + if err := caps.Load(); err != nil { + return false + } + return caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) +} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 551718e9a..f3fa8d129 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -363,3 +363,44 @@ func DebugLogFile(logDir, subcommand string) (*os.File, error) { filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), subcommand) return os.OpenFile(filepath.Join(logDir, filename), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) } + +// Mount creates the mount point and calls Mount with the given flags. +func Mount(src, dst, typ string, flags uint32) error { + // Create the mount point inside. The type must be the same as the + // source (file or directory). + var isDir bool + if typ == "proc" { + // Special case, as there is no source directory for proc + // mounts. + isDir = true + } else if fi, err := os.Stat(src); err != nil { + return fmt.Errorf("Stat(%q) failed: %v", src, err) + } else { + isDir = fi.IsDir() + } + + if isDir { + // Create the destination directory. + if err := os.MkdirAll(dst, 0777); err != nil { + return fmt.Errorf("Mkdir(%q) failed: %v", dst, err) + } + } else { + // Create the parent destination directory. + parent := path.Dir(dst) + if err := os.MkdirAll(parent, 0777); err != nil { + return fmt.Errorf("Mkdir(%q) failed: %v", parent, err) + } + // Create the destination file if it does not exist. + f, err := os.OpenFile(dst, syscall.O_CREAT, 0777) + if err != nil { + return fmt.Errorf("Open(%q) failed: %v", dst, err) + } + f.Close() + } + + // Do the mount. 
+ if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil { + return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err) + } + return nil +} diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index ca91e07ff..03ab3c4ac 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -18,6 +18,5 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", - "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 77bd56912..4f012a8ea 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -32,7 +32,6 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -104,14 +103,14 @@ func FindFile(path string) (string, error) { // TestConfig return the default configuration to use in tests. func TestConfig() *boot.Config { return &boot.Config{ - Debug: true, - LogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - Strace: true, - MultiContainer: true, - FileAccess: boot.FileAccessProxyExclusive, - TestOnlyAllowRunAsCurrentUser: true, + Debug: true, + LogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + Strace: true, + MultiContainer: true, + FileAccess: boot.FileAccessProxyExclusive, + TestOnlyAllowRunAsCurrentUserWithoutChroot: true, } } @@ -238,14 +237,7 @@ func WaitForHTTP(port int, timeout time.Duration) error { // RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create // a new user namespace and reexecute the test as root inside of the namespace. func RunAsRoot(m *testing.M) { - caps, err := capability.NewPid2(os.Getpid()) - if err != nil { - panic(err.Error()) - } - if err := caps.Load(); err != nil { - panic(err.Error()) - } - if caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) { + if specutils.HasCapSysAdmin() { // Capability: check! Good to run. 
os.Exit(m.Run()) } -- cgit v1.2.3 From f895cb4d8b4b37a563b7a5b9dc92eae552084b44 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 7 Sep 2018 10:44:50 -0700 Subject: Use root abstract socket namespace for exec PiperOrigin-RevId: 211999211 Change-Id: I5968dd1a8313d3e49bb6e6614e130107495de41d --- pkg/sentry/control/proc.go | 23 ++++++++------- pkg/sentry/kernel/kernel.go | 41 +++++++++++++++++--------- runsc/boot/loader.go | 72 +++++++++++++++++++-------------------------- 3 files changed, 70 insertions(+), 66 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 4848a5d2b..6949a3ae5 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -95,17 +95,18 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { proc.Kernel.RootUserNamespace()) initArgs := kernel.CreateProcessArgs{ - Filename: args.Filename, - Argv: args.Argv, - Envv: args.Envv, - WorkingDirectory: args.WorkingDirectory, - Credentials: creds, - FDMap: fdm, - Umask: 0022, - Limits: l, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - UTSNamespace: proc.Kernel.RootUTSNamespace(), - IPCNamespace: proc.Kernel.RootIPCNamespace(), + Filename: args.Filename, + Argv: args.Argv, + Envv: args.Envv, + WorkingDirectory: args.WorkingDirectory, + Credentials: creds, + FDMap: fdm, + Umask: 0022, + Limits: l, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: proc.Kernel.RootUTSNamespace(), + IPCNamespace: proc.Kernel.RootIPCNamespace(), + AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), } ctx := initArgs.NewContext(proc.Kernel) mounter := fs.FileOwnerFromContext(ctx) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 31a2f068d..bc41c3963 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -90,17 +90,18 @@ type Kernel struct { platform.Platform `state:"nosave"` // See InitKernelArgs for the meaning of these fields. - featureSet *cpuid.FeatureSet - timekeeper *Timekeeper - tasks *TaskSet - rootUserNamespace *auth.UserNamespace - networkStack inet.Stack `state:"nosave"` - applicationCores uint - useHostCores bool - extraAuxv []arch.AuxEntry - vdso *loader.VDSO - rootUTSNamespace *UTSNamespace - rootIPCNamespace *IPCNamespace + featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + networkStack inet.Stack `state:"nosave"` + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + rootAbstractSocketNamespace *AbstractSocketNamespace // mounts holds the state of the virtual filesystem. mounts is initially // nil, and must be set by calling Kernel.SetRootMountNamespace before @@ -201,11 +202,14 @@ type InitKernelArgs struct { // Vdso holds the VDSO and its parameter page. Vdso *loader.VDSO - // RootUTSNamespace is the root UTS namepsace. + // RootUTSNamespace is the root UTS namespace. RootUTSNamespace *UTSNamespace - // RootIPCNamespace is the root IPC namepsace. + // RootIPCNamespace is the root IPC namespace. RootIPCNamespace *IPCNamespace + + // RootAbstractSocketNamespace is the root Abstract Socket namespace. + RootAbstractSocketNamespace *AbstractSocketNamespace } // Init initialize the Kernel with no tasks. 
@@ -231,6 +235,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace + k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace k.networkStack = args.NetworkStack k.applicationCores = args.ApplicationCores if args.UseHostCores { @@ -509,6 +514,9 @@ type CreateProcessArgs struct { // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace + // AbstractSocketNamespace is the initial Abstract Socket namespace. + AbstractSocketNamespace *AbstractSocketNamespace + // Root optionally contains the dirent that serves as the root for the // process. If nil, the mount namespace's root is used as the process' // root. @@ -651,7 +659,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, - AbstractSocketNamespace: NewAbstractSocketNamespace(), // FIXME + AbstractSocketNamespace: args.AbstractSocketNamespace, } t, err := k.tasks.NewTask(config) if err != nil { @@ -839,6 +847,11 @@ func (k *Kernel) RootIPCNamespace() *IPCNamespace { return k.rootIPCNamespace } +// RootAbstractSocketNamespace returns the root AbstractSocketNamespace. +func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { + return k.rootAbstractSocketNamespace +} + // RootMountNamespace returns the MountNamespace. func (k *Kernel) RootMountNamespace() *fs.MountNamespace { k.extMu.Lock() diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ae2226e12..540cd6188 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -143,6 +143,19 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } tk.SetClocks(time.NewCalibratedClocks()) + if err := enableStrace(conf); err != nil { + return nil, fmt.Errorf("failed to enable strace: %v", err) + } + + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). + networkStack, err := newEmptyNetworkStack(conf, k) + if err != nil { + return nil, fmt.Errorf("failed to create network: %v", err) + } + // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -163,26 +176,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console caps, auth.NewRootUserNamespace()) - // Create user namespace. - // TODO: Not clear what domain name should be here. It is - // not configurable from runtime spec. - utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - - ipcns := kernel.NewIPCNamespace(creds.UserNamespace) - - if err := enableStrace(conf); err != nil { - return nil, fmt.Errorf("failed to enable strace: %v", err) - } - - // Create an empty network stack because the network namespace may be empty at - // this point. Netns is configured before Run() is called. Netstack is - // configured using a control uRPC message. Host network is configured inside - // Run(). - networkStack, err := newEmptyNetworkStack(conf, k) - if err != nil { - return nil, fmt.Errorf("failed to create network: %v", err) - } - // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. 
if err = k.Init(kernel.InitKernelArgs{ @@ -191,10 +184,11 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console RootUserNamespace: creds.UserNamespace, NetworkStack: networkStack, // TODO: use number of logical processors from cgroups. - ApplicationCores: uint(runtime.NumCPU()), - Vdso: vdso, - RootUTSNamespace: utsns, - RootIPCNamespace: ipcns, + ApplicationCores: uint(runtime.NumCPU()), + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), }); err != nil { return nil, fmt.Errorf("error initializing kernel: %v", err) } @@ -244,7 +238,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) } - procArgs, err := newProcess(spec, creds, utsns, ipcns, k) + procArgs, err := newProcess(spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -265,7 +259,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -274,15 +268,16 @@ func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSName // Create the process arguments. procArgs := kernel.CreateProcessArgs{ - Argv: spec.Process.Args, - Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. - Credentials: creds, - Umask: 0022, - Limits: ls, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - UTSNamespace: utsns, - IPCNamespace: ipcns, + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. + Credentials: creds, + Umask: 0022, + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: k.RootUTSNamespace(), + IPCNamespace: k.RootIPCNamespace(), + AbstractSocketNamespace: k.RootAbstractSocketNamespace(), } return procArgs, nil } @@ -421,12 +416,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // TODO New containers should be started in new PID namespaces // when indicated by the spec. - procArgs, err := newProcess( - spec, - creds, - l.k.RootUTSNamespace(), - l.k.RootIPCNamespace(), - l.k) + procArgs, err := newProcess(spec, creds, l.k) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } -- cgit v1.2.3 From bc81f3fe4a042a15343d2eab44da32d818ac1ade Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 7 Sep 2018 12:27:44 -0700 Subject: Remove '--file-access=direct' option It was used before gofer was implemented and it's not supported anymore. BREAKING CHANGE: proxy-shared and proxy-exclusive options are now: shared and exclusive. 
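A minimal standalone sketch (not part of this change; the main wrapper is illustrative only, and it assumes the gvisor.googlesource.com/gvisor/runsc/boot import path used in this tree) showing how the renamed values are parsed, and that the old proxy-* names are now rejected by boot.MakeFileAccessType:

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/runsc/boot"
)

func main() {
	for _, s := range []string{"exclusive", "shared", "proxy-exclusive"} {
		// After this change MakeFileAccessType accepts only "shared" and
		// "exclusive"; anything else returns an error.
		fa, err := boot.MakeFileAccessType(s)
		if err != nil {
			fmt.Printf("%-15s -> error: %v\n", s, err)
			continue
		}
		fmt.Printf("%-15s -> %v\n", s, fa)
	}
}

Callers that passed --file-access=proxy-exclusive or --file-access=proxy-shared need to switch to the new names; the default stays the exclusive (cached) mode.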
PiperOrigin-RevId: 212017643 Change-Id: If029d4073fe60583e5ca25f98abb2953de0d78fd --- runsc/boot/BUILD | 3 ++ runsc/boot/config.go | 31 +++++------ runsc/boot/filter/config.go | 39 -------------- runsc/boot/filter/filter.go | 5 -- runsc/boot/fs.go | 19 ++----- runsc/boot/loader.go | 1 - runsc/boot/loader_test.go | 106 ++++++++++++++++++++------------------ runsc/cmd/boot.go | 7 --- runsc/container/container.go | 5 -- runsc/container/container_test.go | 6 +-- runsc/main.go | 6 +-- runsc/sandbox/sandbox.go | 12 ++--- runsc/test/testutil/testutil.go | 2 +- 13 files changed, 85 insertions(+), 157 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index e96722069..a38a3a94e 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -85,8 +85,11 @@ go_test( deps = [ "//pkg/control/server", "//pkg/log", + "//pkg/p9", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", + "//pkg/unet", + "//runsc/fsgofer", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 28a1600cd..01da535af 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -60,28 +60,23 @@ func (p PlatformType) String() string { type FileAccessType int const ( - // FileAccessProxy sends IO requests to a Gofer process that validates the + // FileAccessShared sends IO requests to a Gofer process that validates the // requests and forwards them to the host. - FileAccessProxy FileAccessType = iota + FileAccessShared FileAccessType = iota - // FileAccessProxyExclusive is the same as FileAccessProxy, but enables + // FileAccessExclusive is the same as FileAccessShared, but enables // extra caching for improved performance. It should only be used if // the sandbox has exclusive access to the filesystem. - FileAccessProxyExclusive - - // FileAccessDirect connects the sandbox directly to the host filesystem. - FileAccessDirect + FileAccessExclusive ) // MakeFileAccessType converts type from string. func MakeFileAccessType(s string) (FileAccessType, error) { switch s { - case "proxy-shared": - return FileAccessProxy, nil - case "proxy-exclusive": - return FileAccessProxyExclusive, nil - case "direct": - return FileAccessDirect, nil + case "shared": + return FileAccessShared, nil + case "exclusive": + return FileAccessExclusive, nil default: return 0, fmt.Errorf("invalid file access type %q", s) } @@ -89,12 +84,10 @@ func MakeFileAccessType(s string) (FileAccessType, error) { func (f FileAccessType) String() string { switch f { - case FileAccessProxy: - return "proxy-shared" - case FileAccessProxyExclusive: - return "proxy-exclusive" - case FileAccessDirect: - return "direct" + case FileAccessShared: + return "shared" + case FileAccessExclusive: + return "exclusive" default: return fmt.Sprintf("unknown(%d)", f) } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index f864b1f45..1a0c426ab 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -264,45 +264,6 @@ var allowedSyscalls = seccomp.SyscallRules{ }, } -// whitelistFSFilters returns syscalls made by whitelistFS. Using WhitelistFS -// is less secure because it runs inside the Sentry and must be able to perform -// file operations that would otherwise be disabled by seccomp when a Gofer is -// used. When whitelistFS is not used, openning new FD in the Sentry is -// disallowed. 
-func whitelistFSFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_ACCESS: {}, - syscall.SYS_FCHMOD: {}, - syscall.SYS_FSTAT: {}, - syscall.SYS_FSYNC: {}, - syscall.SYS_FTRUNCATE: {}, - syscall.SYS_GETCWD: {}, - syscall.SYS_GETDENTS: {}, - syscall.SYS_GETDENTS64: {}, - syscall.SYS_LSEEK: {}, - syscall.SYS_LSTAT: {}, - syscall.SYS_MKDIR: {}, - syscall.SYS_MKDIRAT: {}, - syscall.SYS_NEWFSTATAT: {}, - syscall.SYS_OPEN: {}, - syscall.SYS_OPENAT: {}, - syscall.SYS_PREAD64: {}, - syscall.SYS_PWRITE64: {}, - syscall.SYS_READ: {}, - syscall.SYS_READLINK: {}, - syscall.SYS_READLINKAT: {}, - syscall.SYS_RENAMEAT: {}, - syscall.SYS_STAT: {}, - syscall.SYS_SYMLINK: {}, - syscall.SYS_SYMLINKAT: {}, - syscall.SYS_SYNC_FILE_RANGE: {}, - syscall.SYS_UNLINK: {}, - syscall.SYS_UNLINKAT: {}, - syscall.SYS_UTIMENSAT: {}, - syscall.SYS_WRITE: {}, - } -} - // hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. func hostInetFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index 56d30f2a0..b656883ad 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -30,7 +30,6 @@ import ( // Options are seccomp filter related options. type Options struct { Platform platform.Platform - WhitelistFS bool HostNetwork bool ControllerFD int } @@ -44,10 +43,6 @@ func Install(opt Options) error { // when not enabled. s.Merge(instrumentationFilters()) - if opt.WhitelistFS { - Report("direct file access allows unrestricted file access!") - s.Merge(whitelistFSFilters()) - } if opt.HostNetwork { Report("host networking enabled: syscall filters less restrictive!") s.Merge(hostInetFilters()) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 772df40fe..3df276170 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -204,7 +204,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f ) switch conf.FileAccess { - case FileAccessProxy, FileAccessProxyExclusive: + case FileAccessShared, FileAccessExclusive: fd := fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) hostFS := mustFindFilesystem("9p") @@ -214,13 +214,6 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f return nil, fmt.Errorf("failed to generate root mount point: %v", err) } - case FileAccessDirect: - hostFS := mustFindFilesystem("whitelistfs") - rootInode, err = hostFS.Mount(ctx, rootDevice, mf, "root="+spec.Root.Path+",dont_translate_ownership=true") - if err != nil { - return nil, fmt.Errorf("failed to generate root mount point: %v", err) - } - default: return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess) } @@ -289,13 +282,10 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri case bind: switch conf.FileAccess { - case FileAccessProxy, FileAccessProxyExclusive: + case FileAccessShared, FileAccessExclusive: fd := fds.remove() fsName = "9p" opts = p9MountOptions(conf, fd) - case FileAccessDirect: - fsName = "whitelistfs" - opts = []string{"root=" + m.Source, "dont_translate_ownership=true"} default: err = fmt.Errorf("invalid file access type: %v", conf.FileAccess) } @@ -423,7 +413,7 @@ func p9MountOptions(conf *Config, fd int) []string { "wfdno=" + strconv.Itoa(fd), "privateunixsocket=true", } - if conf.FileAccess == FileAccessProxy { + if conf.FileAccess == FileAccessShared { opts = append(opts, "cache=remote_revalidating") } return opts @@ -503,9 +493,6 @@ func addRestoreMount(conf *Config, 
renv *fs.RestoreEnvironment, m specs.Mount, f // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts // to the environment. func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) { - if conf.FileAccess == FileAccessDirect { - return nil, fmt.Errorf("host filesystem with whitelist not supported with S/R") - } renv := &fs.RestoreEnvironment{ MountSources: make(map[string][]fs.MountArgs), } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 540cd6188..5fb489766 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -341,7 +341,6 @@ func (l *Loader) run() error { } else { opts := filter.Options{ Platform: l.k.Platform, - WhitelistFS: l.conf.FileAccess == FileAccessDirect, HostNetwork: l.conf.Network == NetworkHost, ControllerFD: l.ctrl.srv.FD(), } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 2396d52c8..d6bfe9ff1 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -20,14 +20,18 @@ import ( "os" "reflect" "sync" + "syscall" "testing" "time" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/fsgofer" ) func init() { @@ -39,7 +43,6 @@ func testConfig() *Config { return &Config{ RootDir: "unused_root_dir", Network: NetworkNone, - FileAccess: FileAccessDirect, DisableSeccomp: true, } } @@ -58,23 +61,62 @@ func testSpec() *specs.Spec { } } -func createLoader() (*Loader, error) { +// startGofer starts a new gofer routine serving 'root' path. It returns the +// sandbox side of the connection, and a function that when called will stop the +// gofer. +func startGofer(root string) (int, func(), error) { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return 0, nil, err + } + sandboxEnd, goferEnd := fds[0], fds[1] + + socket, err := unet.NewSocket(goferEnd) + if err != nil { + syscall.Close(sandboxEnd) + syscall.Close(goferEnd) + return 0, nil, fmt.Errorf("error creating server on FD %d: %v", goferEnd, err) + } + go func() { + at := fsgofer.NewAttachPoint(root, fsgofer.Config{ROMount: true}) + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + log.Infof("Gofer is stopping. FD: %d, err: %v\n", goferEnd, err) + } + }() + // Closing the gofer FD will stop the gofer and exit goroutine above. + return sandboxEnd, func() { syscall.Close(goferEnd) }, nil +} + +func createLoader() (*Loader, func(), error) { fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) if err != nil { - return nil, err + return nil, nil, err } conf := testConfig() spec := testSpec() - return New(spec, conf, fd, nil, false) + + sandEnd, cleanup, err := startGofer(spec.Root.Path) + if err != nil { + return nil, nil, err + } + + l, err := New(spec, conf, fd, []int{sandEnd}, false) + if err != nil { + cleanup() + return nil, nil, err + } + return l, cleanup, nil } // TestRun runs a simple application in a sandbox and checks that it succeeds. 
func TestRun(t *testing.T) { - s, err := createLoader() + s, cleanup, err := createLoader() if err != nil { t.Fatalf("error creating loader: %v", err) } defer s.Destroy() + defer cleanup() // Start a goroutine to read the start chan result, otherwise Run will // block forever. @@ -106,11 +148,12 @@ func TestRun(t *testing.T) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. func TestStartSignal(t *testing.T) { - s, err := createLoader() + s, cleanup, err := createLoader() if err != nil { t.Fatalf("error creating loader: %v", err) } defer s.Destroy() + defer cleanup() // We aren't going to wait on this application, so the control server // needs to be shut down manually. @@ -330,7 +373,14 @@ func TestCreateMountNamespace(t *testing.T) { t.Run(tc.name, func(t *testing.T) { conf := testConfig() ctx := contexttest.Context(t) - mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) + + sandEnd, cleanup, err := startGofer(tc.spec.Root.Path) + if err != nil { + t.Fatalf("failed to create gofer: %v", err) + } + defer cleanup() + + mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, []int{sandEnd}) if err != nil { t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } @@ -352,7 +402,6 @@ func TestRestoreEnvironment(t *testing.T) { testCases := []struct { name string spec *specs.Spec - fileAccess FileAccessType ioFDs []int errorExpected bool expectedRenv fs.RestoreEnvironment @@ -375,7 +424,6 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -430,7 +478,6 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - fileAccess: FileAccessProxy, ioFDs: []int{0, 1}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -489,7 +536,6 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -534,48 +580,10 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - { - name: "whitelist error test", - spec: &specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, - }, - Mounts: []specs.Mount{ - { - Destination: "/dev/fd-foo", - Type: "bind", - }, - }, - }, - fileAccess: FileAccessDirect, - ioFDs: []int{0, 1}, - errorExpected: true, - }, - { - name: "bad options test", - spec: &specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, - }, - Mounts: []specs.Mount{ - { - Destination: "/dev/fd-foo", - Type: "tmpfs", - Options: []string{"invalid_option=true"}, - }, - }, - }, - fileAccess: FileAccessDirect, - ioFDs: []int{0}, - errorExpected: true, - }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { conf := testConfig() - conf.FileAccess = tc.fileAccess fds := &fdDispenser{fds: tc.ioFDs} actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds) if !tc.errorExpected && err != nil { diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 784baf23b..d8c7b9cd3 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -102,13 +102,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) - // sentry should run with a umask of 0 when --file-access=direct, because we want - // to preserve file modes exactly as set by the sentry, which will have applied - // its own umask. 
- if conf.FileAccess == boot.FileAccessDirect { - syscall.Umask(0) - } - if b.applyCaps { caps := spec.Process.Capabilities if caps == nil { diff --git a/runsc/container/container.go b/runsc/container/container.go index 5977fbd21..9a05a1dc5 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -615,11 +615,6 @@ func (c *Container) waitForStopped() error { } func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { - if conf.FileAccess == boot.FileAccessDirect { - // Don't start a gofer. The sandbox will access host FS directly. - return nil, nil - } - if err := setupFS(spec, conf, bundleDir); err != nil { return nil, fmt.Errorf("failed to setup mounts: %v", err) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9a94347b6..c45eb79a3 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -241,7 +241,7 @@ func configs(opts ...configOption) []*boot.Config { } c.Platform = boot.PlatformKVM case nonExclusiveFS: - c.FileAccess = boot.FileAccessProxy + c.FileAccess = boot.FileAccessShared default: panic(fmt.Sprintf("unknown config option %v", o)) @@ -1368,10 +1368,10 @@ func TestAbbreviatedIDs(t *testing.T) { // Check that modifications to a volume mount are propigated into and out of // the sandbox. func TestContainerVolumeContentsShared(t *testing.T) { - // Only run this test with shared proxy, since that is the only + // Only run this test with shared file access, since that is the only // behavior it is testing. conf := testutil.TestConfig() - conf.FileAccess = boot.FileAccessProxy + conf.FileAccess = boot.FileAccessShared t.Logf("Running test with conf: %+v", conf) // Main process just sleeps. We will use "exec" to probe the state of diff --git a/runsc/main.go b/runsc/main.go index c51b199aa..c30b29b81 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -58,7 +58,7 @@ var ( // Flags that control sandbox runtime behavior. platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - fileAccess = flag.String("file-access", "proxy-exclusive", "specifies which filesystem to use: proxy-exclusive (default), proxy-shared, or direct. Using a proxy is more secure because it disallows the sandbox from opening files directly in the host. Setting 'proxy-shared' will disable caches and should be used if external modifications to the filesystem are expected.") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use: exclusive (default), shared. Setting 'shared' will disable caches and should be used if external modifications to the filesystem are expected.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. 
All modifications are stored in memory inside the sandbox.") multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") @@ -112,8 +112,8 @@ func main() { cmd.Fatalf("%v", err) } - if fsAccess == boot.FileAccessProxy && *overlay { - cmd.Fatalf("overlay flag is incompatible with proxy-shared file access") + if fsAccess == boot.FileAccessShared && *overlay { + cmd.Fatalf("overlay flag is incompatible with shared file access") } netType, err := boot.MakeNetworkType(*network) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f6264d5b2..697210669 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -356,12 +356,8 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) } - if conf.FileAccess == boot.FileAccessDirect { - log.Infof("Sandbox will be started in the current mount namespace") - } else { - log.Infof("Sandbox will be started in new mount namespace") - nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) - } + log.Infof("Sandbox will be started in new mount namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the @@ -377,9 +373,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // User namespace depends on the following options: // - Host network/filesystem: requires to run inside the user namespace // specified in the spec or the current namespace if none is configured. - // - Gofer: when using a Gofer, the sandbox process can run isolated in a - // new user namespace with only the "nobody" user and group. - if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { + if conf.Network == boot.NetworkHost { if userns, ok := specutils.GetNS(specs.UserNamespace, spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) nss = append(nss, userns) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4f012a8ea..4d354de31 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -109,7 +109,7 @@ func TestConfig() *boot.Config { Network: boot.NetworkNone, Strace: true, MultiContainer: true, - FileAccess: boot.FileAccessProxyExclusive, + FileAccess: boot.FileAccessExclusive, TestOnlyAllowRunAsCurrentUserWithoutChroot: true, } } -- cgit v1.2.3 From 8ce3fbf9f87677ac34c577be9fb9b395ede8e714 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 7 Sep 2018 13:38:12 -0700 Subject: Only start signal forwarding after init process is created PiperOrigin-RevId: 212028121 Change-Id: If9c2c62f3be103e2bb556b8d154c169888e34369 --- runsc/boot/loader.go | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 5fb489766..994b3d2e2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -84,6 +84,10 @@ type Loader struct { // spec is the base configuration for the root container. spec *specs.Spec + // startSignalForwarding enables forwarding of signals to the sandboxed + // container. It should be called after the init process is loaded. 
+ startSignalForwarding func() func() + // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -226,7 +230,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // Ensure that signals received are forwarded to the emulated kernel. ps := syscall.Signal(conf.PanicSignal) - stopSignalForwarding := sighandling.PrepareForwarding(k, ps)() + startSignalForwarding := sighandling.PrepareForwarding(k, ps) if conf.PanicSignal != -1 { // Panics if the sentry receives 'conf.PanicSignal'. panicChan := make(chan os.Signal, 1) @@ -244,15 +248,15 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } l := &Loader{ - k: k, - ctrl: ctrl, - conf: conf, - console: console, - watchdog: watchdog, - ioFDs: ioFDs, - spec: spec, - stopSignalForwarding: stopSignalForwarding, - rootProcArgs: procArgs, + k: k, + ctrl: ctrl, + conf: conf, + console: console, + watchdog: watchdog, + ioFDs: ioFDs, + spec: spec, + startSignalForwarding: startSignalForwarding, + rootProcArgs: procArgs, } ctrl.manager.l = l return l, nil @@ -291,7 +295,9 @@ func (l *Loader) Destroy() { if l.ctrl != nil { l.ctrl.srv.Stop() } - l.stopSignalForwarding() + if l.stopSignalForwarding != nil { + l.stopSignalForwarding() + } l.watchdog.Stop() } @@ -380,6 +386,9 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } + // Start signal forwarding only after an init process is created. + l.stopSignalForwarding = l.startSignalForwarding() + log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() -- cgit v1.2.3 From 6cfb5cd56d4660cc0de6cd991a7ed4601824a7e6 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 7 Sep 2018 16:52:02 -0700 Subject: Add additional sanity checks for walk. PiperOrigin-RevId: 212058684 Change-Id: I319709b9ffcfccb3231bac98df345d2a20eca24b --- pkg/p9/file.go | 7 +- pkg/p9/handlers.go | 221 +++++++++++++++++++++++++++--------- pkg/p9/local_server/local_server.go | 4 +- pkg/p9/p9test/client_test.go | 71 +++++++++--- pkg/p9/p9test/mocks.go | 13 +-- pkg/p9/server.go | 3 + runsc/fsgofer/fsgofer.go | 23 ++-- runsc/fsgofer/fsgofer_test.go | 46 ++------ 8 files changed, 257 insertions(+), 131 deletions(-) (limited to 'runsc') diff --git a/pkg/p9/file.go b/pkg/p9/file.go index ae726f0b9..9723fa24d 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -20,10 +20,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/fd" ) -// Attacher is provided by the user. +// Attacher is provided by the server. type Attacher interface { // Attach returns a new File. - Attach(attachName string) (File, error) + // + // The client-side attach will be translate to a series of walks from + // the file returned by this Attach call. + Attach() (File, error) } // File is a set of operations corresponding to a single node. diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 7da9eff5f..ea41f97c7 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -17,6 +17,7 @@ package p9 import ( "io" "os" + "path" "strings" "sync/atomic" "syscall" @@ -93,39 +94,6 @@ func isSafeName(name string) bool { return name != "" && !strings.Contains(name, "/") && name != "." && name != ".." } -// handle implements handler.handle. -func (t *Twalk) handle(cs *connState) message { - // Check the names. - for _, name := range t.Names { - if !isSafeName(name) { - return newErr(syscall.EINVAL) - } - } - - // Lookup the FID. 
- ref, ok := cs.LookupFID(t.FID) - if !ok { - return newErr(syscall.EBADF) - } - defer ref.DecRef() - - // Has it been opened already? - if _, opened := ref.OpenFlags(); opened { - return newErr(syscall.EBUSY) - } - - // Do the walk. - qids, sf, err := ref.file.Walk(t.Names) - if err != nil { - return newErr(err) - } - - // Install the new FID. - cs.InsertFID(t.NewFID, &fidRef{file: sf}) - - return &Rwalk{QIDs: qids} -} - // handle implements handler.handle. func (t *Tclunk) handle(cs *connState) message { if !cs.DeleteFID(t.FID) { @@ -175,14 +143,57 @@ func (t *Tattach) handle(cs *connState) message { return newErr(syscall.EINVAL) } - // Do the attach. - sf, err := cs.server.attacher.Attach(t.Auth.AttachName) + // Must provide an absolute path. + if path.IsAbs(t.Auth.AttachName) { + // Trim off the leading / if the path is absolute. We always + // treat attach paths as absolute and call attach with the root + // argument on the server file for clarity. + t.Auth.AttachName = t.Auth.AttachName[1:] + } + + // Do the attach on the root. + sf, err := cs.server.attacher.Attach() if err != nil { return newErr(err) } - cs.InsertFID(t.FID, &fidRef{file: sf}) + _, valid, attr, err := sf.GetAttr(AttrMaskAll()) + if err != nil { + sf.Close() // Drop file. + return newErr(err) + } + if !valid.Mode { + sf.Close() // Drop file. + return newErr(syscall.EINVAL) + } + + // Build a transient reference. + root := &fidRef{ + file: sf, + refs: 1, + walkable: attr.Mode.IsDir(), + } + defer root.DecRef() + + // Attach the root? + if len(t.Auth.AttachName) == 0 { + cs.InsertFID(t.FID, root) + return &Rattach{} + } + + // We want the same traversal checks to apply on attach, so always + // attach at the root and use the regular walk paths. + names := strings.Split(t.Auth.AttachName, "/") + _, target, _, attr, err := doWalk(cs, root, names) + if err != nil { + return newErr(err) + } + + // Insert the FID. + cs.InsertFID(t.FID, &fidRef{ + file: target, + walkable: attr.Mode.IsDir(), + }) - // Return an empty QID. return &Rattach{} } @@ -678,15 +689,104 @@ func (t *Tflushf) handle(cs *connState) message { return &Rflushf{} } -// handle implements handler.handle. -func (t *Twalkgetattr) handle(cs *connState) message { +// walkOne walks zero or one path elements. +// +// The slice passed as qids is append and returned. +func walkOne(qids []QID, from File, names []string) ([]QID, File, AttrMask, Attr, error) { + if len(names) > 1 { + // We require exactly zero or one elements. + return nil, nil, AttrMask{}, Attr{}, syscall.EINVAL + } + var localQIDs []QID + localQIDs, sf, valid, attr, err := from.WalkGetAttr(names) + if err == syscall.ENOSYS { + localQIDs, sf, err = from.Walk(names) + if err != nil { + // No way to walk this element. + return nil, nil, AttrMask{}, Attr{}, err + } + // Need a manual getattr. + _, valid, attr, err = sf.GetAttr(AttrMaskAll()) + if err != nil { + // Don't leak the file. + sf.Close() + } + } + if err != nil { + // Error walking, don't return anything. + return nil, nil, AttrMask{}, Attr{}, err + } + if len(localQIDs) != 1 { + // Expected a single QID. + sf.Close() + return nil, nil, AttrMask{}, Attr{}, syscall.EINVAL + } + return append(qids, localQIDs...), sf, valid, attr, nil +} + +// doWalk walks from a given fidRef. +// +// This enforces that all intermediate nodes are walkable (directories). +func doWalk(cs *connState, ref *fidRef, names []string) (qids []QID, sf File, valid AttrMask, attr Attr, err error) { // Check the names. 
- for _, name := range t.Names { + for _, name := range names { if !isSafeName(name) { - return newErr(syscall.EINVAL) + err = syscall.EINVAL + return + } + } + + // Has it been opened already? + if _, opened := ref.OpenFlags(); opened { + err = syscall.EBUSY + return + } + + // Is this an empty list? Handle specially. We don't actually need to + // validate anything since this is always permitted. + if len(names) == 0 { + return walkOne(nil, ref.file, nil) + } + + // Is it walkable? + if !ref.walkable { + err = syscall.EINVAL + return + } + + from := ref.file // Start at the passed ref. + + // Do the walk, one element at a time. + for i := 0; i < len(names); i++ { + qids, sf, valid, attr, err = walkOne(qids, from, names[i:i+1]) + + // Close the intermediate file. Note that we don't close the + // first file because in that case we are walking from the + // existing reference. + if i > 0 { + from.Close() + } + from = sf // Use the new file. + + // Was there an error walking? + if err != nil { + return nil, nil, AttrMask{}, Attr{}, err + } + + // We won't allow beyond past symlinks; stop here if this isn't + // a proper directory and we have additional paths to walk. + if !valid.Mode || (!attr.Mode.IsDir() && i < len(names)-1) { + from.Close() // Not using the file object. + return nil, nil, AttrMask{}, Attr{}, syscall.EINVAL } } + // Success. + return qids, sf, valid, attr, nil +} + +// handle implements handler.handle. +func (t *Twalk) handle(cs *connState) message { // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { @@ -694,26 +794,41 @@ func (t *Twalkgetattr) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - if _, opened := ref.OpenFlags(); opened { - return newErr(syscall.EBUSY) + // Do the walk. + qids, sf, _, attr, err := doWalk(cs, ref, t.Names) + if err != nil { + return newErr(err) } - // Do the walk. - qids, sf, valid, attr, err := ref.file.WalkGetAttr(t.Names) - if err == syscall.ENOSYS { - qids, sf, err = ref.file.Walk(t.Names) - if err != nil { - return newErr(err) - } - _, valid, attr, err = sf.GetAttr(AttrMaskAll()) + // Install the new FID. + cs.InsertFID(t.NewFID, &fidRef{ + file: sf, + walkable: attr.Mode.IsDir(), + }) + + return &Rwalk{QIDs: qids} +} + +// handle implements handler.handle. +func (t *Twalkgetattr) handle(cs *connState) message { + // Lookup the FID. + ref, ok := cs.LookupFID(t.FID) + if !ok { + return newErr(syscall.EBADF) } + defer ref.DecRef() + + // Do the walk. + qids, sf, valid, attr, err := doWalk(cs, ref, t.Names) if err != nil { return newErr(err) } // Install the new FID. - cs.InsertFID(t.NewFID, &fidRef{file: sf}) + cs.InsertFID(t.NewFID, &fidRef{ + file: sf, + walkable: attr.Mode.IsDir(), + }) return &Rwalkgetattr{QIDs: qids, Valid: valid, Attr: attr} } diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index b4db44e27..cef3701a7 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -70,8 +70,8 @@ func (l *local) info() (p9.QID, os.FileInfo, error) { } // Attach implements p9.Attacher.Attach. -func (l *local) Attach(name string) (p9.File, error) { - return &local{path: path.Clean(name)}, nil +func (l *local) Attach() (p9.File, error) { + return &local{path: "/"}, nil } // Walk implements p9.File.Walk. 
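For reference, a minimal standalone sketch (not part of this change; attachSteps is an illustrative helper, not a function in this tree) of how the server-side Tattach handling above decomposes a client attach name: the attach itself always happens at the server root, and any remaining path components are resolved as ordinary one-element walks.

package main

import (
	"fmt"
	"path"
	"strings"
)

// attachSteps mirrors the Tattach handler: trim a leading "/", then split
// what remains into per-component walk names. An empty result means the
// root itself is attached.
func attachSteps(attachName string) []string {
	if path.IsAbs(attachName) {
		attachName = attachName[1:]
	}
	if attachName == "" {
		return nil
	}
	return strings.Split(attachName, "/")
}

func main() {
	fmt.Println(attachSteps("/"))        // [] -> attach the root directly
	fmt.Println(attachSteps("/foo/bar")) // [foo bar] -> walk "foo", then "bar"
}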
diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 8e35d6017..34ddccd8b 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -50,8 +50,16 @@ func TestDonateFD(t *testing.T) { // Craft attacher to attach to the mocked file which will return our // temporary file. - fileMock := &FileMock{OpenMock: OpenMock{File: f}} - attacher := &AttachMock{File: fileMock} + fileMock := &FileMock{ + OpenMock: OpenMock{File: f}, + GetAttrMock: GetAttrMock{ + // The mode must be valid always. + Valid: p9.AttrMask{Mode: true}, + }, + } + attacher := &AttachMock{ + File: fileMock, + } // Make socket pair. serverSocket, clientSocket, err := unet.SocketPair(false) @@ -139,15 +147,14 @@ func TestClient(t *testing.T) { a.Called = false a.File = sf a.Err = nil + // The attached root must have a valid mode. + sf.GetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory} + sf.GetAttrMock.Valid = p9.AttrMask{Mode: true} var err error - sfFile, err = c.Attach("foo") + sfFile, err = c.Attach("") if !a.Called { t.Errorf("Attach never Called?") } - if a.AttachName != "foo" { - // This wasn't carried through? - t.Errorf("attachName got %v wanted foo", a.AttachName) - } return err }, }, @@ -155,6 +162,8 @@ func TestClient(t *testing.T) { name: "bad-walk", want: sentinelErr, fn: func(c *p9.Client) error { + // Walk only called when WalkGetAttr not available. + sf.WalkGetAttrMock.Err = syscall.ENOSYS sf.WalkMock.File = d sf.WalkMock.Err = sentinelErr _, _, err := sfFile.Walk([]string{"foo", "bar"}) @@ -164,21 +173,39 @@ func TestClient(t *testing.T) { { name: "walk-to-dir", fn: func(c *p9.Client) error { + // Walk only called when WalkGetAttr not available. + sf.WalkGetAttrMock.Err = syscall.ENOSYS sf.WalkMock.Called = false + sf.WalkMock.Names = nil sf.WalkMock.File = d sf.WalkMock.Err = nil sf.WalkMock.QIDs = []p9.QID{{Type: 1}} + // All intermediate values must be directories. + d.WalkGetAttrMock.Err = syscall.ENOSYS + d.WalkMock.Called = false + d.WalkMock.Names = nil + d.WalkMock.File = d // Walk to self. + d.WalkMock.Err = nil + d.WalkMock.QIDs = []p9.QID{{Type: 1}} + d.GetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory} + d.GetAttrMock.Valid = p9.AttrMask{Mode: true} var qids []p9.QID var err error qids, _, err = sfFile.Walk([]string{"foo", "bar"}) if !sf.WalkMock.Called { t.Errorf("Walk never Called?") } - if !reflect.DeepEqual(sf.WalkMock.Names, []string{"foo", "bar"}) { - t.Errorf("got names %v wanted []{foo, bar}", sf.WalkMock.Names) + if !d.GetAttrMock.Called { + t.Errorf("GetAttr never Called?") } - if len(qids) != 1 || qids[0].Type != 1 { - t.Errorf("got qids %v wanted []{{Type: 1}}", qids) + if !reflect.DeepEqual(sf.WalkMock.Names, []string{"foo"}) { + t.Errorf("got names %v wanted []{foo}", sf.WalkMock.Names) + } + if !reflect.DeepEqual(d.WalkMock.Names, []string{"bar"}) { + t.Errorf("got names %v wanted []{bar}", d.WalkMock.Names) + } + if len(qids) != 2 || qids[len(qids)-1].Type != 1 { + t.Errorf("got qids %v wanted []{..., {Type: 1}}", qids) } return err }, @@ -187,11 +214,20 @@ func TestClient(t *testing.T) { name: "walkgetattr-to-dir", fn: func(c *p9.Client) error { sf.WalkGetAttrMock.Called = false + sf.WalkGetAttrMock.Names = nil sf.WalkGetAttrMock.File = d sf.WalkGetAttrMock.Err = nil sf.WalkGetAttrMock.QIDs = []p9.QID{{Type: 1}} - sf.WalkGetAttrMock.Attr = p9.Attr{UID: 1} + sf.WalkGetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory, UID: 1} sf.WalkGetAttrMock.Valid = p9.AttrMask{Mode: true} + // See above. 
+ d.WalkGetAttrMock.Called = false + d.WalkGetAttrMock.Names = nil + d.WalkGetAttrMock.File = d // Walk to self. + d.WalkGetAttrMock.Err = nil + d.WalkGetAttrMock.QIDs = []p9.QID{{Type: 1}} + d.WalkGetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory, UID: 1} + d.WalkGetAttrMock.Valid = p9.AttrMask{Mode: true} var qids []p9.QID var err error var mask p9.AttrMask @@ -200,11 +236,14 @@ func TestClient(t *testing.T) { if !sf.WalkGetAttrMock.Called { t.Errorf("Walk never Called?") } - if !reflect.DeepEqual(sf.WalkGetAttrMock.Names, []string{"foo", "bar"}) { - t.Errorf("got names %v wanted []{foo, bar}", sf.WalkGetAttrMock.Names) + if !reflect.DeepEqual(sf.WalkGetAttrMock.Names, []string{"foo"}) { + t.Errorf("got names %v wanted []{foo}", sf.WalkGetAttrMock.Names) + } + if !reflect.DeepEqual(d.WalkGetAttrMock.Names, []string{"bar"}) { + t.Errorf("got names %v wanted []{bar}", d.WalkGetAttrMock.Names) } - if len(qids) != 1 || qids[0].Type != 1 { - t.Errorf("got qids %v wanted []{{Type: 1}}", qids) + if len(qids) != 2 || qids[len(qids)-1].Type != 1 { + t.Errorf("got qids %v wanted []{..., {Type: 1}}", qids) } if !reflect.DeepEqual(attr, sf.WalkGetAttrMock.Attr) { t.Errorf("got attrs %s wanted %s", attr, sf.WalkGetAttrMock.Attr) diff --git a/pkg/p9/p9test/mocks.go b/pkg/p9/p9test/mocks.go index e10f206dd..9d039ac63 100644 --- a/pkg/p9/p9test/mocks.go +++ b/pkg/p9/p9test/mocks.go @@ -71,7 +71,8 @@ type WalkGetAttrMock struct { // WalkGetAttr implements p9.File.WalkGetAttr. func (w *WalkGetAttrMock) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { - w.Called, w.Names = true, names + w.Called = true + w.Names = append(w.Names, names...) return w.QIDs, w.File, w.Valid, w.Attr, w.Err } @@ -300,17 +301,14 @@ func (r *ReadlinkMock) Readlink() (string, error) { type AttachMock struct { Called bool - // Args. - AttachName string - // Return. File p9.File Err error } // Attach implements p9.Attacher.Attach. -func (a *AttachMock) Attach(attachName string) (p9.File, error) { - a.Called, a.AttachName = true, attachName +func (a *AttachMock) Attach() (p9.File, error) { + a.Called = true return a.File, a.Err } @@ -329,7 +327,8 @@ type WalkMock struct { // Walk implements p9.File.Walk. func (w *WalkMock) Walk(names []string) ([]p9.QID, p9.File, error) { - w.Called, w.Names = true, names + w.Called = true + w.Names = append(w.Names, names...) return w.QIDs, w.File, w.Err } diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 2965ae16e..28a273ac6 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -97,6 +97,9 @@ type fidRef struct { // This is updated in handlers.go. opened bool + // walkable indicates this fidRef may be walked. + walkable bool + // openFlags is the mode used in the open. // // This is updated in handlers.go. diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b325afa63..9c4864cf1 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -117,17 +117,9 @@ func NewAttachPoint(prefix string, c Config) p9.Attacher { } // Attach implements p9.Attacher. -func (a *attachPoint) Attach(appPath string) (p9.File, error) { - // Only proceed if 'appPath' is valid. - if !path.IsAbs(appPath) { - return nil, fmt.Errorf("invalid path %q", appPath) - } - if path.Clean(appPath) != appPath { - return nil, fmt.Errorf("invalid path %q", appPath) - } - - root := path.Join(a.prefix, appPath) - fi, err := os.Stat(root) +func (a *attachPoint) Attach() (p9.File, error) { + // Sanity check the prefix. 
+ fi, err := os.Stat(a.prefix) if err != nil { return nil, err } @@ -136,14 +128,15 @@ func (a *attachPoint) Attach(appPath string) (p9.File, error) { mode = os.O_RDONLY } - f, err := os.OpenFile(root, mode|openFlags, 0) + // Open the root directory. + f, err := os.OpenFile(a.prefix, mode|openFlags, 0) if err != nil { - return nil, fmt.Errorf("unable to open file %q, err: %v", root, err) + return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) } stat, err := stat(int(f.Fd())) if err != nil { f.Close() - return nil, fmt.Errorf("failed to stat file %q, err: %v", root, err) + return nil, fmt.Errorf("failed to stat file %q, err: %v", a.prefix, err) } a.attachedMu.Lock() @@ -154,7 +147,7 @@ func (a *attachPoint) Attach(appPath string) (p9.File, error) { } a.attached = true - return newLocalFile(a, f, root, stat) + return newLocalFile(a, f, a.prefix, stat) } // makeQID returns a unique QID for the given stat buffer. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index fcece4e83..a500a2976 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -19,7 +19,6 @@ import ( "io/ioutil" "os" "path" - "strings" "syscall" "testing" @@ -88,9 +87,9 @@ func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testin defer os.RemoveAll(path) a := NewAttachPoint(path, c) - root, err := a.Attach("/") + root, err := a.Attach() if err != nil { - t.Fatalf("Attach(%q) failed, err: %v", "/", err) + t.Fatalf("Attach failed, err: %v", err) } _, file, err := root.Walk([]string{name}) @@ -115,9 +114,9 @@ func setup(ft fileType) (string, string, error) { // First attach with writable configuration to setup tree. a := NewAttachPoint(path, Config{}) - root, err := a.Attach("/") + root, err := a.Attach() if err != nil { - return "", "", fmt.Errorf("Attach(%q) failed, err: %v", "/", err) + return "", "", fmt.Errorf("Attach failed, err: %v", err) } defer root.Close() @@ -618,9 +617,9 @@ func TestAttachFile(t *testing.T) { } a := NewAttachPoint(path, conf) - root, err := a.Attach("/") + root, err := a.Attach() if err != nil { - t.Fatalf("Attach(%q) failed, err: %v", "/", err) + t.Fatalf("Attach failed, err: %v", err) } if _, _, _, err := root.Open(p9.ReadWrite); err != nil { @@ -649,31 +648,6 @@ func TestAttachFile(t *testing.T) { } } -func TestAttachError(t *testing.T) { - conf := Config{ROMount: false} - root, err := ioutil.TempDir("", "root-") - if err != nil { - t.Fatalf("ioutil.TempDir() failed, err: %v", err) - } - defer os.RemoveAll(root) - a := NewAttachPoint(root, conf) - - c := path.Join(root, "test") - if err := os.Mkdir(c, 0700); err != nil { - t.Fatalf("os.Create(%q) failed, err: %v", c, err) - } - - for _, p := range []string{"test", "/test/../", "/test/./", "/test//"} { - _, err := a.Attach(p) - if err == nil { - t.Fatalf("Attach(%q) should have failed", p) - } - if want := "invalid path"; !strings.Contains(err.Error(), want) { - t.Fatalf("Attach(%q) wrong error, got: %v, wanted: %v", p, err, want) - } - } -} - func TestDoubleAttachError(t *testing.T) { conf := Config{ROMount: false} root, err := ioutil.TempDir("", "root-") @@ -683,10 +657,10 @@ func TestDoubleAttachError(t *testing.T) { defer os.RemoveAll(root) a := NewAttachPoint(root, conf) - if _, err := a.Attach("/"); err != nil { - t.Fatalf("Attach(%q) failed: %v", "/", err) + if _, err := a.Attach(); err != nil { + t.Fatalf("Attach failed: %v", err) } - if _, err := a.Attach("/"); err == nil { - t.Fatalf("Attach(%q) should have failed", "test") + if _, err := 
a.Attach(); err == nil { + t.Fatalf("Attach should have failed, got %v want non-nil", err) } } -- cgit v1.2.3 From cf5006ff24c966a652f5b9cbce3ba363208c197a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 7 Sep 2018 16:59:33 -0700 Subject: Disable test until we figure out what's broken PiperOrigin-RevId: 212059579 Change-Id: I052c2192d3483d7bd0fd2232ef2023a12da66446 --- runsc/test/image/image_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 13fd8f1ee..bc9891de4 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -147,7 +147,8 @@ func TestNginx(t *testing.T) { } } -func TestMysql(t *testing.T) { +// TODO: Enable again when bug is fixed. +func DISABLED_TestMysql(t *testing.T) { if err := testutil.Pull("mysql"); err != nil { t.Fatalf("docker pull failed: %v", err) } -- cgit v1.2.3 From 9751b800a6835f7febf99f1dee22a5aedd43f381 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 7 Sep 2018 17:38:34 -0700 Subject: runsc: Support multi-container exec. We must use a context.Context with a Root Dirent that corresponds to the container's chroot. Previously we were using the root context, which does not have a chroot. Getting the correct context required refactoring some of the path-lookup code. We can't lookup the path without a context.Context, which requires kernel.CreateProcArgs, which we only get inside control.Execute. So we have to do the path lookup much later than we previously were. PiperOrigin-RevId: 212064734 Change-Id: I84a5cfadacb21fd9c3ab9c393f7e308a40b9b537 --- pkg/sentry/control/proc.go | 18 +++++++++++- pkg/sentry/fs/mounts.go | 66 ++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/controller.go | 36 ++++++++++++++++++------ runsc/boot/fs.go | 58 ++------------------------------------ runsc/sandbox/sandbox.go | 7 ++++- runsc/specutils/specutils.go | 12 -------- 6 files changed, 118 insertions(+), 79 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 6949a3ae5..289b8ba0e 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -54,6 +54,11 @@ type ExecArgs struct { // Envv is a list of environment variables. Envv []string `json:"envv"` + // Root defines the root directory for the new process. A reference on + // Root must be held for the lifetime of the ExecArgs. If Root is nil, + // it will default to the VFS root. + Root *fs.Dirent + // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` @@ -99,6 +104,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { Argv: args.Argv, Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, + Root: args.Root, Credentials: creds, FDMap: fdm, Umask: 0022, @@ -109,8 +115,18 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), } ctx := initArgs.NewContext(proc.Kernel) - mounter := fs.FileOwnerFromContext(ctx) + if initArgs.Filename == "" { + // Get the full path to the filename from the PATH env variable. 
+ paths := fs.GetPath(initArgs.Envv) + f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.Filename = f + } + + mounter := fs.FileOwnerFromContext(ctx) for appFD, f := range args.FilePayload.Files { enableIoctl := args.StdioIsPty && appFD <= 2 diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 0318f135d..c0a803b2d 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -16,9 +16,13 @@ package fs import ( "fmt" + "path" + "strings" "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -509,3 +513,65 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) { defer mns.mu.Unlock() mns.root.SyncAll(ctx) } + +// ResolveExecutablePath resolves the given executable name given a set of +// paths that might contain it. +func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) { + // Absolute paths can be used directly. + if path.IsAbs(name) { + return name, nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if wd == "" { + wd = "/" + } + if !path.IsAbs(wd) { + return "", fmt.Errorf("working directory %q must be absolute", wd) + } + return path.Join(wd, name), nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // calling context's root directory. + root := RootFromContext(ctx) + if root == nil { + // Caller has no root. Don't bother traversing anything. + return "", syserror.ENOENT + } + defer root.DecRef() + for _, p := range paths { + binPath := path.Join(p, name) + d, err := mns.FindInode(ctx, root, nil, binPath, linux.MaxSymlinkTraversals) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return "", err + } + defer d.DecRef() + + // Check whether we can read and execute the found file. + if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { + log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) + continue + } + return path.Join("/", p, name), nil + } + return "", syserror.ENOENT +} + +// GetPath returns the PATH as a slice of strings given the environemnt +// variables. +func GetPath(env []string) []string { + const prefix = "PATH=" + for _, e := range env { + if strings.HasPrefix(e, prefix) { + return strings.Split(strings.TrimPrefix(e, prefix), ":") + } + } + return nil +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 45aa255c4..fd5b7cc9e 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -224,21 +224,39 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return nil } +// ExecArgs contains arguments to Execute. +type ExecArgs struct { + control.ExecArgs + + // CID is the ID of the container to exec in. + CID string +} + // Execute runs a command on a created or running sandbox. 
-func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { +func (cm *containerManager) Execute(e *ExecArgs, waitStatus *uint32) error { log.Debugf("containerManager.Execute: %+v", *e) - if e.Filename == "" { - rootCtx := cm.l.rootProcArgs.NewContext(cm.l.k) - rootMns := cm.l.k.RootMountNamespace() - var err error - if e.Filename, err = getExecutablePath(rootCtx, rootMns, e.Argv[0], e.Envv); err != nil { - return fmt.Errorf("error getting executable path for %q: %v", e.Argv[0], err) - } + // Get the container Root Dirent from the Task, since we must run this + // process with the same Root. + cm.l.mu.Lock() + tgid, ok := cm.l.containerRootTGIDs[e.CID] + cm.l.mu.Unlock() + if !ok { + return fmt.Errorf("cannot exec in container %q: no such container", e.CID) + } + t := cm.l.k.TaskSet().Root.TaskWithID(kernel.ThreadID(tgid)) + if t == nil { + return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", e.CID, tgid) + } + t.WithMuLocked(func(t *kernel.Task) { + e.Root = t.FSContext().RootDirectory() + }) + if e.Root != nil { + defer e.Root.DecRef() } proc := control.Proc{Kernel: cm.l.k} - if err := proc.Exec(e, waitStatus); err != nil { + if err := proc.Exec(&e.ExecArgs, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) } return nil diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 3df276170..5ec9a7d03 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "path" "path/filepath" "strconv" "strings" @@ -683,64 +682,11 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe return nil } -// getExecutablePath traverses the *container's* filesystem to resolve exec's -// absolute path. For example, if the container is being served files by the -// fsgofer serving /foo/bar as the container root, it will search within -// /foo/bar, not the host root. -// TODO: Unit test this. -func getExecutablePath(ctx context.Context, mns *fs.MountNamespace, filename string, env []string) (string, error) { - exec := filepath.Clean(filename) - - // Don't search PATH if exec is a path to a file (absolute or relative). - if strings.IndexByte(exec, '/') >= 0 { - return exec, nil - } - - // Search the PATH for a file whose name matches the one we are looking - // for. - pathDirs := specutils.GetPath(env) - for _, p := range pathDirs { - // Try to find the binary inside path p. - binPath := path.Join(p, filename) - root := fs.RootFromContext(ctx) - defer root.DecRef() - d, err := mns.FindInode(ctx, root, nil, binPath, linux.MaxSymlinkTraversals) - if err == syserror.ENOENT || err == syserror.EACCES { - continue - } - if err != nil { - return "", fmt.Errorf("FindInode(%q) failed: %v", binPath, err) - } - defer d.DecRef() - - // Check whether we can read and execute the found file. - if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil { - log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) - continue - } - return path.Join("/", p, exec), nil - } - - return "", fmt.Errorf("could not find executable %q in path %v", exec, pathDirs) -} - // setExecutablePath sets the procArgs.Filename by searching the PATH for an // executable matching the procArgs.Argv[0]. func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { - if procArgs.Filename != "" { - // Sanity check. 
- if !path.IsAbs(procArgs.Filename) { - return fmt.Errorf("filename must be absolute: %q", procArgs.Filename) - } - // Nothing to set. - return nil - } - - if len(procArgs.Argv) == 0 { - return fmt.Errorf("Argv must not be empty") - } - - f, err := getExecutablePath(ctx, mns, procArgs.Argv[0], procArgs.Envv) + paths := fs.GetPath(procArgs.Envv) + f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, procArgs.Argv[0], paths) if err != nil { return err } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 697210669..f272496a1 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -187,11 +187,16 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, } defer conn.Close() + ea := &boot.ExecArgs{ + ExecArgs: *e, + CID: cid, + } + // Send a message to the sandbox control server to start the container. var waitStatus uint32 // TODO: Pass in the container id (cid) here. The sandbox // should execute in the context of that container. - if err := conn.Call(boot.ContainerExecute, e, &waitStatus); err != nil { + if err := conn.Call(boot.ContainerExecute, ea, &waitStatus); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index f3fa8d129..fdc9007e0 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -163,18 +163,6 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) return &spec, nil } -// GetPath returns the PATH as a slice of strings given the environemnt -// variables. -func GetPath(env []string) []string { - const prefix = "PATH=" - for _, e := range env { - if strings.HasPrefix(e, prefix) { - return strings.Split(strings.TrimPrefix(e, prefix), ":") - } - } - return nil -} - // Capabilities takes in spec and returns a TaskCapabilities corresponding to // the spec. func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { -- cgit v1.2.3 From 922d8c3c8c7ecc1d893857425f8e850513ab682b Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 7 Sep 2018 17:55:04 -0700 Subject: Automated rollback of changelist 211992321 PiperOrigin-RevId: 212066419 Change-Id: Icded56e7e117bfd9b644e6541bddcd110460a9b8 --- runsc/main.go | 8 -------- 1 file changed, 8 deletions(-) (limited to 'runsc') diff --git a/runsc/main.go b/runsc/main.go index c30b29b81..44d30768f 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -179,10 +179,6 @@ func main() { if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") - // Dup f to stderr so we capture stack traces on panic. - if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { - cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) - } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } else if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { @@ -193,10 +189,6 @@ func main() { if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) } - // Dup f to stderr so we capture stack traces on panic. 
- if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { - cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) - } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } -- cgit v1.2.3 From 0c0c942327468e605f5b71cd6ffa75dda6e24fdc Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 7 Sep 2018 18:22:22 -0700 Subject: Automated rollback of changelist 212059579 PiperOrigin-RevId: 212069131 Change-Id: I01476f957bbf29d4ee5a3c11d59d4f863ba9f2df --- runsc/test/image/image_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index bc9891de4..13fd8f1ee 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -147,8 +147,7 @@ func TestNginx(t *testing.T) { } } -// TODO: Enable again when bug is fixed. -func DISABLED_TestMysql(t *testing.T) { +func TestMysql(t *testing.T) { if err := testutil.Pull("mysql"); err != nil { t.Fatalf("docker pull failed: %v", err) } -- cgit v1.2.3 From e198f9ab02874caeef65f16c0546af1e52e9a7d3 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 10 Sep 2018 09:59:03 -0700 Subject: runsc: Chmod all mounted files to 777 inside chroot. Inside the chroot, we run as user nobody, so all mounted files and directories must be accessible to all users. PiperOrigin-RevId: 212284805 Change-Id: I705e0dbbf15e01e04e0c7f378a99daffe6866807 --- runsc/sandbox/chroot.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go index a77a186c2..f35d9c72d 100644 --- a/runsc/sandbox/chroot.go +++ b/runsc/sandbox/chroot.go @@ -36,7 +36,16 @@ func mountInChroot(chroot, src, dst, typ string, flags uint32) error { chrootDst := filepath.Join(chroot, dst) log.Infof("Mounting %q at %q", src, chrootDst) - return specutils.Mount(src, chrootDst, typ, flags) + if err := specutils.Mount(src, chrootDst, typ, flags); err != nil { + return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) + } + + // Make sure the mount is accessible to all users, since we will be + // running as nobody inside the chroot. 
+ if err := os.Chmod(chrootDst, 0777); err != nil { + return fmt.Errorf("Chmod(%q) failed: %v", chroot, err) + } + return nil } // setUpChroot creates an empty directory with runsc mounted at /runsc, proc -- cgit v1.2.3 From c44bc6612fc4554d0aa4e484a46cd1f6b6a7b5c5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 11 Sep 2018 11:04:06 -0700 Subject: Allow fstatat back in syscall filters PiperOrigin-RevId: 212483372 Change-Id: If95f32a8e41126cf3dc8bd6c8b2fb0fcfefedc6d --- runsc/boot/filter/config.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 1a0c426ab..8cdf56963 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -205,13 +205,14 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, - syscall.SYS_MPROTECT: {}, - syscall.SYS_MUNMAP: {}, - syscall.SYS_NANOSLEEP: {}, - syscall.SYS_POLL: {}, - syscall.SYS_PREAD64: {}, - syscall.SYS_PWRITE64: {}, - syscall.SYS_READ: {}, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_NEWFSTATAT: {}, + syscall.SYS_POLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, syscall.SYS_READV: []seccomp.Rule{ { seccomp.AllowAny{}, -- cgit v1.2.3 From 6cc9b311af3633d244f526abed50c0d3b0ce06a1 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 11 Sep 2018 13:08:36 -0700 Subject: platform: Pass device fd into platform constructor. We were previously openining the platform device (i.e. /dev/kvm) inside the platfrom constructor (i.e. kvm.New). This requires that we have RW access to the platform device when constructing the platform. However, now that the runsc sandbox process runs as user "nobody", it is not able to open the platform device. This CL changes the kvm constructor to take the platform device FD, rather than opening the device file itself. The device file is opened outside of the sandbox and passed to the sandbox process. PiperOrigin-RevId: 212505804 Change-Id: I427e1d9de5eb84c84f19d513356e1bb148a52910 --- pkg/sentry/platform/kvm/kvm.go | 25 ++++++++++++++--------- pkg/sentry/platform/kvm/kvm_test.go | 6 +++++- runsc/boot/controller.go | 24 +++++++++++++++++----- runsc/boot/loader.go | 11 ++++++---- runsc/boot/loader_test.go | 2 +- runsc/cmd/boot.go | 6 +++++- runsc/sandbox/BUILD | 1 + runsc/sandbox/chroot.go | 40 +++---------------------------------- runsc/sandbox/sandbox.go | 40 ++++++++++++++++++++++++++++++++++++- 9 files changed, 96 insertions(+), 59 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 2dc3239a5..19bc2d515 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -17,6 +17,7 @@ package kvm import ( "fmt" + "os" "sync" "syscall" @@ -44,25 +45,29 @@ var ( globalErr error ) +// OpenDevice opens the KVM device at /dev/kvm and returns the File. +func OpenDevice() (*os.File, error) { + f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0) + if err != nil { + return nil, fmt.Errorf("error opening /dev/kvm: %v", err) + } + return f, nil +} + // New returns a new KVM-based implementation of the platform interface. -func New() (*KVM, error) { +func New(deviceFile *os.File) (*KVM, error) { // Allocate physical memory for the vCPUs. fm, err := filemem.New("kvm-memory") if err != nil { return nil, err } - // Try opening KVM. 
- fd, err := syscall.Open("/dev/kvm", syscall.O_RDWR, 0) - if err != nil { - return nil, fmt.Errorf("opening /dev/kvm: %v", err) - } - defer syscall.Close(fd) + fd := deviceFile.Fd() // Ensure global initialization is done. globalOnce.Do(func() { physicalInit() - globalErr = updateSystemValues(fd) + globalErr = updateSystemValues(int(fd)) ring0.Init(cpuid.HostFeatureSet()) }) if globalErr != nil { @@ -70,10 +75,12 @@ func New() (*KVM, error) { } // Create a new VM fd. - vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_CREATE_VM, 0) + vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, fd, _KVM_CREATE_VM, 0) if errno != 0 { return nil, fmt.Errorf("creating VM: %v", errno) } + // We are done with the device file. + deviceFile.Close() // Create a VM context. machine, err := newMachine(int(vm)) diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 180bf7bb0..52448839f 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -39,7 +39,11 @@ type testHarness interface { func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { // Create the machine. - k, err := New() + deviceFile, err := OpenDevice() + if err != nil { + t.Fatalf("error opening device file: %v", err) + } + k, err := New(deviceFile) if err != nil { t.Fatalf("error creating KVM instance: %v", err) } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fd5b7cc9e..257f275f9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -17,6 +17,7 @@ package boot import ( "errors" "fmt" + "os" "path" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -287,7 +288,8 @@ func (cm *containerManager) WaitForLoader(_, _ *struct{}) error { // RestoreOpts contains options related to restoring a container's file system. type RestoreOpts struct { - // FilePayload contains the state file to be restored. + // FilePayload contains the state file to be restored, followed by the + // platform device file if necessary. urpc.FilePayload // SandboxID contains the ID of the sandbox. @@ -300,16 +302,28 @@ type RestoreOpts struct { // signal to start. func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") - if len(o.FilePayload.Files) != 1 { - return fmt.Errorf("exactly one file must be provided") + + var specFile, deviceFile *os.File + switch numFiles := len(o.FilePayload.Files); numFiles { + case 2: + // The device file is donated to the platform, so don't Close + // it here. + deviceFile = o.FilePayload.Files[1] + fallthrough + case 1: + specFile = o.FilePayload.Files[0] + defer specFile.Close() + case 0: + return fmt.Errorf("at least one file must be passed to Restore") + default: + return fmt.Errorf("at most two files may be passed to Restore") } - defer o.FilePayload.Files[0].Close() // Destroy the old kernel and create a new kernel. cm.l.k.Pause() cm.l.k.Destroy() - p, err := createPlatform(cm.l.conf) + p, err := createPlatform(cm.l.conf, int(deviceFile.Fd())) if err != nil { return fmt.Errorf("error creating platform: %v", err) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 994b3d2e2..30d22b9c6 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -122,9 +122,9 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. 
-func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) { +func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int, console bool) (*Loader, error) { // Create kernel and platform. - p, err := createPlatform(conf) + p, err := createPlatform(conf, deviceFD) if err != nil { return nil, fmt.Errorf("error creating platform: %v", err) } @@ -301,14 +301,17 @@ func (l *Loader) Destroy() { l.watchdog.Stop() } -func createPlatform(conf *Config) (platform.Platform, error) { +func createPlatform(conf *Config, deviceFD int) (platform.Platform, error) { switch conf.Platform { case PlatformPtrace: log.Infof("Platform: ptrace") return ptrace.New() case PlatformKVM: log.Infof("Platform: kvm") - return kvm.New() + if deviceFD < 0 { + return nil, fmt.Errorf("kvm device fd must be provided") + } + return kvm.New(os.NewFile(uintptr(deviceFD), "kvm device")) default: return nil, fmt.Errorf("invalid platform %v", conf.Platform) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index d6bfe9ff1..9398292ff 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -101,7 +101,7 @@ func createLoader() (*Loader, func(), error) { return nil, nil, err } - l, err := New(spec, conf, fd, []int{sandEnd}, false) + l, err := New(spec, conf, fd, -1 /* device fd */, []int{sandEnd}, false) if err != nil { cleanup() return nil, nil, err diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index d8c7b9cd3..035147cf1 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -42,6 +42,9 @@ type Boot struct { // control server that is donated to this process. controllerFD int + // deviceFD is the file descriptor for the platform device file. + deviceFD int + // ioFDs is the list of FDs used to connect to FS gofers. ioFDs intFlags @@ -74,6 +77,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") + f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") @@ -134,7 +138,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. 
- l, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) + l, err := boot.New(spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.console) if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 8ebd14c4e..5cf8f0cda 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -18,6 +18,7 @@ go_library( "//pkg/control/server", "//pkg/log", "//pkg/sentry/control", + "//pkg/sentry/platform/kvm", "//pkg/urpc", "//runsc/boot", "//runsc/console", diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go index f35d9c72d..749bf3782 100644 --- a/runsc/sandbox/chroot.go +++ b/runsc/sandbox/chroot.go @@ -22,7 +22,6 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -39,18 +38,12 @@ func mountInChroot(chroot, src, dst, typ string, flags uint32) error { if err := specutils.Mount(src, chrootDst, typ, flags); err != nil { return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) } - - // Make sure the mount is accessible to all users, since we will be - // running as nobody inside the chroot. - if err := os.Chmod(chrootDst, 0777); err != nil { - return fmt.Errorf("Chmod(%q) failed: %v", chroot, err) - } return nil } -// setUpChroot creates an empty directory with runsc mounted at /runsc, proc -// mounted at /proc, and any dev files needed for the platform. -func setUpChroot(platform boot.PlatformType) (string, error) { +// setUpChroot creates an empty directory with runsc mounted at /runsc and proc +// mounted at /proc. +func setUpChroot() (string, error) { // Create the chroot directory and make it accessible to all users. chroot, err := ioutil.TempDir("", "runsc-sandbox-chroot-") if err != nil { @@ -75,18 +68,6 @@ func setUpChroot(platform boot.PlatformType) (string, error) { return "", fmt.Errorf("error mounting runsc in chroot: %v", err) } - // Mount dev files needed for platform. - var devMount string - switch platform { - case boot.PlatformKVM: - devMount = "/dev/kvm" - } - if devMount != "" { - if err := mountInChroot(chroot, devMount, devMount, "bind", syscall.MS_BIND); err != nil { - return "", fmt.Errorf("error mounting platform device in chroot: %v", err) - } - } - return chroot, nil } @@ -105,21 +86,6 @@ func tearDownChroot(chroot string) error { return fmt.Errorf("error unmounting %q: %v", exe, err) } - // Unmount platform dev files. - devFiles := []string{"dev/kvm"} - for _, f := range devFiles { - devPath := filepath.Join(chroot, f) - if _, err := os.Stat(devPath); err != nil { - if os.IsNotExist(err) { - continue - } - return fmt.Errorf("Stat(%q) failed: %v", devPath, err) - } - if err := syscall.Unmount(devPath, 0); err != nil { - return fmt.Errorf("error unmounting %q: %v", devPath, err) - } - } - // Remove chroot directory. 
if err := os.RemoveAll(chroot); err != nil { return fmt.Errorf("error removing %q: %v", chroot, err) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f272496a1..195deda1e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/console" @@ -140,6 +141,14 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } + // If the platform needs a device fd we must pass it in. + if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { + return err + } else if deviceFile != nil { + defer deviceFile.Close() + opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile) + } + conn, err := s.sandboxConnect() if err != nil { return err @@ -315,6 +324,16 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } + // If the platform needs a device fd we must pass it in. + if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { + return err + } else if deviceFile != nil { + defer deviceFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile) + cmd.Args = append(cmd.Args, "--device-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + // Sandbox stdio defaults to current process stdio. cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout @@ -428,7 +447,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") } else if specutils.HasCapSysAdmin() { log.Infof("Sandbox will be started in minimal chroot") - chroot, err := setUpChroot(conf.Platform) + chroot, err := setUpChroot() if err != nil { return fmt.Errorf("error setting up chroot: %v", err) } @@ -660,3 +679,22 @@ func signalProcess(pid int, sig syscall.Signal) error { } return nil } + +// deviceFileForPlatform opens the device file for the given platform. If the +// platform does not need a device file, then nil is returned. 
+func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { + var ( + f *os.File + err error + ) + switch p { + case boot.PlatformKVM: + f, err = kvm.OpenDevice() + default: + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("error opening device file for platform %q: %v", p, err) + } + return f, err +} -- cgit v1.2.3 From b4aed01bf227bfc0b29ce3100858366f60c0647b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 11 Sep 2018 17:53:48 -0700 Subject: Rollback of changelist 212483372 PiperOrigin-RevId: 212557844 Change-Id: I414de848e75d57ecee2c05e851d05b607db4aa57 --- runsc/boot/filter/config.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 8cdf56963..1a0c426ab 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -205,14 +205,13 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, - syscall.SYS_MPROTECT: {}, - syscall.SYS_MUNMAP: {}, - syscall.SYS_NANOSLEEP: {}, - syscall.SYS_NEWFSTATAT: {}, - syscall.SYS_POLL: {}, - syscall.SYS_PREAD64: {}, - syscall.SYS_PWRITE64: {}, - syscall.SYS_READ: {}, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_POLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, syscall.SYS_READV: []seccomp.Rule{ { seccomp.AllowAny{}, -- cgit v1.2.3 From 0efde2bfbde2fea78134a32f5fb34332ec0ce531 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 12 Sep 2018 10:50:22 -0700 Subject: Remove getdents from filters It was only used by whitelistfs, which was removed in bc81f3fe4a042a15343d2eab44da32d818ac1ade. PiperOrigin-RevId: 212666374 Change-Id: Ia35e6dc9d68c1a3b015d5b5f71ea3e68e46c5bed --- runsc/boot/filter/config.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 1a0c426ab..0bcc640d5 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -98,9 +98,8 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(0), }, }, - syscall.SYS_GETDENTS64: {}, - syscall.SYS_GETPID: {}, - unix.SYS_GETRANDOM: {}, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { seccomp.AllowAny{}, -- cgit v1.2.3 From 2eff1fdd061be9cfabc36532dda8cbefeb02e534 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 12 Sep 2018 15:22:24 -0700 Subject: runsc: Add exec flag that specifies where to save the sandbox-internal pid. This is different from the existing -pid-file flag, which saves a host pid. PiperOrigin-RevId: 212713968 Change-Id: I2c486de8dd5cfd9b923fb0970165ef7c5fc597f0 --- pkg/sentry/control/proc.go | 35 ++++++++++------ runsc/boot/controller.go | 33 +++++++++------ runsc/cmd/exec.go | 31 +++++++++++---- runsc/container/container.go | 9 +++-- runsc/container/container_test.go | 84 ++++++++++++++++++++++++--------------- runsc/sandbox/sandbox.go | 19 ++++----- 6 files changed, 129 insertions(+), 82 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 289b8ba0e..1623ed19a 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -87,6 +87,24 @@ type ExecArgs struct { // Exec runs a new task. 
func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { + newTG, err := proc.execAsync(args) + if err != nil { + return err + } + + // Wait for completion. + newTG.WaitExited() + *waitStatus = newTG.ExitStatus().Status() + return nil +} + +// ExecAsync runs a new task, but doesn't wait for it to finish. It is defined +// as a function rather than a method to avoid exposing execAsync as an RPC. +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, error) { + return proc.execAsync(args) +} + +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { // Import file descriptors. l := limits.NewLimitSet() fdm := proc.Kernel.NewFDMap() @@ -121,7 +139,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { paths := fs.GetPath(initArgs.Envv) f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } @@ -133,7 +151,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // Import the given file FD. This dups the FD as well. file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) if err != nil { - return err + return nil, err } defer file.DecRef() @@ -141,20 +159,11 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { f.Close() if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { - return err + return nil, err } } - // Start the new task. - newTG, err := proc.Kernel.CreateProcess(initArgs) - if err != nil { - return err - } - - // Wait for completion. - newTG.WaitExited() - *waitStatus = newTG.ExitStatus().Status() - return nil + return proc.Kernel.CreateProcess(initArgs) } // PsArgs is the set of arguments to ps. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 257f275f9..aaac852e0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -41,9 +41,9 @@ const ( // container used by "runsc events". ContainerEvent = "containerManager.Event" - // ContainerExecute is the URPC endpoint for executing a command in a + // ContainerExecuteAsync is the URPC endpoint for executing a command in a // container.. - ContainerExecute = "containerManager.Execute" + ContainerExecuteAsync = "containerManager.ExecuteAsync" // ContainerPause pauses the container. ContainerPause = "containerManager.Pause" @@ -233,33 +233,40 @@ type ExecArgs struct { CID string } -// Execute runs a command on a created or running sandbox. -func (cm *containerManager) Execute(e *ExecArgs, waitStatus *uint32) error { - log.Debugf("containerManager.Execute: %+v", *e) +// ExecuteAsync starts running a command on a created or running sandbox. It +// returns the pid of the new process. +func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { + log.Debugf("containerManager.ExecuteAsync: %+v", args) // Get the container Root Dirent from the Task, since we must run this // process with the same Root. 
cm.l.mu.Lock() - tgid, ok := cm.l.containerRootTGIDs[e.CID] + tgid, ok := cm.l.containerRootTGIDs[args.CID] cm.l.mu.Unlock() if !ok { - return fmt.Errorf("cannot exec in container %q: no such container", e.CID) + return fmt.Errorf("cannot exec in container %q: no such container", args.CID) } t := cm.l.k.TaskSet().Root.TaskWithID(kernel.ThreadID(tgid)) if t == nil { - return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", e.CID, tgid) + return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", args.CID, tgid) } t.WithMuLocked(func(t *kernel.Task) { - e.Root = t.FSContext().RootDirectory() + args.Root = t.FSContext().RootDirectory() }) - if e.Root != nil { - defer e.Root.DecRef() + if args.Root != nil { + defer args.Root.DecRef() } + // Start the process. proc := control.Proc{Kernel: cm.l.k} - if err := proc.Exec(&e.ExecArgs, waitStatus); err != nil { - return fmt.Errorf("error executing: %+v: %v", e, err) + newTG, err := control.ExecAsync(&proc, &args.ExecArgs) + if err != nil { + return fmt.Errorf("error executing: %+v: %v", args, err) } + + // Return the pid of the newly-created process. + ts := cm.l.k.TaskSet() + *pid = int32(ts.Root.IDOfThreadGroup(newTG)) return nil } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index da1642c08..0d1fa6e20 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -45,12 +45,13 @@ type Exec struct { cwd string env stringSlice // user contains the UID and GID with which to run the new process. - user user - extraKGIDs stringSlice - caps stringSlice - detach bool - processPath string - pidFile string + user user + extraKGIDs stringSlice + caps stringSlice + detach bool + processPath string + pidFile string + internalPidFile string // consoleSocket is the path to an AF_UNIX socket which will receive a // file descriptor referencing the master end of the console's @@ -97,6 +98,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") f.StringVar(&ex.processPath, "process", "", "path to the process.json") f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") + f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") } @@ -146,10 +148,25 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } - ws, err := c.Execute(e) + // Start the new process and get it pid. + pid, err := c.Execute(e) if err != nil { Fatalf("error getting processes for container: %v", err) } + + // Write the sandbox-internal pid if required. + if ex.internalPidFile != "" { + pidStr := []byte(strconv.Itoa(int(pid))) + if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil { + Fatalf("error writing internal pid file %q: %v", ex.internalPidFile, err) + } + } + + // Wait for the process to exit. 
+ ws, err := c.WaitPID(pid) + if err != nil { + Fatalf("error waiting on pid %d: %v", pid, err) + } *waitStatus = ws return subcommands.ExitSuccess } diff --git a/runsc/container/container.go b/runsc/container/container.go index 9a05a1dc5..38848d02f 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -353,13 +353,14 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke return c.Wait() } -// Execute runs the specified command in the container. -func (c *Container) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { - log.Debugf("Execute in container %q, args: %+v", c.ID, e) +// Execute runs the specified command in the container. It returns the pid of +// the newly created process. +func (c *Container) Execute(args *control.ExecArgs) (int32, error) { + log.Debugf("Execute in container %q, args: %+v", c.ID, args) if c.Status != Created && c.Status != Running { return 0, fmt.Errorf("cannot exec in container in state %s", c.Status) } - return c.Sandbox.Execute(c.ID, e) + return c.Sandbox.Execute(c.ID, args) } // Event returns events for the container. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index c45eb79a3..790334249 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -49,11 +49,11 @@ func init() { } // waitForProcessList waits for the given process list to show up in the container. -func waitForProcessList(s *Container, expected []*control.Process) error { +func waitForProcessList(cont *Container, expected []*control.Process) error { var got []*control.Process for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { var err error - got, err = s.Processes() + got, err = cont.Processes() if err != nil { return fmt.Errorf("error getting process data from container: %v", err) } @@ -485,12 +485,12 @@ func TestExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { + defer cont.Destroy() + if err := cont.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } @@ -513,11 +513,11 @@ func TestExec(t *testing.T) { } // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL[:1]); err != nil { + if err := waitForProcessList(cont, expectedPL[:1]); err != nil { t.Error(err) } - execArgs := control.ExecArgs{ + args := &control.ExecArgs{ Filename: "/bin/sleep", Argv: []string{"sleep", "5"}, WorkingDirectory: "/", @@ -528,17 +528,19 @@ func TestExec(t *testing.T) { // First, start running exec (whick blocks). 
status := make(chan error, 1) go func() { - exitStatus, err := s.Execute(&execArgs) + exitStatus, err := cont.executeSync(args) if err != nil { + log.Debugf("error executing: %v", err) status <- err } else if exitStatus != 0 { + log.Debugf("bad status: %d", exitStatus) status <- fmt.Errorf("failed with exit status: %v", exitStatus) } else { status <- nil } }() - if err := waitForProcessList(s, expectedPL); err != nil { + if err := waitForProcessList(cont, expectedPL); err != nil { t.Fatal(err) } @@ -548,7 +550,7 @@ func TestExec(t *testing.T) { t.Fatalf("container timed out waiting for exec to finish.") case st := <-status: if st != nil { - t.Errorf("container failed to exec %v: %v", execArgs, err) + t.Errorf("container failed to exec %v: %v", args, err) } } } @@ -884,15 +886,18 @@ func TestPauseResume(t *testing.T) { } script := fmt.Sprintf("while [[ -f %q ]]; do sleep 0.1; done", lock.Name()) - execArgs := control.ExecArgs{ + args := &control.ExecArgs{ Filename: "/bin/bash", Argv: []string{"bash", "-c", script}, WorkingDirectory: "/", KUID: uid, } - // First, start running exec (which blocks). - go cont.Execute(&execArgs) + // First, start running exec. + _, err = cont.Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } // Verify that "sleep 5" is running. if err := waitForProcessList(cont, expectedPL); err != nil { @@ -1022,12 +1027,12 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { + defer cont.Destroy() + if err := cont.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } @@ -1048,7 +1053,7 @@ func TestCapabilities(t *testing.T) { Cmd: "exe", }, } - if err := waitForProcessList(s, expectedPL[:1]); err != nil { + if err := waitForProcessList(cont, expectedPL[:1]); err != nil { t.Fatalf("Failed to wait for sleep to start, err: %v", err) } @@ -1064,7 +1069,7 @@ func TestCapabilities(t *testing.T) { // Need to traverse the intermediate directory. os.Chmod(rootDir, 0755) - execArgs := control.ExecArgs{ + args := &control.ExecArgs{ Filename: exePath, Argv: []string{exePath}, WorkingDirectory: "/", @@ -1074,17 +1079,17 @@ func TestCapabilities(t *testing.T) { } // "exe" should fail because we don't have the necessary permissions. - if _, err := s.Execute(&execArgs); err == nil { + if _, err := cont.executeSync(args); err == nil { t.Fatalf("container executed without error, but an error was expected") } // Now we run with the capability enabled and should succeed. - execArgs.Capabilities = &auth.TaskCapabilities{ + args.Capabilities = &auth.TaskCapabilities{ EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), } // "exe" should not fail this time. - if _, err := s.Execute(&execArgs); err != nil { - t.Fatalf("container failed to exec %v: %v", execArgs, err) + if _, err := cont.executeSync(args); err != nil { + t.Fatalf("container failed to exec %v: %v", args, err) } } } @@ -1404,11 +1409,11 @@ func TestContainerVolumeContentsShared(t *testing.T) { filename := filepath.Join(dir, "file") // File does not exist yet. Reading from the sandbox should fail. 
- execArgsTestFile := control.ExecArgs{ + argsTestFile := &control.ExecArgs{ Filename: "/usr/bin/test", Argv: []string{"test", "-f", filename}, } - if ws, err := c.Execute(&execArgsTestFile); err != nil { + if ws, err := c.executeSync(argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) @@ -1420,7 +1425,7 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // Now we should be able to test the file from within the sandbox. - if ws, err := c.Execute(&execArgsTestFile); err != nil { + if ws, err := c.executeSync(argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -1433,18 +1438,18 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // File should no longer exist at the old path within the sandbox. - if ws, err := c.Execute(&execArgsTestFile); err != nil { + if ws, err := c.executeSync(argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) } // We should be able to test the new filename from within the sandbox. - execArgsTestNewFile := control.ExecArgs{ + argsTestNewFile := &control.ExecArgs{ Filename: "/usr/bin/test", Argv: []string{"test", "-f", newFilename}, } - if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + if ws, err := c.executeSync(argsTestNewFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() != 0 { t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) @@ -1456,20 +1461,20 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // Renamed file should no longer exist at the old path within the sandbox. - if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + if ws, err := c.executeSync(argsTestNewFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) } // Now create the file from WITHIN the sandbox. - execArgsTouch := control.ExecArgs{ + argsTouch := &control.ExecArgs{ Filename: "/usr/bin/touch", Argv: []string{"touch", filename}, KUID: auth.KUID(os.Getuid()), KGID: auth.KGID(os.Getgid()), } - if ws, err := c.Execute(&execArgsTouch); err != nil { + if ws, err := c.executeSync(argsTouch); err != nil { t.Fatalf("unexpected error touching file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -1486,11 +1491,11 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // Delete the file from within the sandbox. - execArgsRemove := control.ExecArgs{ + argsRemove := &control.ExecArgs{ Filename: "/bin/rm", Argv: []string{"rm", filename}, } - if ws, err := c.Execute(&execArgsRemove); err != nil { + if ws, err := c.executeSync(argsRemove); err != nil { t.Fatalf("unexpected error removing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -1547,6 +1552,19 @@ func TestGoferExits(t *testing.T) { } } +// executeSync synchronously executes a new process. 
+func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { + pid, err := cont.Execute(args) + if err != nil { + return 0, fmt.Errorf("error executing: %v", err) + } + ws, err := cont.WaitPID(pid) + if err != nil { + return 0, fmt.Errorf("error waiting: %v", err) + } + return ws, nil +} + func TestMain(m *testing.M) { testutil.RunAsRoot(m) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 195deda1e..8e90dcc70 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -187,8 +187,9 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { return pl, nil } -// Execute runs the specified command in the container. -func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { +// Execute runs the specified command in the container. It returns the pid of +// the newly created process. +func (s *Sandbox) Execute(cid string, args *control.ExecArgs) (int32, error) { log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) conn, err := s.sandboxConnect() if err != nil { @@ -196,20 +197,14 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, } defer conn.Close() - ea := &boot.ExecArgs{ - ExecArgs: *e, - CID: cid, - } + rpcArgs := &boot.ExecArgs{ExecArgs: *args, CID: cid} // Send a message to the sandbox control server to start the container. - var waitStatus uint32 - // TODO: Pass in the container id (cid) here. The sandbox - // should execute in the context of that container. - if err := conn.Call(boot.ContainerExecute, ea, &waitStatus); err != nil { + var pid int32 + if err := conn.Call(boot.ContainerExecuteAsync, rpcArgs, &pid); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } - - return syscall.WaitStatus(waitStatus), nil + return pid, nil } // Event retrieves stats about the sandbox such as memory and CPU utilization. -- cgit v1.2.3 From bde2a91433cfbac426577a691bf13817115b53be Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 13 Sep 2018 16:36:53 -0700 Subject: runsc: Support container signal/wait. This CL: 1) Fix `runsc wait`, it now also works after the container exits; 2) Generate correct container state in Load; 2) Make sure `Destory` cleanup everything before successfully return. PiperOrigin-RevId: 212900107 Change-Id: Ie129cbb9d74f8151a18364f1fc0b2603eac4109a --- runsc/boot/controller.go | 25 +++--- runsc/boot/loader.go | 75 ++++++++-------- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/debug.go | 6 +- runsc/container/container.go | 178 ++++++++++++++++---------------------- runsc/container/container_test.go | 52 +++++++++-- runsc/sandbox/BUILD | 2 +- runsc/sandbox/sandbox.go | 38 ++++---- 8 files changed, 195 insertions(+), 183 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index aaac852e0..69154ff23 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -161,8 +161,11 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { log.Debugf("containerManager.StartRoot") // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("failed to start sandbox: %v", err) + } cm.l.setRootContainerID(*cid) - return <-cm.startResultChan + return nil } // Processes retrieves information about processes running in the sandbox. 
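// Aside (condensed sketch, names and shape illustrative; not part of the
// patch): the start handshake now surfaces loader failures to the caller.
// The controller pokes the loader over startChan, blocks on startResultChan,
// and only records the root container ID once the loader reports success.
func controllerStart(startChan chan<- struct{}, startResultChan <-chan error) error {
	startChan <- struct{}{} // wake the loader's run() goroutine
	if err := <-startResultChan; err != nil {
		return fmt.Errorf("failed to start sandbox: %v", err)
	}
	return nil // safe to record the root container ID now
}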
@@ -216,11 +219,11 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return fmt.Errorf("start arguments must contain at least one file for the container root") } - tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) + err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { return err } - log.Debugf("Container %q started with root PID of %d", args.CID, tgid) + log.Debugf("Container %q started", args.CID) return nil } @@ -241,16 +244,12 @@ func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. cm.l.mu.Lock() - tgid, ok := cm.l.containerRootTGIDs[args.CID] + tg, ok := cm.l.containerRootTGs[args.CID] cm.l.mu.Unlock() if !ok { return fmt.Errorf("cannot exec in container %q: no such container", args.CID) } - t := cm.l.k.TaskSet().Root.TaskWithID(kernel.ThreadID(tgid)) - if t == nil { - return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", args.CID, tgid) - } - t.WithMuLocked(func(t *kernel.Task) { + tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() }) if args.Root != nil { @@ -378,12 +377,15 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k cm.l.watchdog = watchdog cm.l.rootProcArgs = kernel.CreateProcessArgs{} - cm.l.setRootContainerID(o.SandboxID) cm.l.restore = true // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} - return <-cm.startResultChan + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("failed to start sandbox: %v", err) + } + cm.l.setRootContainerID(o.SandboxID) + return nil } // Resume unpauses a container. @@ -423,6 +425,7 @@ type SignalArgs struct { } // Signal sends a signal to the init process of the container. +// TODO: Send signal to exec process. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal") return cm.l.signal(args.CID, args.Signo) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 30d22b9c6..2ddb358bd 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -16,7 +16,6 @@ package boot import ( - "errors" "fmt" "math/rand" "os" @@ -101,15 +100,15 @@ type Loader struct { // sandboxID is the ID for the whole sandbox. sandboxID string - // mu guards containerRootTGIDs. + // mu guards containerRootTGs. mu sync.Mutex - // containerRootTGIDs maps container IDs to their root processes. It + // containerRootTGs maps container IDs to their root processes. It // can be used to determine which process to manipulate when clients // call methods on particular containers. // - // containerRootTGIDs is guarded by mu. - containerRootTGIDs map[string]kernel.ThreadID + // containerRootTGs is guarded by mu. + containerRootTGs map[string]*kernel.ThreadGroup } func init() { @@ -399,11 +398,11 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) (kernel.ThreadID, error) { +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { // Create capabilities. 
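// Aside (hypothetical helper, not in the patch): ExecuteAsync, waitContainer,
// and signal all perform the same guarded lookup on the new containerRootTGs
// map; written out once, the pattern is simply:
func (l *Loader) rootThreadGroup(cid string) (*kernel.ThreadGroup, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	tg, ok := l.containerRootTGs[cid]
	if !ok {
		return nil, fmt.Errorf("no such container %q", cid)
	}
	return tg, nil
}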
caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { - return 0, fmt.Errorf("error creating capabilities: %v", err) + return fmt.Errorf("error creating capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. @@ -429,7 +428,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config procArgs, err := newProcess(spec, creds, l.k) if err != nil { - return 0, fmt.Errorf("failed to create new process: %v", err) + return fmt.Errorf("failed to create new process: %v", err) } // Can't take ownership away from os.File. dup them to get a new FDs. @@ -437,7 +436,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config for _, f := range files { fd, err := syscall.Dup(int(f.Fd())) if err != nil { - return 0, fmt.Errorf("failed to dup file: %v", err) + return fmt.Errorf("failed to dup file: %v", err) } f.Close() ioFDs = append(ioFDs, fd) @@ -453,24 +452,18 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config procArgs.Limits, k, cid); err != nil { - return 0, fmt.Errorf("failed to create new process: %v", err) + return fmt.Errorf("failed to create new process: %v", err) } ctx := procArgs.NewContext(l.k) mns := k.RootMountNamespace() if err := setExecutablePath(ctx, mns, &procArgs); err != nil { - return 0, fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) + return fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) } tg, err := l.k.CreateProcess(procArgs) if err != nil { - return 0, fmt.Errorf("failed to create process in sentry: %v", err) - } - - ts := l.k.TaskSet() - tgid := ts.Root.IDOfThreadGroup(tg) - if tgid == 0 { - return 0, errors.New("failed to get thread group ID of new process") + return fmt.Errorf("failed to create process in sentry: %v", err) } // CreateProcess takes a reference on FDMap if successful. @@ -478,9 +471,9 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGIDs[cid] = tgid + l.containerRootTGs[cid] = tg - return tgid, nil + return nil } // TODO: Per-container namespaces must be supported for -pid. @@ -490,53 +483,56 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. l.mu.Lock() - tgid, ok := l.containerRootTGIDs[cid] + tg, ok := l.containerRootTGs[cid] if !ok { defer l.mu.Unlock() - return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGIDs) + return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGs) } l.mu.Unlock() // If the thread either has already exited or exits during waiting, // consider the container exited. + // TODO: Multiple calls to waitContainer() should return + // the same exit status. defer func() { l.mu.Lock() defer l.mu.Unlock() // TODO: Containers don't map 1:1 with their root // processes. Container exits should be managed explicitly // rather than via PID. - delete(l.containerRootTGIDs, cid) + delete(l.containerRootTGs, cid) }() - return l.wait(tgid, cid, waitStatus) + l.wait(tg, waitStatus) + return nil } func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { // TODO: Containers all currently share a PID namespace. // When per-container PID namespaces are supported, wait should use cid // to find the appropriate PID namespace. 
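// Aside (hypothetical helper, not in the patch): the dup loop above exists
// because ownership of a descriptor cannot be taken away from an os.File, so
// each gofer file is duplicated and closed, leaving plain FDs behind. The
// same logic in isolation:
func dupAndClose(files []*os.File) ([]int, error) {
	var fds []int
	for _, f := range files {
		fd, err := syscall.Dup(int(f.Fd()))
		if err != nil {
			return nil, fmt.Errorf("failed to dup file: %v", err)
		}
		f.Close()
		fds = append(fds, fd)
	}
	return fds, nil
}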
- if cid != l.sandboxID { + /*if cid != l.sandboxID { return errors.New("non-sandbox PID namespaces are not yet implemented") + }*/ + // TODO: This won't work if the exec process already exited. + tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) + if tg == nil { + return fmt.Errorf("no thread group with ID %d", tgid) } - return l.wait(tgid, cid, waitStatus) + l.wait(tg, waitStatus) + return nil } // wait waits for the process with TGID 'tgid' in a container's PID namespace // to exit. -func (l *Loader) wait(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { - tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) - if tg == nil { - return fmt.Errorf("no thread group with ID %d", tgid) - } +func (l *Loader) wait(tg *kernel.ThreadGroup, waitStatus *uint32) { tg.WaitExited() *waitStatus = tg.ExitStatus().Status() - return nil } func (l *Loader) setRootContainerID(cid string) { l.mu.Lock() defer l.mu.Unlock() - // The root container has PID 1. - l.containerRootTGIDs = map[string]kernel.ThreadID{cid: 1} + l.containerRootTGs = map[string]*kernel.ThreadGroup{cid: l.k.GlobalInit()} l.sandboxID = cid } @@ -579,18 +575,15 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } +// TODO: Support sending signal to all. func (l *Loader) signal(cid string, signo int32) error { l.mu.Lock() - tgid, ok := l.containerRootTGIDs[cid] + tg, ok := l.containerRootTGs[cid] l.mu.Unlock() if !ok { return fmt.Errorf("failed to signal container %q: no such container", cid) } - // The thread group ID of a process is the leading task's thread ID. - t := l.k.TaskSet().Root.TaskWithID(tgid) - if t == nil { - return fmt.Errorf("cannot signal: no task with ID %d", tgid) - } - return t.SendSignal(&arch.SignalInfo{Signo: signo}) + si := arch.SignalInfo{Signo: signo} + return tg.Leader().SendSignal(&si) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 05014ba3d..7c2c3f59e 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -129,7 +129,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa log.Warningf("ignoring console socket since it cannot be restored") } - if err := cont.DestroyAndWait(); err != nil { + if err := cont.Destroy(); err != nil { Fatalf("error destroying container: %v", err) } diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index b20987b2c..caa44168b 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -95,10 +95,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } - log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) - if !c.IsRunning() { - Fatalf("sandbox %q is not running", c.Sandbox.ID) + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + Fatalf("container sandbox is not running") } + log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) if d.signal > 0 { log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid) diff --git a/runsc/container/container.go b/runsc/container/container.go index 38848d02f..792b7967b 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -136,13 +136,17 @@ func Load(rootDir, id string) (*Container, error) { // This is inherently racey. if c.Status == Running || c.Status == Created { // Check if the sandbox process is still running. - if c.IsRunning() { - // TODO: Send a message into the sandbox to - // see if this particular container is still running. 
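// Aside (condensed from the wait plumbing above, not a new API): waiting on
// an arbitrary TGID reduces to resolving the thread group in the root PID
// namespace, blocking in WaitExited, and reading back the exit status.
func waitTGID(k *kernel.Kernel, tgid kernel.ThreadID) (uint32, error) {
	tg := k.TaskSet().Root.ThreadGroupWithID(tgid)
	if tg == nil {
		return 0, fmt.Errorf("no thread group with ID %d", tgid)
	}
	tg.WaitExited()
	return tg.ExitStatus().Status(), nil
}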
- } else { + if !c.Sandbox.IsRunning() { // Sandbox no longer exists, so this container definitely does not exist. c.Status = Stopped c.Sandbox = nil + } else if c.Status == Running { + // Container state should reflect the actual state of + // the application, so we don't consider gofer process + // here. + if err := c.Signal(syscall.Signal(0)); err != nil { + c.Status = Stopped + } } } @@ -382,10 +386,12 @@ func (c *Container) Pid() int { } // Wait waits for the container to exit, and returns its WaitStatus. +// Call to wait on a stopped container is needed to retrieve the exit status +// and wait returns immediately. func (c *Container) Wait() (syscall.WaitStatus, error) { log.Debugf("Wait on container %q", c.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } return c.Sandbox.Wait(c.ID) } @@ -394,8 +400,8 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } return c.Sandbox.WaitPID(pid, c.Sandbox.ID) } @@ -404,29 +410,19 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { // its WaitStatus. func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in container %q", pid, c.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") - } - ws, err := c.Sandbox.WaitPID(pid, c.ID) - if err != nil { - return 0, err - } - if c.Sandbox.IsRootContainer(c.ID) { - // If waiting for the root, give some time for the sandbox process to exit - // to prevent races with resources that might still be in use. - if err := c.waitForStopped(); err != nil { - return 0, err - } + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } - return ws, nil + return c.Sandbox.WaitPID(pid, c.ID) } // Signal sends the signal to the container. +// Signal returns an error if the container is already stopped. +// TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal) error { log.Debugf("Signal container %q", c.ID) if c.Status == Stopped { - log.Warningf("container %q not running, not sending signal %v", c.ID, sig) - return nil + return fmt.Errorf("container sandbox is stopped") } // TODO: Query the container for its state, then save it. return c.Sandbox.Signal(c.ID, sig) @@ -437,8 +433,7 @@ func (c *Container) Signal(sig syscall.Signal) error { func (c *Container) Checkpoint(f *os.File) error { log.Debugf("Checkpoint container %q", c.ID) if c.Status == Stopped { - log.Warningf("container %q not running, not checkpointing", c.ID) - return nil + return fmt.Errorf("container sandbox is stopped") } return c.Sandbox.Checkpoint(c.ID, f) } @@ -496,93 +491,36 @@ func (c *Container) Processes() ([]*control.Process, error) { } // Destroy frees all resources associated with the container. +// Destroy returns error if any step fails, and the function can be safely retried. func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) - // First stop the container. 
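// Aside (illustrative, not in the patch): the "signal 0" probes used above,
// c.Signal(syscall.Signal(0)) for the container and syscall.Kill(pid, 0) for
// host processes, work because the kernel performs the existence and
// permission checks without delivering anything, so a nil error means the
// target is still there. The host-side check in standalone form:
func hostPIDAlive(pid int) bool {
	if pid == 0 {
		return false
	}
	// Note: EPERM would also indicate an existing process; the runsc code
	// treats only a nil error as "running", and this sketch follows suit.
	return syscall.Kill(pid, syscall.Signal(0)) == nil
}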
- if c.Sandbox != nil { - if err := c.Sandbox.Stop(c.ID); err != nil { - return err - } + if err := c.stop(); err != nil { + return fmt.Errorf("error stopping container: %v", err) } - // "If any poststop hook fails, the runtime MUST log a warning, but the - // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. - if c.Spec.Hooks != nil && (c.Status == Created || c.Status == Running) { - executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) - } - - // If we are the first container in the sandbox, take the sandbox down - // as well. - if c.Sandbox != nil && c.Sandbox.IsRootContainer(c.ID) { - if err := c.Sandbox.Destroy(); err != nil { - log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) - } - } - c.Status = Stopped - c.Sandbox = nil - - if err := c.destroyGofer(); err != nil { - return fmt.Errorf("error destroying gofer: %v", err) + if err := destroyFS(c.Spec); err != nil { + return fmt.Errorf("error destroying container fs: %v", err) } if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) } - return nil -} - -func (c *Container) destroyGofer() error { - if c.GoferPid != 0 { - log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) - if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { - log.Warningf("error sending signal %d to pid %d: %v", syscall.SIGKILL, c.GoferPid, err) - } - } - - // Gofer process may take some time to teardown. Retry in case of failure. - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) - err := backoff.Retry(func() error { return destroyFS(c.Spec) }, b) - if err == nil { - // Success! - c.GoferPid = 0 - } - return err -} - -// IsRunning returns true if the sandbox or gofer process is running. -func (c *Container) IsRunning() bool { - if c.Sandbox != nil && c.Sandbox.IsRunning() { - return true - } - if c.GoferPid != 0 { - // Send a signal 0 to the gofer process. - if err := syscall.Kill(c.GoferPid, 0); err == nil { - log.Warningf("Found orphan gofer process, pid: %d", c.GoferPid) - if err := c.destroyGofer(); err != nil { - log.Warningf("Error destroying gofer: %v", err) - } - - // Don't wait for gofer to die. Return 'running' and hope gofer is dead - // next time around. - return true - } + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. + // Based on the OCI, "The post-stop hooks MUST be called after the container is + // deleted but before the delete operation returns" + // Run it here to: + // 1) Conform to the OCI. + // 2) Make sure it only runs once, because the root has been deleted, the container + // can't be loaded again. + if c.Spec.Hooks != nil { + executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } - return false -} -// DestroyAndWait frees all resources associated with the container -// and waits for destroy to finish before returning. -// -// TODO: This only works for single container. -func (c *Container) DestroyAndWait() error { - if err := c.Destroy(); err != nil { - return fmt.Errorf("error destroying container %v: %v", c, err) - } - return c.waitForStopped() + c.Status = Stopped + return nil } // save saves the container metadata to a file. 
@@ -602,13 +540,49 @@ func (c *Container) save() error { return nil } +// stop stops the container (for regular containers) or the sandbox (for +// root containers), and waits for the container or sandbox and the gofer +// to stop. If any of them doesn't stop before timeout, an error is returned. +func (c *Container) stop() error { + if c.Sandbox != nil && c.Sandbox.IsRunning() { + log.Debugf("Killing container %q", c.ID) + if c.Sandbox.IsRootContainer(c.ID) { + if err := c.Sandbox.Destroy(); err != nil { + return fmt.Errorf("error destroying sandbox %q: %v", c.Sandbox.ID, err) + } + } else { + if err := c.Signal(syscall.SIGKILL); err != nil { + // The container may already be stopped, log the error. + log.Warningf("Error sending signal %d to container %q: %v", syscall.SIGKILL, c.ID, err) + } + } + } + + // Try killing gofer if it does not exit with container. + if c.GoferPid != 0 { + log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) + if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { + // The gofer may already be stopped, log the error. + log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err) + } + } + return c.waitForStopped() +} + func (c *Container) waitForStopped() error { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if !c.IsRunning() { - return fmt.Errorf("container is still running") + if c.Sandbox != nil && c.Sandbox.IsRunning() { + if err := c.Signal(syscall.Signal(0)); err == nil { + return fmt.Errorf("container is still running") + } + } + if c.GoferPid != 0 { + if err := syscall.Kill(c.GoferPid, 0); err == nil { + return fmt.Errorf("gofer is still running") + } } return nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 790334249..ab1823f1c 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -200,6 +200,7 @@ func run(spec *specs.Spec, conf *boot.Config) error { if err := s.Start(conf); err != nil { return fmt.Errorf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { return fmt.Errorf("error waiting on container: %v", err) @@ -251,6 +252,35 @@ func configs(opts ...configOption) []*boot.Config { return cs } +// In normal runsc usage, sandbox processes will be parented to +// init and init will reap the them. However, in the test environment +// the test runner is the parent and will not reap the sandbox +// processes, so we must do it ourselves, or else they will left +// as zombies. +// The function returns a wait group, and the caller can reap +// children synchronously by waiting on the wait group. +func reapChildren(c *Container) (*sync.WaitGroup, error) { + var wg sync.WaitGroup + p, err := os.FindProcess(c.Sandbox.Pid) + if err != nil { + return nil, fmt.Errorf("error finding sandbox process: %v", err) + } + g, err := os.FindProcess(c.GoferPid) + if err != nil { + return nil, fmt.Errorf("error finding gofer process: %v", err) + } + wg.Add(2) + go func() { + p.Wait() + wg.Done() + }() + go func() { + g.Wait() + wg.Done() + }() + return &wg, nil +} + // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. 
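// Aside (generalized from waitForStopped above; the helper name is
// illustrative): container and sandbox teardown now poll for termination on
// a constant 100ms interval bounded by a context deadline, via the
// github.com/cenkalti/backoff package.
func pollUntil(timeout time.Duration, check func() error) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
	return backoff.Retry(check, b)
}
//
// For example, the sandbox variant later in this change amounts to:
// pollUntil(5*time.Second, func() error {
// 	if s.IsRunning() {
// 		return fmt.Errorf("sandbox is still running")
// 	}
// 	return nil
// })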
@@ -306,6 +336,7 @@ func TestLifecycle(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + // Load the container from disk and check the status. s, err = Load(rootDir, id) if err != nil { @@ -352,10 +383,11 @@ func TestLifecycle(t *testing.T) { // and init will reap the sandbox. However, in this case the // test runner is the parent and will not reap the sandbox // process, so we must do it ourselves. - p, _ := os.FindProcess(s.Sandbox.Pid) - p.Wait() - g, _ := os.FindProcess(s.GoferPid) - g.Wait() + reapWg, err := reapChildren(s) + if err != nil { + t.Fatalf("error reaping children: %v", err) + } + reapWg.Wait() // Load the container from disk and check the status. s, err = Load(rootDir, id) @@ -1164,6 +1196,11 @@ func TestConsoleSocket(t *testing.T) { t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) } + // Reap the sandbox process. + if _, err := reapChildren(s); err != nil { + t.Fatalf("error reaping children: %v", err) + } + // Shut it down. if err := s.Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) @@ -1259,6 +1296,7 @@ func TestReadonlyRoot(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) @@ -1302,6 +1340,7 @@ func TestReadonlyMount(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) @@ -1547,8 +1586,9 @@ func TestGoferExits(t *testing.T) { if _, err := gofer.Wait(); err != nil { t.Fatalf("error waiting for gofer process: %v", err) } - if c.IsRunning() { - t.Errorf("container shouldn't be running, container: %+v", c) + + if err := c.waitForStopped(); err != nil { + t.Errorf("container is not stopped: %v", err) } } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 5cf8f0cda..7ae19ff35 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -23,8 +23,8 @@ go_library( "//runsc/boot", "//runsc/console", "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 8e90dcc70..156b2f769 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -16,6 +16,7 @@ package sandbox import ( + "context" "fmt" "os" "os/exec" @@ -23,8 +24,8 @@ import ( "syscall" "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/control/client" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" @@ -543,20 +544,18 @@ func (s *Sandbox) IsRootContainer(cid string) bool { return s.ID == cid } -// Stop stops the container in the sandbox. -func (s *Sandbox) Stop(cid string) error { - // TODO: This should stop the container with the given ID - // in the sandbox. - return nil -} - // Destroy frees all resources associated with the sandbox. +// Destroy returns error if any step fails, and the function can be safely retried. func (s *Sandbox) Destroy() error { log.Debugf("Destroy sandbox %q", s.ID) if s.Pid != 0 { - // TODO: Too harsh? 
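// Aside (usage sketch with an illustrative name, not part of the patch): the
// reapChildren test helper introduced above is armed while the sandbox and
// gofer PIDs are still known, the container is then torn down, and waiting
// on the returned WaitGroup keeps the test from leaving zombies behind.
func destroyAndReap(t *testing.T, c *Container) {
	reapWg, err := reapChildren(c)
	if err != nil {
		t.Fatalf("error reaping children: %v", err)
	}
	if err := c.Destroy(); err != nil {
		t.Fatalf("error destroying container: %v", err)
	}
	reapWg.Wait() // both the sandbox and the gofer have been collected
}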
log.Debugf("Killing sandbox %q", s.ID) - signalProcess(s.Pid, unix.SIGKILL) + if err := syscall.Kill(s.Pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH { + return fmt.Errorf("error killing sandbox %q PID %q: %v", s.ID, s.Pid, err) + } + if err := s.waitForStopped(); err != nil { + return fmt.Errorf("error waiting sandbox %q stop: %v", s.ID, err) + } } if s.Chroot != "" { @@ -641,7 +640,7 @@ func (s *Sandbox) Resume(cid string) error { func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { // Send a signal 0 to the sandbox process. - if err := signalProcess(s.Pid, 0); err == nil { + if err := syscall.Kill(s.Pid, 0); err == nil { // Succeeded, process is running. return true } @@ -665,14 +664,17 @@ func (s *Sandbox) Stacks() (string, error) { return stacks, nil } -// signalProcess sends a signal to the host process (i.e. a sandbox or gofer -// process). Sandbox.Signal should be used to send a signal to a process -// running inside the sandbox. -func signalProcess(pid int, sig syscall.Signal) error { - if err := syscall.Kill(pid, sig); err != nil { - return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) +func (s *Sandbox) waitForStopped() error { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + op := func() error { + if s.IsRunning() { + return fmt.Errorf("sandbox is still running") + } + return nil } - return nil + return backoff.Retry(op, b) } // deviceFileForPlatform opens the device file for the given platform. If the -- cgit v1.2.3 From 25add7b22b1b0b6a4bac1e72536d3f3a0c70f048 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 17 Sep 2018 11:30:16 -0700 Subject: runsc: Fix stdin/out/err in multi-container mode. Stdin/out/err weren't being sent to the sentry. PiperOrigin-RevId: 213307171 Change-Id: Ie4b634a58b1b69aa934ce8597e5cc7a47a2bcda2 --- runsc/boot/controller.go | 12 +++++++----- runsc/boot/fds.go | 13 ++++++++----- runsc/boot/fs.go | 12 ++++++------ runsc/boot/loader.go | 18 ++++++++++++------ runsc/sandbox/sandbox.go | 9 ++++++--- 5 files changed, 39 insertions(+), 25 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 69154ff23..4d41dcd6c 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -186,8 +186,10 @@ type StartArgs struct { // CID is the ID of the container to start. CID string - // FilePayload contains the file descriptor over which the sandbox will - // request files from its root filesystem. + // FilePayload contains, in order: + // * stdin, stdout, and stderr. + // * the file descriptor over which the sandbox will + // request files from its root filesystem. urpc.FilePayload } @@ -215,8 +217,8 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) } - if len(args.FilePayload.Files) == 0 { - return fmt.Errorf("start arguments must contain at least one file for the container root") + if len(args.FilePayload.Files) < 4 { + return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") } err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) @@ -339,7 +341,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. 
- fds := &fdDispenser{fds: cm.l.ioFDs} + fds := &fdDispenser{fds: cm.l.goferFDs} renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) if err != nil { return fmt.Errorf("error creating RestoreEnvironment: %v", err) diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 9de5a78b1..91c698fea 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "syscall" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -28,15 +27,19 @@ import ( // createFDMap creates an fd map that contains stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host fd. -func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) { +func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { + if len(stdioFDs) != 3 { + return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) + } + fdm := k.NewFDMap() defer fdm.DecRef() // Maps sandbox fd to host fd. fdMap := map[int]int{ - 0: syscall.Stdin, - 1: syscall.Stdout, - 2: syscall.Stderr, + 0: stdioFDs[0], + 1: stdioFDs[1], + 2: stdioFDs[2], } mounter := fs.FileOwnerFromContext(ctx) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 5ec9a7d03..45843fe7b 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -82,7 +82,7 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. -func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { +func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int) (*fs.MountNamespace, error) { mounts := compileMounts(spec) // Create a tmpfs mount where we create and mount a root filesystem for // each child container. @@ -90,7 +90,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec Type: tmpfs, Destination: childContainersDir, }) - fds := &fdDispenser{fds: ioFDs} + fds := &fdDispenser{fds: goferFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) @@ -595,13 +595,13 @@ func subtargets(root string, mnts []specs.Mount) []string { // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host // fd. 
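// Aside (hypothetical helper, not in the patch): the FilePayload convention
// established above is "stdio first, gofer FDs after", which is why Start
// now insists on at least four files. Making the split explicit:
func splitStartFiles(files []*os.File) (stdio, gofers []*os.File, err error) {
	if len(files) < 4 {
		return nil, nil, fmt.Errorf("payload must contain stdin, stdout, and stderr followed by at least one gofer file, got %d files", len(files))
	}
	return files[:3], files[3:], nil
}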
- fdm, err := createFDMap(ctx, k, ls, console) + fdm, err := createFDMap(ctx, k, ls, console, stdioFDs) if err != nil { return fmt.Errorf("error importing fds: %v", err) } @@ -625,7 +625,7 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe mns := k.RootMountNamespace() if mns == nil { // Create the virtual filesystem. - mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) + mns, err := createMountNamespace(ctx, rootCtx, spec, conf, goferFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } @@ -637,7 +637,7 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // Create the container's root filesystem mount. log.Infof("Creating new process in child container.") - fds := &fdDispenser{fds: append([]int{}, ioFDs...)} + fds := &fdDispenser{fds: append([]int{}, goferFDs...)} rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) if err != nil { return fmt.Errorf("error creating filesystem for container: %v", err) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 2ddb358bd..5e9ccb96f 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -77,8 +77,11 @@ type Loader struct { watchdog *watchdog.Watchdog - // ioFDs are the FDs that attach the sandbox to the gofers. - ioFDs []int + // stdioFDs contains stdin, stdout, and stderr. + stdioFDs []int + + // goferFDs are the FDs that attach the sandbox to the gofers. + goferFDs []int // spec is the base configuration for the root container. spec *specs.Spec @@ -121,7 +124,7 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. -func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int, console bool) (*Loader, error) { +func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { // Create kernel and platform. p, err := createPlatform(conf, deviceFD) if err != nil { @@ -252,7 +255,8 @@ func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int conf: conf, console: console, watchdog: watchdog, - ioFDs: ioFDs, + stdioFDs: []int{syscall.Stdin, syscall.Stdout, syscall.Stderr}, + goferFDs: goferFDs, spec: spec, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, @@ -364,7 +368,8 @@ func (l *Loader) run() error { &l.rootProcArgs, l.spec, l.conf, - l.ioFDs, + l.stdioFDs, + l.goferFDs, l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, @@ -446,7 +451,8 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config &procArgs, spec, conf, - ioFDs, + ioFDs[:3], // stdioFDs + ioFDs[3:], // goferFDs false, creds, procArgs.Limits, diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 156b2f769..8c4d0d495 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -100,8 +100,8 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { } // Start starts running a non-root container inside the sandbox. 
-func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles []*os.File) error { - for _, f := range ioFiles { +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error { + for _, f := range goferFiles { defer f.Close() } @@ -112,12 +112,15 @@ func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles } defer sandboxConn.Close() + // The payload must container stdin/stdout/stderr followed by gofer + // files. + files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...) // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: ioFiles}, + FilePayload: urpc.FilePayload{Files: files}, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) -- cgit v1.2.3 From bb88c187c5457df14fa78e5e6b6f48cbc90fb489 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 17 Sep 2018 16:24:05 -0700 Subject: runsc: Enable waiting on exited processes. This makes `runsc wait` behave more like waitpid()/wait4() in that: - Once a process has run to completion, you can wait on it and get its exit code. - Processes not waited on will consume memory (like a zombie process) PiperOrigin-RevId: 213358916 Change-Id: I5b5eca41ce71eea68e447380df8c38361a4d1558 --- pkg/sentry/control/proc.go | 14 ++-- pkg/sentry/kernel/kernel.go | 17 ++--- runsc/boot/controller.go | 33 +++------ runsc/boot/loader.go | 114 +++++++++++++++++++++++++++----- runsc/boot/loader_test.go | 25 +++---- runsc/cmd/exec.go | 14 +++- runsc/cmd/wait.go | 4 +- runsc/container/container.go | 8 +-- runsc/container/container_test.go | 4 +- runsc/container/multi_container_test.go | 94 ++++++++++++++++++++++++-- runsc/sandbox/sandbox.go | 7 +- 11 files changed, 248 insertions(+), 86 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 19bc76f5c..68d3b179b 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -87,7 +87,7 @@ type ExecArgs struct { // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { - newTG, err := proc.execAsync(args) + newTG, _, err := proc.execAsync(args) if err != nil { return err } @@ -100,11 +100,13 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. -func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, error) { +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { return proc.execAsync(args) } -func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { +// execAsync runs a new task, but doesn't wait for it to finish. It returns the +// newly created thread group and its PID. +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { // Import file descriptors. 
l := limits.NewLimitSet() fdm := proc.Kernel.NewFDMap() @@ -144,7 +146,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { paths := fs.GetPath(initArgs.Envv) f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } @@ -156,7 +158,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { // Import the given file FD. This dups the FD as well. file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) if err != nil { - return nil, err + return nil, 0, err } defer file.DecRef() @@ -164,7 +166,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { f.Close() if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { - return nil, err + return nil, 0, err } } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 316612b37..f71e32ac9 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -596,13 +596,13 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { // // CreateProcess has no analogue in Linux; it is used to create the initial // application task, as well as processes started by the control server. -func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { k.extMu.Lock() defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) if k.mounts == nil { - return nil, fmt.Errorf("no kernel MountNamespace") + return nil, 0, fmt.Errorf("no kernel MountNamespace") } tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) @@ -622,7 +622,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { var err error wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals) if err != nil { - return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef() } @@ -630,10 +630,10 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { if args.Filename == "" { // Was anything provided? if len(args.Argv) == 0 { - return nil, fmt.Errorf("no filename or command provided") + return nil, 0, fmt.Errorf("no filename or command provided") } if !filepath.IsAbs(args.Argv[0]) { - return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) } args.Filename = args.Argv[0] } @@ -641,7 +641,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { // Create a fresh task context. tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) if err != nil { - return nil, err + return nil, 0, err } // Take a reference on the FDMap, which will be transferred to @@ -663,17 +663,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { } t, err := k.tasks.NewTask(config) if err != nil { - return nil, err + return nil, 0, err } // Success. 
+ tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.started { tid := k.tasks.Root.IDOfTask(t) t.Start(tid) } else if k.globalInit == nil { k.globalInit = tg } - return tg, nil + return tg, tgid, nil } // Start starts execution of all tasks in k. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 4d41dcd6c..dc9359092 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -242,32 +242,11 @@ type ExecArgs struct { // returns the pid of the new process. func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { log.Debugf("containerManager.ExecuteAsync: %+v", args) - - // Get the container Root Dirent from the Task, since we must run this - // process with the same Root. - cm.l.mu.Lock() - tg, ok := cm.l.containerRootTGs[args.CID] - cm.l.mu.Unlock() - if !ok { - return fmt.Errorf("cannot exec in container %q: no such container", args.CID) - } - tg.Leader().WithMuLocked(func(t *kernel.Task) { - args.Root = t.FSContext().RootDirectory() - }) - if args.Root != nil { - defer args.Root.DecRef() - } - - // Start the process. - proc := control.Proc{Kernel: cm.l.k} - newTG, err := control.ExecAsync(&proc, &args.ExecArgs) + tgid, err := cm.l.executeAsync(&args.ExecArgs, args.CID) if err != nil { - return fmt.Errorf("error executing: %+v: %v", args, err) + return err } - - // Return the pid of the newly-created process. - ts := cm.l.k.TaskSet() - *pid = int32(ts.Root.IDOfThreadGroup(newTG)) + *pid = int32(tgid) return nil } @@ -409,12 +388,16 @@ type WaitPIDArgs struct { // CID is the container ID. CID string + + // ClearStatus determines whether the exit status of the process should + // be cleared when WaitPID returns. + ClearStatus bool } // WaitPID waits for the process with PID 'pid' in the sandbox. func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { log.Debugf("containerManager.Wait") - return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus) } // SignalArgs are arguments to the Signal method. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 5e9ccb96f..665240ab6 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -103,7 +104,7 @@ type Loader struct { // sandboxID is the ID for the whole sandbox. sandboxID string - // mu guards containerRootTGs. + // mu guards containerRootTGs and execProcesses. mu sync.Mutex // containerRootTGs maps container IDs to their root processes. It @@ -111,7 +112,24 @@ type Loader struct { // call methods on particular containers. // // containerRootTGs is guarded by mu. + // + // TODO: When containers are removed via `runsc delete`, + // containerRootTGs should be cleaned up. containerRootTGs map[string]*kernel.ThreadGroup + + // execProcesses maps each invocation of exec to the process it spawns. + // + // execProcesses is guardded by mu. + // + // TODO: When containers are removed via `runsc delete`, + // execProcesses should be cleaned up. + execProcesses map[execID]*kernel.ThreadGroup +} + +// execID uniquely identifies a sentry process. 
+type execID struct { + cid string + pid kernel.ThreadID } func init() { @@ -385,7 +403,8 @@ func (l *Loader) run() error { } // Create the root container init task. - if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { + _, _, err := l.k.CreateProcess(l.rootProcArgs) + if err != nil { return fmt.Errorf("failed to create init process: %v", err) } @@ -393,6 +412,11 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } + if l.execProcesses != nil { + return fmt.Errorf("there shouldn't already be a cache of exec'd processes, but found: %v", l.execProcesses) + } + l.execProcesses = make(map[execID]*kernel.ThreadGroup) + // Start signal forwarding only after an init process is created. l.stopSignalForwarding = l.startSignalForwarding() @@ -467,7 +491,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) } - tg, err := l.k.CreateProcess(procArgs) + tg, _, err := l.k.CreateProcess(procArgs) if err != nil { return fmt.Errorf("failed to create process in sentry: %v", err) } @@ -482,6 +506,40 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return nil } +func (l *Loader) executeAsync(args *control.ExecArgs, cid string) (kernel.ThreadID, error) { + // Get the container Root Dirent from the Task, since we must run this + // process with the same Root. + l.mu.Lock() + tg, ok := l.containerRootTGs[cid] + l.mu.Unlock() + if !ok { + return 0, fmt.Errorf("cannot exec in container %q: no such container", cid) + } + tg.Leader().WithMuLocked(func(t *kernel.Task) { + args.Root = t.FSContext().RootDirectory() + }) + if args.Root != nil { + defer args.Root.DecRef() + } + + // Start the process. + proc := control.Proc{Kernel: l.k} + tg, tgid, err := control.ExecAsync(&proc, args) + if err != nil { + return 0, fmt.Errorf("error executing: %+v: %v", args, err) + } + + // Insert the process into execProcesses so that we can wait on it + // later. + l.mu.Lock() + defer l.mu.Unlock() + eid := execID{cid: cid, pid: tgid} + l.execProcesses[eid] = tg + log.Debugf("updated execProcesses: %v", l.execProcesses) + + return tgid, nil +} + // TODO: Per-container namespaces must be supported for -pid. // waitContainer waits for the root process of a container to exit. @@ -500,39 +558,59 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // consider the container exited. // TODO: Multiple calls to waitContainer() should return // the same exit status. - defer func() { - l.mu.Lock() - defer l.mu.Unlock() - // TODO: Containers don't map 1:1 with their root - // processes. Container exits should be managed explicitly - // rather than via PID. - delete(l.containerRootTGs, cid) - }() - l.wait(tg, waitStatus) + ws := l.wait(tg) + *waitStatus = ws + + l.mu.Lock() + defer l.mu.Unlock() + delete(l.containerRootTGs, cid) + return nil } -func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error { // TODO: Containers all currently share a PID namespace. // When per-container PID namespaces are supported, wait should use cid // to find the appropriate PID namespace. /*if cid != l.sandboxID { return errors.New("non-sandbox PID namespaces are not yet implemented") }*/ - // TODO: This won't work if the exec process already exited. 
- tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) + + // If the process was started via runsc exec, it will have an + // entry in l.execProcesses. + l.mu.Lock() + eid := execID{cid: cid, pid: tgid} + tg, ok := l.execProcesses[eid] + l.mu.Unlock() + if ok { + ws := l.wait(tg) + *waitStatus = ws + if clearStatus { + // Remove tg from the cache. + l.mu.Lock() + delete(l.execProcesses, eid) + log.Debugf("updated execProcesses (removal): %v", l.execProcesses) + l.mu.Unlock() + } + return nil + } + + // This process wasn't created by runsc exec or start, so just find it + // by pid and hope it hasn't exited yet. + tg = l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) if tg == nil { return fmt.Errorf("no thread group with ID %d", tgid) } - l.wait(tg, waitStatus) + ws := l.wait(tg) + *waitStatus = ws return nil } // wait waits for the process with TGID 'tgid' in a container's PID namespace // to exit. -func (l *Loader) wait(tg *kernel.ThreadGroup, waitStatus *uint32) { +func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { tg.WaitExited() - *waitStatus = tg.ExitStatus().Status() + return tg.ExitStatus().Status() } func (l *Loader) setRootContainerID(cid string) { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 9398292ff..a8a796445 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -111,11 +111,11 @@ func createLoader() (*Loader, func(), error) { // TestRun runs a simple application in a sandbox and checks that it succeeds. func TestRun(t *testing.T) { - s, cleanup, err := createLoader() + l, cleanup, err := createLoader() if err != nil { t.Fatalf("error creating loader: %v", err) } - defer s.Destroy() + defer l.Destroy() defer cleanup() // Start a goroutine to read the start chan result, otherwise Run will @@ -124,12 +124,13 @@ func TestRun(t *testing.T) { var wg sync.WaitGroup wg.Add(1) go func() { - resultChanErr = <-s.ctrl.manager.startResultChan + resultChanErr = <-l.ctrl.manager.startResultChan wg.Done() }() - // Run the container.. - if err := s.Run(); err != nil { + // Run the container. + l.setRootContainerID("foo") + if err := l.Run(); err != nil { t.Errorf("error running container: %v", err) } @@ -140,7 +141,7 @@ func TestRun(t *testing.T) { } // Wait for the application to exit. It should succeed. - if status := s.WaitExit(); status.Code != 0 || status.Signo != 0 { + if status := l.WaitExit(); status.Code != 0 || status.Signo != 0 { t.Errorf("application exited with status %+v, want 0", status) } } @@ -148,24 +149,24 @@ func TestRun(t *testing.T) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. func TestStartSignal(t *testing.T) { - s, cleanup, err := createLoader() + l, cleanup, err := createLoader() if err != nil { t.Fatalf("error creating loader: %v", err) } - defer s.Destroy() + defer l.Destroy() defer cleanup() // We aren't going to wait on this application, so the control server // needs to be shut down manually. - defer s.ctrl.srv.Stop() + defer l.ctrl.srv.Stop() // Start a goroutine that calls WaitForStartSignal and writes to a // channel when it returns. waitFinished := make(chan struct{}) go func() { - s.WaitForStartSignal() + l.WaitForStartSignal() // Pretend that Run() executed and returned no error. - s.ctrl.manager.startResultChan <- nil + l.ctrl.manager.startResultChan <- nil waitFinished <- struct{}{} }() @@ -181,7 +182,7 @@ func TestStartSignal(t *testing.T) { // Trigger the control server StartRoot method. 
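// Aside (condensed restatement with an illustrative name, not a new API):
// the exec-wait path above parks every exec'd thread group in execProcesses,
// keyed by (container ID, PID), so its exit status remains collectable after
// it exits, much like a zombie entry; clearStatus decides whether the entry
// is reaped once it has been read.
func (l *Loader) waitExec(cid string, pid kernel.ThreadID, clearStatus bool) (uint32, bool) {
	eid := execID{cid: cid, pid: pid}
	l.mu.Lock()
	tg, ok := l.execProcesses[eid]
	l.mu.Unlock()
	if !ok {
		return 0, false
	}
	ws := l.wait(tg) // blocks until the thread group has exited
	if clearStatus {
		l.mu.Lock()
		delete(l.execProcesses, eid)
		l.mu.Unlock()
	}
	return ws, true
}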
cid := "foo" - if err := s.ctrl.manager.StartRoot(&cid, nil); err != nil { + if err := l.ctrl.manager.StartRoot(&cid, nil); err != nil { t.Errorf("error calling StartRoot: %v", err) } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 0d1fa6e20..957c4f0ff 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -49,6 +49,7 @@ type Exec struct { extraKGIDs stringSlice caps stringSlice detach bool + clearStatus bool processPath string pidFile string internalPidFile string @@ -100,6 +101,9 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") + + // clear-status is expected to only be set when we fork due to --detach being set. + f.BoolVar(&ex.clearStatus, "clear-status", true, "clear the status of the exec'd process upon completion") } // Execute implements subcommands.Command.Execute. It starts a process in an @@ -163,7 +167,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Wait for the process to exit. - ws, err := c.WaitPID(pid) + ws, err := c.WaitPID(pid, ex.clearStatus) if err != nil { Fatalf("error waiting on pid %d: %v", pid, err) } @@ -194,10 +198,16 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // Add the rest of the args, excluding the "detach" flag. for _, a := range os.Args[1:] { - if !strings.Contains(a, "detach") { + if strings.Contains(a, "detach") { + // Replace with the "clear-status" flag, which tells + // the new process it's a detached child and shouldn't + // clear the exit status of the sentry process. + args = append(args, "--clear-status=false") + } else { args = append(args, a) } } + cmd := exec.Command(binPath, args...) // Exec stdio defaults to current process stdio. diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index b41edc725..956349140 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) waitStatus = ws // Wait on a PID in the root PID namespace. case wt.rootPID != unsetPID: - ws, err := c.WaitRootPID(int32(wt.rootPID)) + ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */) if err != nil { Fatalf("error waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) } waitStatus = ws // Wait on a PID in the container's PID namespace. case wt.pid != unsetPID: - ws, err := c.WaitPID(int32(wt.pid)) + ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */) if err != nil { Fatalf("error waiting on PID %d in container %q: %v", wt.pid, c.ID, err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 792b7967b..a24c6cc31 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -398,22 +398,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. 
-func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { +func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) if c.Sandbox == nil || !c.Sandbox.IsRunning() { return 0, fmt.Errorf("container sandbox is not running") } - return c.Sandbox.WaitPID(pid, c.Sandbox.ID) + return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus) } // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. -func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { +func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in container %q", pid, c.ID) if c.Sandbox == nil || !c.Sandbox.IsRunning() { return 0, fmt.Errorf("container sandbox is not running") } - return c.Sandbox.WaitPID(pid, c.ID) + return c.Sandbox.WaitPID(c.ID, pid, clearStatus) } // Signal sends the signal to the container. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index ab1823f1c..5fe80f20f 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -551,7 +551,7 @@ func TestExec(t *testing.T) { args := &control.ExecArgs{ Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, + Argv: []string{"/bin/sleep", "5"}, WorkingDirectory: "/", KUID: uid, } @@ -1598,7 +1598,7 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, if err != nil { return 0, fmt.Errorf("error executing: %v", err) } - ws, err := cont.WaitPID(pid) + ws, err := cont.WaitPID(pid, true /* clearStatus */) if err != nil { return 0, fmt.Errorf("error waiting: %v", err) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 84e0ec080..09888cb86 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -163,16 +163,15 @@ func TestMultiContainerWait(t *testing.T) { go func(c *Container) { defer wg.Done() const pid = 2 - if ws, err := c.WaitPID(pid); err != nil { + if ws, err := c.WaitPID(pid, true /* clearStatus */); err != nil { t.Errorf("failed to wait for PID %d: %v", pid, err) } else if es := ws.ExitStatus(); es != 0 { t.Errorf("PID %d exited with non-zero status %d", pid, es) } - if _, err := c.WaitPID(pid); err == nil { + if _, err := c.WaitPID(pid, true /* clearStatus */); err == nil { t.Errorf("wait for stopped PID %d should fail", pid) } - // TODO: use 'container[1]' when PID namespace is supported. - }(containers[0]) + }(containers[1]) } wg.Wait() @@ -184,6 +183,93 @@ func TestMultiContainerWait(t *testing.T) { } } +// TestExecWait ensures what we can wait containers and individual processes in the +// sandbox that have already exited. +func TestExecWait(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // The first container should run the entire duration of the test. + cmd1 := []string{"sleep", "100"} + // We'll wait on the second container, which is much shorter lived. + cmd2 := []string{"sleep", "1"} + specs, ids := createSpecs(cmd1, cmd2) + + // Setup the containers. 
+ var containers []*Container + for i, spec := range specs { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Fatalf("failed to wait for sleep to start: %v", err) + } + + // Wait for the second container to finish. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Fatalf("failed to wait for second container to stop: %v", err) + } + + // Get the second container exit status. + if ws, err := containers[1].Wait(); err != nil { + t.Fatalf("failed to wait for process %s: %v", containers[1].Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Fatalf("process %s exited with non-zero status %d", containers[1].Spec.Process.Args, es) + } + if _, err := containers[1].Wait(); err == nil { + t.Fatalf("wait for stopped process %s should fail", containers[1].Spec.Process.Args) + } + + // Execute another process in the first container. + args := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "1"}, + WorkingDirectory: "/", + KUID: 0, + } + pid, err := containers[0].Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } + + // Wait for the exec'd process to exit. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Fatalf("failed to wait for second container to stop: %v", err) + } + + // Get the exit status from the exec'd process. + if ws, err := containers[0].WaitPID(pid, true /* clearStatus */); err != nil { + t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Fatalf("process %+v exited with non-zero status %d", args, es) + } + if _, err := containers[0].WaitPID(pid, true /* clearStatus */); err == nil { + t.Fatalf("wait for stopped process %+v should fail", args) + } +} + // TestMultiContainerMount tests that bind mounts can be used with multiple // containers. func TestMultiContainerMount(t *testing.T) { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 8c4d0d495..3b10fd20e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -522,7 +522,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { // WaitPID waits for process 'pid' in the container's sandbox and returns its // WaitStatus. 
-func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { +func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus conn, err := s.sandboxConnect() @@ -532,8 +532,9 @@ func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { defer conn.Close() args := &boot.WaitPIDArgs{ - PID: pid, - CID: cid, + PID: pid, + CID: cid, + ClearStatus: clearStatus, } if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("error waiting on PID %d in sandbox %q: %v", pid, s.ID, err) -- cgit v1.2.3 From 26b08e182cd08371f14dc58fd54ed7865452cea7 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 17 Sep 2018 21:17:31 -0700 Subject: Rename container in test 's' used to stand for sandbox, before container exited. PiperOrigin-RevId: 213390641 Change-Id: I7bda94a50398c46721baa92227e32a7a1d817412 --- runsc/container/container_test.go | 54 +++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 5fe80f20f..24beb2b75 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -192,16 +192,16 @@ func run(spec *specs.Spec, conf *boot.Config) error { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { return fmt.Errorf("error creating container: %v", err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { + defer c.Destroy() + if err := c.Start(conf); err != nil { return fmt.Errorf("error starting container: %v", err) } - ws, err := s.Wait() + ws, err := c.Wait() if err != nil { return fmt.Errorf("error waiting on container: %v", err) } @@ -315,11 +315,11 @@ func TestLifecycle(t *testing.T) { } // Load the container from disk and check the status. - s, err := Load(rootDir, id) + c, err := Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } - if got, want := s.Status, Created; got != want { + if got, want := c.Status, Created; got != want { t.Errorf("container status got %v, want %v", got, want) } @@ -333,21 +333,21 @@ func TestLifecycle(t *testing.T) { } // Start the container. - if err := s.Start(conf); err != nil { + if err := c.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } // Load the container from disk and check the status. - s, err = Load(rootDir, id) + c, err = Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } - if got, want := s.Status, Running; got != want { + if got, want := c.Status, Running; got != want { t.Errorf("container status got %v, want %v", got, want) } // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL); err != nil { + if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) } @@ -357,7 +357,7 @@ func TestLifecycle(t *testing.T) { ch := make(chan struct{}) go func() { ch <- struct{}{} - ws, err := s.Wait() + ws, err := c.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) } @@ -372,7 +372,7 @@ func TestLifecycle(t *testing.T) { <-ch time.Sleep(100 * time.Millisecond) // Send the container a SIGTERM which will cause it to stop. 
- if err := s.Signal(syscall.SIGTERM); err != nil { + if err := c.Signal(syscall.SIGTERM); err != nil { t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) } // Wait for it to die. @@ -383,23 +383,23 @@ func TestLifecycle(t *testing.T) { // and init will reap the sandbox. However, in this case the // test runner is the parent and will not reap the sandbox // process, so we must do it ourselves. - reapWg, err := reapChildren(s) + reapWg, err := reapChildren(c) if err != nil { t.Fatalf("error reaping children: %v", err) } reapWg.Wait() // Load the container from disk and check the status. - s, err = Load(rootDir, id) + c, err = Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } - if got, want := s.Status, Stopped; got != want { + if got, want := c.Status, Stopped; got != want { t.Errorf("container status got %v, want %v", got, want) } // Destroy the container. - if err := s.Destroy(); err != nil { + if err := c.Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) } @@ -1160,7 +1160,7 @@ func TestConsoleSocket(t *testing.T) { // Create the container and pass the socket name. id := testutil.UniqueContainerID() - s, err := Create(id, spec, conf, bundleDir, socketRelPath, "") + c, err := Create(id, spec, conf, bundleDir, socketRelPath, "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1197,12 +1197,12 @@ func TestConsoleSocket(t *testing.T) { } // Reap the sandbox process. - if _, err := reapChildren(s); err != nil { + if _, err := reapChildren(c); err != nil { t.Fatalf("error reaping children: %v", err) } // Shut it down. - if err := s.Destroy(); err != nil { + if err := c.Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) } @@ -1288,16 +1288,16 @@ func TestReadonlyRoot(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { + defer c.Destroy() + if err := c.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } - ws, err := s.Wait() + ws, err := c.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) } @@ -1332,16 +1332,16 @@ func TestReadonlyMount(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { + defer c.Destroy() + if err := c.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } - ws, err := s.Wait() + ws, err := c.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) } -- cgit v1.2.3 From 5d9816be41a967fa1fa9bbbe0c638dd322c7c0b1 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 17 Sep 2018 21:33:51 -0700 Subject: Remove memory usage static init panic() during init() can be hard to debug. 
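The diff that follows replaces a panicking package-level init with an explicit usage.Init() that returns an error for the loader to surface. As a generic sketch of that pattern only — the package, type, and function names below are illustrative stand-ins, not the real gVisor API:

```go
package main

import (
	"fmt"
	"log"
)

// MemoryAccounting is populated by Init rather than by a package init()
// function, so a setup failure surfaces as an error instead of a panic.
var MemoryAccounting *Accounting

// Accounting is an illustrative stand-in for the real accounting state.
type Accounting struct{}

// Init builds the global object and hands any setup failure back to the
// caller, which can wrap it with context (as loader.New does in the diff).
func Init() error {
	acct, err := newAccounting()
	if err != nil {
		return fmt.Errorf("setting up memory accounting: %v", err)
	}
	MemoryAccounting = acct
	return nil
}

func newAccounting() (*Accounting, error) {
	// The real implementation creates and maps a memfd; elided here.
	return &Accounting{}, nil
}

func main() {
	if err := Init(); err != nil {
		log.Fatalf("boot failed: %v", err) // reported to the caller, not panicked during package init
	}
}
```

Callers check the error once at startup, which is much easier to debug than a panic raised while packages are still initializing.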
Updates #100 PiperOrigin-RevId: 213391932 Change-Id: Ic103f1981c5b48f1e12da3b42e696e84ffac02a9 --- pkg/sentry/usage/memory.go | 16 +++++++++------- runsc/boot/loader.go | 4 ++++ 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 4a1527b5f..f13a77779 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -117,15 +117,16 @@ type MemoryLocked struct { File *os.File } -func newMemoryLocked() MemoryLocked { - name := "memory-usage" +// Init initializes global 'MemoryAccounting'. +func Init() error { + const name = "memory-usage" fd, err := memutil.CreateMemFD(name, 0) if err != nil { - panic("error creating usage file: " + err.Error()) + return fmt.Errorf("error creating usage file: %v", err) } file := os.NewFile(uintptr(fd), name) if err := file.Truncate(int64(RTMemoryStatsSize)); err != nil { - panic("error truncating usage file: " + err.Error()) + return fmt.Errorf("error truncating usage file: %v", err) } // Note: We rely on the returned page being initially zeroed. This will // always be the case for a newly mapped page from /dev/shm. If we obtain @@ -133,13 +134,14 @@ func newMemoryLocked() MemoryLocked { // explicitly zero the page. mmap, err := syscall.Mmap(int(file.Fd()), 0, int(RTMemoryStatsSize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) if err != nil { - panic("error mapping usage file: " + err.Error()) + return fmt.Errorf("error mapping usage file: %v", err) } - return MemoryLocked{ + MemoryAccounting = &MemoryLocked{ File: file, RTMemoryStats: RTMemoryStatsPointer(mmap), } + return nil } // MemoryAccounting is the global memory stats. @@ -147,7 +149,7 @@ func newMemoryLocked() MemoryLocked { // There is no need to save or restore the global memory accounting object, // because individual frame kinds are saved and charged only when they become // resident. -var MemoryAccounting = newMemoryLocked() +var MemoryAccounting *MemoryLocked func (m *MemoryLocked) incLocked(val uint64, kind MemoryKind) { switch kind { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 665240ab6..faaf3e800 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -42,6 +42,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" @@ -143,6 +144,9 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { + if err := usage.Init(); err != nil { + return nil, fmt.Errorf("Error setting up memory usage: %v", err) + } // Create kernel and platform. 
p, err := createPlatform(conf, deviceFD) if err != nil { -- cgit v1.2.3 From 7e00f3705480313a63c9db7d087db711abc720bc Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 18 Sep 2018 13:21:13 -0700 Subject: Automated rollback of changelist 213307171 PiperOrigin-RevId: 213504354 Change-Id: Iadd42f0ca4b7e7a9eae780bee9900c7233fb4f3f --- runsc/boot/controller.go | 12 +++++------- runsc/boot/fds.go | 13 +++++-------- runsc/boot/fs.go | 12 ++++++------ runsc/boot/loader.go | 18 ++++++------------ runsc/sandbox/sandbox.go | 9 +++------ 5 files changed, 25 insertions(+), 39 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index dc9359092..f5b0d371c 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -186,10 +186,8 @@ type StartArgs struct { // CID is the ID of the container to start. CID string - // FilePayload contains, in order: - // * stdin, stdout, and stderr. - // * the file descriptor over which the sandbox will - // request files from its root filesystem. + // FilePayload contains the file descriptor over which the sandbox will + // request files from its root filesystem. urpc.FilePayload } @@ -217,8 +215,8 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) } - if len(args.FilePayload.Files) < 4 { - return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + if len(args.FilePayload.Files) == 0 { + return fmt.Errorf("start arguments must contain at least one file for the container root") } err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) @@ -320,7 +318,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - fds := &fdDispenser{fds: cm.l.goferFDs} + fds := &fdDispenser{fds: cm.l.ioFDs} renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) if err != nil { return fmt.Errorf("error creating RestoreEnvironment: %v", err) diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 91c698fea..9de5a78b1 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -16,6 +16,7 @@ package boot import ( "fmt" + "syscall" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -27,19 +28,15 @@ import ( // createFDMap creates an fd map that contains stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host fd. -func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { - if len(stdioFDs) != 3 { - return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) - } - +func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) { fdm := k.NewFDMap() defer fdm.DecRef() // Maps sandbox fd to host fd. 
fdMap := map[int]int{ - 0: stdioFDs[0], - 1: stdioFDs[1], - 2: stdioFDs[2], + 0: syscall.Stdin, + 1: syscall.Stdout, + 2: syscall.Stderr, } mounter := fs.FileOwnerFromContext(ctx) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 45843fe7b..5ec9a7d03 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -82,7 +82,7 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. -func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int) (*fs.MountNamespace, error) { +func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { mounts := compileMounts(spec) // Create a tmpfs mount where we create and mount a root filesystem for // each child container. @@ -90,7 +90,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec Type: tmpfs, Destination: childContainersDir, }) - fds := &fdDispenser{fds: goferFDs} + fds := &fdDispenser{fds: ioFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) @@ -595,13 +595,13 @@ func subtargets(root string, mnts []specs.Mount) []string { // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host // fd. - fdm, err := createFDMap(ctx, k, ls, console, stdioFDs) + fdm, err := createFDMap(ctx, k, ls, console) if err != nil { return fmt.Errorf("error importing fds: %v", err) } @@ -625,7 +625,7 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe mns := k.RootMountNamespace() if mns == nil { // Create the virtual filesystem. - mns, err := createMountNamespace(ctx, rootCtx, spec, conf, goferFDs) + mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } @@ -637,7 +637,7 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // Create the container's root filesystem mount. log.Infof("Creating new process in child container.") - fds := &fdDispenser{fds: append([]int{}, goferFDs...)} + fds := &fdDispenser{fds: append([]int{}, ioFDs...)} rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) if err != nil { return fmt.Errorf("error creating filesystem for container: %v", err) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index faaf3e800..623d04171 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -79,11 +79,8 @@ type Loader struct { watchdog *watchdog.Watchdog - // stdioFDs contains stdin, stdout, and stderr. - stdioFDs []int - - // goferFDs are the FDs that attach the sandbox to the gofers. 
- goferFDs []int + // ioFDs are the FDs that attach the sandbox to the gofers. + ioFDs []int // spec is the base configuration for the root container. spec *specs.Spec @@ -143,7 +140,7 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. -func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { +func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int, console bool) (*Loader, error) { if err := usage.Init(); err != nil { return nil, fmt.Errorf("Error setting up memory usage: %v", err) } @@ -277,8 +274,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs [] conf: conf, console: console, watchdog: watchdog, - stdioFDs: []int{syscall.Stdin, syscall.Stdout, syscall.Stderr}, - goferFDs: goferFDs, + ioFDs: ioFDs, spec: spec, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, @@ -390,8 +386,7 @@ func (l *Loader) run() error { &l.rootProcArgs, l.spec, l.conf, - l.stdioFDs, - l.goferFDs, + l.ioFDs, l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, @@ -479,8 +474,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config &procArgs, spec, conf, - ioFDs[:3], // stdioFDs - ioFDs[3:], // goferFDs + ioFDs, false, creds, procArgs.Limits, diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 3b10fd20e..21625a7c6 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -100,8 +100,8 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { } // Start starts running a non-root container inside the sandbox. -func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error { - for _, f := range goferFiles { +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles []*os.File) error { + for _, f := range ioFiles { defer f.Close() } @@ -112,15 +112,12 @@ func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, goferFi } defer sandboxConn.Close() - // The payload must container stdin/stdout/stderr followed by gofer - // files. - files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...) // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: files}, + FilePayload: urpc.FilePayload{Files: ioFiles}, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) -- cgit v1.2.3 From 7967d8ecd57383f406d202f7e2f65e275bb36fc8 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 18 Sep 2018 15:20:19 -0700 Subject: Handle children processes better in tests Reap children more systematically in container tests. Previously, container_test was taking ~5 mins to run because constainer.Destroy() would timeout waiting for the sandbox process to exit. Now the test running in less than a minute. Also made the contract around Container and Sandbox destroy clearer. 
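The diff below introduces testutil.StartReaper and testutil.RetryEintr and rewires the tests around them. The following is only a sketch of how a test package might wire those helpers, written so the reaper is stopped explicitly before os.Exit (which would otherwise skip deferred calls); the package name is illustrative:

```go
package container_test

import (
	"os"
	"testing"

	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)

func TestMain(m *testing.M) {
	// Re-exec the test as root inside a user namespace if CAP_SYS_ADMIN
	// is missing; returns only once running with the needed privileges.
	testutil.RunAsRoot()

	// Reap every child (sandbox and gofer processes) as it exits so the
	// tests are not left with zombies and Destroy() does not time out.
	stopReaper := testutil.StartReaper()

	code := m.Run()
	stopReaper()
	os.Exit(code)
}
```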
PiperOrigin-RevId: 213527471 Change-Id: Icca84ee1212bbdcb62bdfc9cc7b71b12c6d1688d --- runsc/cmd/checkpoint.go | 1 + runsc/container/container.go | 14 +++--- runsc/container/container_test.go | 79 +++++++-------------------------- runsc/container/multi_container_test.go | 59 +++++++++++++----------- runsc/sandbox/sandbox.go | 3 +- runsc/test/testutil/testutil.go | 45 +++++++++++++++++-- 6 files changed, 99 insertions(+), 102 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 7c2c3f59e..d074b8617 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -137,6 +137,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa if err != nil { Fatalf("error restoring container: %v", err) } + defer cont.Destroy() if err := cont.Restore(spec, conf, fullImagePath); err != nil { Fatalf("error starting container: %v", err) diff --git a/runsc/container/container.go b/runsc/container/container.go index a24c6cc31..9bf2f4625 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -198,7 +198,8 @@ func List(rootDir string) ([]string, error) { } // Create creates the container in a new Sandbox process, unless the metadata -// indicates that an existing Sandbox should be used. +// indicates that an existing Sandbox should be used. The caller must call +// Destroy() on the container. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (*Container, error) { log.Debugf("Create container %q in root dir: %s", id, conf.RootDir) if err := validateID(id); err != nil { @@ -295,14 +296,12 @@ func (c *Container) Start(conf *boot.Config) error { // stop and destroy the container" -OCI spec. if c.Spec.Hooks != nil { if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { - c.Destroy() return err } } if specutils.ShouldCreateSandbox(c.Spec) || !conf.MultiContainer { if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil { - c.Destroy() return err } } else { @@ -312,7 +311,6 @@ func (c *Container) Start(conf *boot.Config) error { return err } if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { - c.Destroy() return err } } @@ -351,6 +349,8 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke if err != nil { return 0, fmt.Errorf("error creating container: %v", err) } + defer c.Destroy() + if err := c.Start(conf); err != nil { return 0, fmt.Errorf("error starting container: %v", err) } @@ -420,7 +420,7 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // Signal returns an error if the container is already stopped. // TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal) error { - log.Debugf("Signal container %q", c.ID) + log.Debugf("Signal container %q: %v", c.ID, sig) if c.Status == Stopped { return fmt.Errorf("container sandbox is stopped") } @@ -490,8 +490,8 @@ func (c *Container) Processes() ([]*control.Process, error) { return c.Sandbox.Processes(c.ID) } -// Destroy frees all resources associated with the container. -// Destroy returns error if any step fails, and the function can be safely retried. +// Destroy frees all resources associated with the container. It fails fast and +// is idempotent. 
func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 24beb2b75..996d80a89 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -252,35 +252,6 @@ func configs(opts ...configOption) []*boot.Config { return cs } -// In normal runsc usage, sandbox processes will be parented to -// init and init will reap the them. However, in the test environment -// the test runner is the parent and will not reap the sandbox -// processes, so we must do it ourselves, or else they will left -// as zombies. -// The function returns a wait group, and the caller can reap -// children synchronously by waiting on the wait group. -func reapChildren(c *Container) (*sync.WaitGroup, error) { - var wg sync.WaitGroup - p, err := os.FindProcess(c.Sandbox.Pid) - if err != nil { - return nil, fmt.Errorf("error finding sandbox process: %v", err) - } - g, err := os.FindProcess(c.GoferPid) - if err != nil { - return nil, fmt.Errorf("error finding gofer process: %v", err) - } - wg.Add(2) - go func() { - p.Wait() - wg.Done() - }() - go func() { - g.Wait() - wg.Done() - }() - return &wg, nil -} - // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. @@ -310,12 +281,14 @@ func TestLifecycle(t *testing.T) { } // Create the container. id := testutil.UniqueContainerID() - if _, err := Create(id, spec, conf, bundleDir, "", ""); err != nil { + c, err := Create(id, spec, conf, bundleDir, "", "") + if err != nil { t.Fatalf("error creating container: %v", err) } + defer c.Destroy() // Load the container from disk and check the status. - c, err := Load(rootDir, id) + c, err = Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -378,17 +351,6 @@ func TestLifecycle(t *testing.T) { // Wait for it to die. wg.Wait() - // The sandbox process should have exited by now, but it is a - // zombie. In normal runsc usage, it will be parented to init, - // and init will reap the sandbox. However, in this case the - // test runner is the parent and will not reap the sandbox - // process, so we must do it ourselves. - reapWg, err := reapChildren(c) - if err != nil { - t.Fatalf("error reaping children: %v", err) - } - reapWg.Wait() - // Load the container from disk and check the status. c, err = Load(rootDir, id) if err != nil { @@ -1164,6 +1126,7 @@ func TestConsoleSocket(t *testing.T) { if err != nil { t.Fatalf("error creating container: %v", err) } + c.Destroy() // Open the othe end of the socket. sock, err := srv.Accept() @@ -1196,11 +1159,6 @@ func TestConsoleSocket(t *testing.T) { t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) } - // Reap the sandbox process. - if _, err := reapChildren(c); err != nil { - t.Fatalf("error reaping children: %v", err) - } - // Shut it down. if err := c.Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) @@ -1566,29 +1524,21 @@ func TestGoferExits(t *testing.T) { t.Fatalf("error starting container: %v", err) } + // Kill sandbox and expect gofer to exit on its own. sandboxProc, err := os.FindProcess(c.Sandbox.Pid) if err != nil { t.Fatalf("error finding sandbox process: %v", err) } - gofer, err := os.FindProcess(c.GoferPid) - if err != nil { - t.Fatalf("error finding sandbox process: %v", err) - } - - // Kill sandbox and expect gofer to exit on its own. 
if err := sandboxProc.Kill(); err != nil { t.Fatalf("error killing sandbox process: %v", err) } - if _, err := sandboxProc.Wait(); err != nil { - t.Fatalf("error waiting for sandbox process: %v", err) - } - - if _, err := gofer.Wait(); err != nil { - t.Fatalf("error waiting for gofer process: %v", err) - } - if err := c.waitForStopped(); err != nil { - t.Errorf("container is not stopped: %v", err) + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(c.GoferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) } } @@ -1606,5 +1556,8 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, } func TestMain(m *testing.M) { - testutil.RunAsRoot(m) + testutil.RunAsRoot() + stop := testutil.StartReaper() + defer stop() + os.Exit(m.Run()) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 09888cb86..349ea755a 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -15,7 +15,6 @@ package container import ( - "fmt" "io/ioutil" "os" "path/filepath" @@ -379,6 +378,22 @@ func TestMultiContainerSignal(t *testing.T) { t.Errorf("failed to wait for sleep to start: %v", err) } + // Destroy container and ensure container's gofer process has exited. + if err := containers[1].Destroy(); err != nil { + t.Errorf("failed to destroy container: %v", err) + } + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(containers[1].GoferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } + // Make sure process 1 is still running. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + // Now that process 2 is gone, ensure we get an error trying to // signal it again. if err := containers[1].Signal(syscall.SIGKILL); err == nil { @@ -390,36 +405,26 @@ func TestMultiContainerSignal(t *testing.T) { t.Errorf("failed to kill process 1: %v", err) } - if err := waitForSandboxExit(containers[0]); err != nil { - t.Errorf("failed to exit sandbox: %v", err) + // Ensure that container's gofer and sandbox process are no more. + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(containers[0].GoferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } + + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(containers[0].Sandbox.Pid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for sandbox to exit: %v", err) } - // The sentry should be gone, so signaling should yield an - // error. + // The sentry should be gone, so signaling should yield an error. if err := containers[0].Signal(syscall.SIGKILL); err == nil { t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) } } } - -// waitForSandboxExit waits until both the sandbox and gofer processes of the -// container have exited. 
-func waitForSandboxExit(container *Container) error { - goferProc, _ := os.FindProcess(container.GoferPid) - state, err := goferProc.Wait() - if err != nil { - return err - } - if !state.Exited() { - return fmt.Errorf("gofer with PID %d failed to exit", container.GoferPid) - } - sandboxProc, _ := os.FindProcess(container.Sandbox.Pid) - state, err = sandboxProc.Wait() - if err != nil { - return err - } - if !state.Exited() { - return fmt.Errorf("sandbox with PID %d failed to exit", container.Sandbox.Pid) - } - return nil -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 21625a7c6..f58d111bf 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -59,7 +59,8 @@ type Sandbox struct { Chroot string `json:"chroot"` } -// Create creates the sandbox process. +// Create creates the sandbox process. The caller must call Destroy() on the +// sandbox. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4d354de31..2e7f95912 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -24,10 +24,10 @@ import ( "net/http" "os" "os/exec" + "os/signal" "path/filepath" "runtime" "syscall" - "testing" "time" "github.com/cenkalti/backoff" @@ -236,10 +236,11 @@ func WaitForHTTP(port int, timeout time.Duration) error { // RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create // a new user namespace and reexecute the test as root inside of the namespace. -func RunAsRoot(m *testing.M) { +// This functionr returns when it's running as root. If it needs to create +// another process, it will exit from there and not return. +func RunAsRoot() { if specutils.HasCapSysAdmin() { - // Capability: check! Good to run. - os.Exit(m.Run()) + return } // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run @@ -278,3 +279,39 @@ func RunAsRoot(m *testing.M) { } os.Exit(0) } + +// StartReaper starts a gorouting that will reap all children processes created +// by the tests. Caller must call the returned function to stop it. +func StartReaper() func() { + ch := make(chan os.Signal, 1) + signal.Notify(ch, syscall.SIGCHLD) + stop := make(chan struct{}) + + go func() { + for { + select { + case <-ch: + case <-stop: + return + } + for { + cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil) + if cpid < 1 { + break + } + } + } + }() + return func() { stop <- struct{}{} } +} + +// RetryEintr retries the function until an error different than EINTR is +// returned. +func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { + for { + r1, r2, err := f() + if err != syscall.EINTR { + return r1, r2, err + } + } +} -- cgit v1.2.3 From 8aec7473a1cc106d1de2e6c072b84eecc1f239b5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 18 Sep 2018 19:11:49 -0700 Subject: Added state machine checks for Container.Status For my own sanitity when thinking about possible transitions and state. 
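The changeStatus helper added in the diff below enforces these rules with a switch that panics on violations. Restated as data — with an illustrative Status type standing in for the container package's own — the allowed transitions are:

```go
package main

import "fmt"

// Status is an illustrative stand-in for the container status type.
type Status int

const (
	Creating Status = iota
	Created
	Running
	Paused
	Stopped
)

// validTransitions restates the rules enforced by changeStatus in the diff
// below: Created only follows Creating, Paused only follows Running, and
// Stopped may follow Created, Running, or (idempotently) Stopped.
var validTransitions = map[Status][]Status{
	Creating: {Created},
	Created:  {Running, Stopped},
	Running:  {Paused, Stopped},
	Paused:   {Running},
	Stopped:  {Stopped},
}

func isValidTransition(from, to Status) bool {
	for _, next := range validTransitions[from] {
		if next == to {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(isValidTransition(Running, Paused)) // true
	fmt.Println(isValidTransition(Paused, Stopped)) // false under the diff's rules
}
```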
PiperOrigin-RevId: 213559482 Change-Id: I25588c86cf6098be4eda01f4e7321c102ceef33c --- runsc/container/container.go | 64 +++++++++++++++++++++++++++------ runsc/container/multi_container_test.go | 5 ++- 2 files changed, 58 insertions(+), 11 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 9bf2f4625..3be88066c 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -92,7 +92,7 @@ type Container struct { Status Status `json:"status"` // GoferPid is the pid of the gofer running along side the sandbox. May - // be 0 if the gofer has been killed or it's not being used. + // be 0 if the gofer has been killed. GoferPid int `json:"goferPid"` // Sandbox is the sandbox this container is running in. It will be nil @@ -138,14 +138,13 @@ func Load(rootDir, id string) (*Container, error) { // Check if the sandbox process is still running. if !c.Sandbox.IsRunning() { // Sandbox no longer exists, so this container definitely does not exist. - c.Status = Stopped - c.Sandbox = nil + c.changeStatus(Stopped) } else if c.Status == Running { // Container state should reflect the actual state of // the application, so we don't consider gofer process // here. if err := c.Signal(syscall.Signal(0)); err != nil { - c.Status = Stopped + c.changeStatus(Stopped) } } } @@ -265,7 +264,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } c.Sandbox = sb.Sandbox } - c.Status = Created + c.changeStatus(Created) // Save the metadata file. if err := c.save(); err != nil { @@ -322,7 +321,7 @@ func (c *Container) Start(conf *boot.Config) error { executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) } - c.Status = Running + c.changeStatus(Running) return c.save() } @@ -338,7 +337,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil { return err } - c.Status = Running + c.changeStatus(Running) return c.save() } @@ -447,7 +446,7 @@ func (c *Container) Pause() error { if err := c.Sandbox.Pause(c.ID); err != nil { return fmt.Errorf("error pausing container: %v", err) } - c.Status = Paused + c.changeStatus(Paused) return c.save() default: return fmt.Errorf("container %q not created or running, not pausing", c.ID) @@ -463,7 +462,7 @@ func (c *Container) Resume() error { if err := c.Sandbox.Resume(c.ID); err != nil { return fmt.Errorf("error resuming container: %v", err) } - c.Status = Running + c.changeStatus(Running) return c.save() default: return fmt.Errorf("container %q not paused, not resuming", c.ID) @@ -519,7 +518,7 @@ func (c *Container) Destroy() error { executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } - c.Status = Stopped + c.changeStatus(Stopped) return nil } @@ -583,6 +582,7 @@ func (c *Container) waitForStopped() error { if err := syscall.Kill(c.GoferPid, 0); err == nil { return fmt.Errorf("gofer is still running") } + c.GoferPid = 0 } return nil } @@ -652,3 +652,47 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund c.GoferPid = cmd.Process.Pid return sandEnds, nil } + +// changeStatus transitions from one status to another ensuring that the +// transition is valid. +func (c *Container) changeStatus(s Status) { + switch s { + case Creating: + // Initial state, never transitions to it. 
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + + case Created: + if c.Status != Creating { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + if c.Sandbox == nil { + panic("sandbox cannot be nil") + } + + case Paused: + if c.Status != Running { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + if c.Sandbox == nil { + panic("sandbox cannot be nil") + } + + case Running: + if c.Status != Created && c.Status != Paused { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + if c.Sandbox == nil { + panic("sandbox cannot be nil") + } + + case Stopped: + if c.Status != Created && c.Status != Running && c.Status != Stopped { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + c.Sandbox = nil + + default: + panic(fmt.Sprintf("invalid new state: %v", s)) + } + c.Status = s +} diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 349ea755a..d6418efb6 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -378,12 +378,15 @@ func TestMultiContainerSignal(t *testing.T) { t.Errorf("failed to wait for sleep to start: %v", err) } + // goferPid is reset when container is destroyed. + goferPid := containers[1].GoferPid + // Destroy container and ensure container's gofer process has exited. if err := containers[1].Destroy(); err != nil { t.Errorf("failed to destroy container: %v", err) } _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { - cpid, err := syscall.Wait4(containers[1].GoferPid, nil, 0, nil) + cpid, err := syscall.Wait4(goferPid, nil, 0, nil) return uintptr(cpid), 0, err }) if err != nil && err != syscall.ECHILD { -- cgit v1.2.3 From f0a92b6b67382a1f8da5ef2622c59afdb1c40f13 Mon Sep 17 00:00:00 2001 From: Lingfu Date: Wed, 19 Sep 2018 13:34:28 -0700 Subject: Add docker command line args support for --cpuset-cpus and --cpus `docker run --cpuset-cpus=/--cpus=` will generate cpu resource info in config.json (runtime spec file). When nginx worker_connections is configured as auto, the worker is generated according to the number of CPUs. If the cgroup is already set on the host, but it is not displayed correctly in the sandbox, performance may be degraded. This patch can get cpus info from spec file and apply to sentry on bootup, so the /proc/cpuinfo can show the correct cpu numbers. `lscpu` and other commands rely on `/sys/devices/system/cpu/online` are also affected by this patch. e.g. --cpuset-cpus=2,3 -> cpu number:2 --cpuset-cpus=4-7 -> cpu number:4 --cpus=2.8 -> cpu number:3 --cpus=0.5 -> cpu number:1 Change-Id: Ideb22e125758d4322a12be7c51795f8018e3d316 PiperOrigin-RevId: 213685199 --- runsc/boot/loader.go | 18 ++++++---- runsc/specutils/BUILD | 1 + runsc/specutils/cpu.go | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 7 deletions(-) create mode 100644 runsc/specutils/cpu.go (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 623d04171..f906c9f95 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,7 +20,6 @@ import ( "math/rand" "os" "os/signal" - "runtime" "sync" "sync/atomic" "syscall" @@ -201,15 +200,20 @@ func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int caps, auth.NewRootUserNamespace()) + // Get CPU numbers from spec. 
+ cpuNum, err := specutils.CalculateCPUNumber(spec) + if err != nil { + return nil, fmt.Errorf("cannot get cpus from spec: %v", err) + } + // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. if err = k.Init(kernel.InitKernelArgs{ - FeatureSet: cpuid.HostFeatureSet(), - Timekeeper: tk, - RootUserNamespace: creds.UserNamespace, - NetworkStack: networkStack, - // TODO: use number of logical processors from cgroups. - ApplicationCores: uint(runtime.NumCPU()), + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + NetworkStack: networkStack, + ApplicationCores: uint(cpuNum), Vdso: vdso, RootUTSNamespace: kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace), RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index e73b2293f..f1a99ce48 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", srcs = [ + "cpu.go", "namespace.go", "specutils.go", ], diff --git a/runsc/specutils/cpu.go b/runsc/specutils/cpu.go new file mode 100644 index 000000000..9abe26b64 --- /dev/null +++ b/runsc/specutils/cpu.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "runtime" + "strconv" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +// CalculateCPUNumber calculates the number of CPUs that should be exposed +// inside the sandbox. +func CalculateCPUNumber(spec *specs.Spec) (int, error) { + // If spec does not contain CPU field, then return the number of host CPUs. + if spec == nil || spec.Linux == nil || spec.Linux.Resources == nil || spec.Linux.Resources.CPU == nil { + return runtime.NumCPU(), nil + } + cpuSpec := spec.Linux.Resources.CPU + + // If cpuSpec.Cpus is specified, then parse and return that. They must be in + // the list format for cpusets, which is "a comma-separated list of CPU + // numbers and ranges of numbers, in ASCII decimal." --man 7 cpuset. + cpus := cpuSpec.Cpus + if cpus != "" { + cpuNum := 0 + for _, subs := range strings.Split(cpus, ",") { + result, err := parseCPUNumber(subs) + if err != nil { + return 0, err + } + cpuNum += result + } + return cpuNum, nil + } + + // If CPU.Quota and CPU.Period are specified, we can divide them to get an + // approximation of the number of CPUs needed. + if cpuSpec.Quota != nil && cpuSpec.Period != nil && *cpuSpec.Period != 0 { + cpuQuota := *cpuSpec.Quota + cpuPeriod := *cpuSpec.Period + return int(cpuQuota)/int(cpuPeriod) + 1, nil + } + + // Default to number of host cpus. + return runtime.NumCPU(), nil +} + +// parseCPUNumber converts a cpuset string into the number of cpus included in +// the string , e.g. "3-6" -> 4. 
+func parseCPUNumber(cpus string) (int, error) { + switch cpusSlice := strings.Split(cpus, "-"); len(cpusSlice) { + case 1: + // cpus is not a range. We must only check that it is a valid number. + if _, err := strconv.Atoi(cpus); err != nil { + return 0, fmt.Errorf("invalid individual cpu number %q", cpus) + } + return 1, nil + case 2: + // cpus is a range. We must check that start and end are valid numbers, + // and calculate their difference (inclusively). + first, err := strconv.Atoi(cpusSlice[0]) + if err != nil || first < 0 { + return 0, fmt.Errorf("invalid first cpu number %q in range %q", cpusSlice[0], cpus) + } + last, err := strconv.Atoi(cpusSlice[1]) + if err != nil || last < 0 { + return 0, fmt.Errorf("invalid last cpu number %q in range %q", cpusSlice[1], cpus) + } + cpuNum := last - first + 1 + if cpuNum <= 0 { + return 0, fmt.Errorf("cpu range %q does not include positive number of cpus", cpus) + } + } + return 0, fmt.Errorf("invalid cpu string %q", cpus) +} -- cgit v1.2.3 From 2ad3228cd0f226804cfc7ae3ae7fff561caa2eda Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 19 Sep 2018 16:09:50 -0700 Subject: runsc: Don't create __runsc_containers__ unless we are in multi-container mode. PiperOrigin-RevId: 213715511 Change-Id: I3e41b583c6138edbdeba036dfb9df4864134fc12 --- runsc/boot/fs.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 5ec9a7d03..420e57022 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -84,12 +84,14 @@ func (f *fdDispenser) empty() bool { // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { mounts := compileMounts(spec) - // Create a tmpfs mount where we create and mount a root filesystem for - // each child container. - mounts = append(mounts, specs.Mount{ - Type: tmpfs, - Destination: childContainersDir, - }) + if conf.MultiContainer { + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. + mounts = append(mounts, specs.Mount{ + Type: tmpfs, + Destination: childContainersDir, + }) + } fds := &fdDispenser{fds: ioFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { -- cgit v1.2.3 From e3952733011df912ecaa48974832a054a45c345a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 19 Sep 2018 17:14:20 -0700 Subject: Fix sandbox and gofer capabilities Capabilities.Set() adds capabilities, but doesn't remove existing ones that might have been loaded. Fixed the code and added tests. 
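The fix in the diff below builds the desired capability set on a fresh, never-loaded Capabilities object and applies it, so capabilities the process already holds but that are not requested get dropped rather than silently kept. A minimal sketch of that idea using the same gocapability calls that appear in the diff — the applyExactly helper and the single CAP_CHOWN example are illustrative, not the runsc code:

```go
package main

import "github.com/syndtr/gocapability/capability"

// applyExactly replaces the process capabilities with exactly the given
// set. The new object starts out empty because Load is deliberately not
// called on it; calling Set on a loaded object would only add to whatever
// the process already has.
func applyExactly(caps ...capability.Cap) error {
	newCaps, err := capability.NewPid2(0) // pid 0 means the current process
	if err != nil {
		return err
	}
	for _, which := range []capability.CapType{
		capability.BOUNDS,
		capability.EFFECTIVE,
		capability.PERMITTED,
		capability.INHERITABLE,
		capability.AMBIENT,
	} {
		newCaps.Set(which, caps...)
	}
	return newCaps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS)
}

func main() {
	// Keep only CAP_CHOWN: anything else the process held is dropped.
	if err := applyExactly(capability.CAP_CHOWN); err != nil {
		panic(err)
	}
}
```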
PiperOrigin-RevId: 213726369 Change-Id: Id7fa6fce53abf26c29b13b9157bb4c6616986fba --- runsc/boot/fs.go | 13 +---- runsc/cmd/BUILD | 9 ++++ runsc/cmd/capability.go | 63 ++++++++++++---------- runsc/cmd/capability_test.go | 121 +++++++++++++++++++++++++++++++++++++++++++ runsc/cmd/gofer.go | 33 ++++++------ runsc/specutils/specutils.go | 10 ++++ 6 files changed, 197 insertions(+), 52 deletions(-) create mode 100644 runsc/cmd/capability_test.go (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 420e57022..59ae5faae 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -428,13 +428,13 @@ func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, erro kv := strings.Split(o, "=") switch len(kv) { case 1: - if contains(allowedKeys, o) { + if specutils.ContainsStr(allowedKeys, o) { out = append(out, o) continue } log.Warningf("ignoring unsupported key %q", kv) case 2: - if contains(allowedKeys, kv[0]) { + if specutils.ContainsStr(allowedKeys, kv[0]) { out = append(out, o) continue } @@ -540,15 +540,6 @@ func mountFlags(opts []string) fs.MountSourceFlags { return mf } -func contains(strs []string, str string) bool { - for _, s := range strs { - if s == str { - return true - } - } - return false -} - func mustFindFilesystem(name string) fs.Filesystem { fs, ok := fs.FindFilesystem(name) if !ok { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index f9c091ba2..7c90ff2c5 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -55,18 +55,27 @@ go_test( name = "cmd_test", size = "small", srcs = [ + "capability_test.go", "delete_test.go", "exec_test.go", ], + data = [ + "//runsc", + ], embed = [":cmd"], deps = [ "//pkg/abi/linux", + "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", "//pkg/urpc", "//runsc/boot", + "//runsc/container", + "//runsc/specutils", + "//runsc/test/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go index e2410d4ad..affbb7ce3 100644 --- a/runsc/cmd/capability.go +++ b/runsc/cmd/capability.go @@ -16,56 +16,67 @@ package cmd import ( "fmt" - "os" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/pkg/log" ) +var allCapTypes = []capability.CapType{ + capability.BOUNDS, + capability.EFFECTIVE, + capability.PERMITTED, + capability.INHERITABLE, + capability.AMBIENT, +} + // applyCaps applies the capabilities in the spec to the current thread. // // Note that it must be called with current thread locked. func applyCaps(caps *specs.LinuxCapabilities) error { - setter, err := capability.NewPid2(os.Getpid()) + // Load current capabilities to trim the ones not permitted. + curCaps, err := capability.NewPid2(0) if err != nil { return err } - if err := setter.Load(); err != nil { + if err := curCaps.Load(); err != nil { return err } - bounding, err := trimCaps(caps.Bounding, setter) + // Create an empty capability set to populate. + newCaps, err := capability.NewPid2(0) if err != nil { return err } - setter.Set(capability.BOUNDS, bounding...) - effective, err := trimCaps(caps.Effective, setter) - if err != nil { - return err - } - setter.Set(capability.EFFECTIVE, effective...) 
- - permitted, err := trimCaps(caps.Permitted, setter) - if err != nil { - return err + for _, c := range allCapTypes { + if !newCaps.Empty(c) { + panic("unloaded capabilities must be empty") + } + set, err := trimCaps(getCaps(c, caps), curCaps) + if err != nil { + return err + } + newCaps.Set(c, set...) } - setter.Set(capability.PERMITTED, permitted...) - inheritable, err := trimCaps(caps.Inheritable, setter) - if err != nil { - return err - } - setter.Set(capability.INHERITABLE, inheritable...) + return newCaps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS) +} - ambient, err := trimCaps(caps.Ambient, setter) - if err != nil { - return err +func getCaps(which capability.CapType, caps *specs.LinuxCapabilities) []string { + switch which { + case capability.BOUNDS: + return caps.Bounding + case capability.EFFECTIVE: + return caps.Effective + case capability.PERMITTED: + return caps.Permitted + case capability.INHERITABLE: + return caps.Inheritable + case capability.AMBIENT: + return caps.Ambient } - setter.Set(capability.AMBIENT, ambient...) - - return setter.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS) + panic(fmt.Sprint("invalid capability type:", which)) } func trimCaps(names []string, setter capability.Capabilities) ([]capability.Cap, error) { diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go new file mode 100644 index 000000000..be9ef2e7b --- /dev/null +++ b/runsc/cmd/capability_test.go @@ -0,0 +1,121 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "fmt" + "os" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" + "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +func init() { + log.SetLevel(log.Debug) + if err := testutil.ConfigureExePath(); err != nil { + panic(err.Error()) + } +} + +func checkProcessCaps(pid int, wantCaps *specs.LinuxCapabilities) error { + curCaps, err := capability.NewPid2(pid) + if err != nil { + return fmt.Errorf("capability.NewPid2(%d) failed: %v", pid, err) + } + if err := curCaps.Load(); err != nil { + return fmt.Errorf("unable to load capabilities: %v", err) + } + fmt.Printf("Capabilities (PID: %d): %v\n", pid, curCaps) + + for _, c := range allCapTypes { + if err := checkCaps(c, curCaps, wantCaps); err != nil { + return err + } + } + return nil +} + +func checkCaps(which capability.CapType, curCaps capability.Capabilities, wantCaps *specs.LinuxCapabilities) error { + wantNames := getCaps(which, wantCaps) + for name, c := range capFromName { + want := specutils.ContainsStr(wantNames, name) + got := curCaps.Get(which, c) + if want != got { + if want { + return fmt.Errorf("capability %v:%s should be set", which, name) + } + return fmt.Errorf("capability %v:%s should NOT be set", which, name) + } + } + return nil +} + +func TestCapabilities(t *testing.T) { + stop := testutil.StartReaper() + defer stop() + + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + caps := []string{ + "CAP_CHOWN", + "CAP_SYS_PTRACE", // ptrace is added due to the platform choice. + } + spec.Process.Capabilities = &specs.LinuxCapabilities{ + Permitted: caps, + Bounding: caps, + Effective: caps, + Inheritable: caps, + } + + conf := testutil.TestConfig() + + // Use --network=host to make sandbox use spec's capabilities. + conf.Network = boot.NetworkHost + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Check that sandbox and gofer have the proper capabilities. + if err := checkProcessCaps(c.Sandbox.Pid, spec.Process.Capabilities); err != nil { + t.Error(err) + } + if err := checkProcessCaps(c.GoferPid, goferCaps); err != nil { + t.Error(err) + } +} + +func TestMain(m *testing.M) { + testutil.RunAsRoot() + os.Exit(m.Run()) +} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 95926f5f9..fd4eee546 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -31,6 +31,23 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) +var caps = []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_DAC_READ_SEARCH", + "CAP_FOWNER", + "CAP_FSETID", + "CAP_SYS_CHROOT", +} + +// goferCaps is the minimal set of capabilities needed by the Gofer to operate +// on files. +var goferCaps = &specs.LinuxCapabilities{ + Bounding: caps, + Effective: caps, + Permitted: caps, +} + // Gofer implements subcommands.Command for the "gofer" command, which starts a // filesystem gofer. 
This command should not be called directly. type Gofer struct { @@ -72,25 +89,11 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } if g.applyCaps { - // Minimal set of capabilities needed by the Gofer to operate on files. - caps := []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_DAC_READ_SEARCH", - "CAP_FOWNER", - "CAP_FSETID", - } - lc := &specs.LinuxCapabilities{ - Bounding: caps, - Effective: caps, - Permitted: caps, - } - - // Disable caps when calling myself again. // Note: minimal argument handling for the default case to keep it simple. args := os.Args args = append(args, "--apply-caps=false") - if err := setCapsAndCallSelf(args, lc); err != nil { + if err := setCapsAndCallSelf(args, goferCaps); err != nil { Fatalf("Unable to apply caps: %v", err) } panic("unreachable") diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index fdc9007e0..daf10b875 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -392,3 +392,13 @@ func Mount(src, dst, typ string, flags uint32) error { } return nil } + +// ContainsStr returns true if 'str' is inside 'strs'. +func ContainsStr(strs []string, str string) bool { + for _, s := range strs { + if s == str { + return true + } + } + return false +} -- cgit v1.2.3 From 639226c3d980634fe0ffc335a90ed16edf84f457 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 19 Sep 2018 18:02:30 -0700 Subject: runsc: Mark container_test flaky. PiperOrigin-RevId: 213732520 Change-Id: Ife292987ec8b1de4c2e7e3b7d4452b00c1582e91 --- runsc/container/BUILD | 2 ++ 1 file changed, 2 insertions(+) (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index efdf43175..d289e43be 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -46,6 +46,8 @@ go_test( ], embed = [":container"], tags = [ + # FIXME + "flaky", "requires-kvm", ], deps = [ -- cgit v1.2.3 From 915d76aa924c08b1fcb80a58e3caa24529a23d04 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 19 Sep 2018 18:52:53 -0700 Subject: Add container.Destroy urpc method. This method will: 1. Stop the container process if it is still running. 2. Unmount all sandbox-internal mounts for the container. 3. Delete the container root directory inside the sandbox. Destroy is idempotent, and safe to call concurrently. This fixes a bug where after stopping a container, we cannot unmount the container root directory on the host. This bug occurred because the sandbox dirent cache was holding a dirent with a host fd corresponding to a file inside the container root on the host. The dirent cache did not know that the container had exited, and kept the FD open, preventing us from unmounting on the host. Now that we unmount (and flush) all container mounts inside the sandbox, any host FDs donated by the gofer will be closed, and we can unmount the container root on the host.
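For illustration, a minimal sketch of the caller side of the new Destroy endpoint, modeled on the Sandbox.DestroyContainer helper added later in this change. The `caller` interface is only a stand-in for the urpc client that sandboxConnect returns, and the destroyContainer helper name is illustrative, not part of the patch.

package example

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/runsc/boot"
)

// caller is the subset of the sandbox's urpc client used here; the real
// client used by Sandbox.DestroyContainer (see sandbox.go below) provides
// the same Call method.
type caller interface {
	Call(method string, arg interface{}, result interface{}) error
}

// destroyContainer asks the sandbox to stop the container, unmount its
// sandbox-internal mounts, and delete its root directory, the three steps
// described above. The endpoint is idempotent, so repeated calls are safe.
func destroyContainer(conn caller, cid string) error {
	if err := conn.Call(boot.ContainerDestroy, &cid, nil); err != nil {
		return fmt.Errorf("error destroying container %q: %v", cid, err)
	}
	return nil
}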
PiperOrigin-RevId: 213737693 Change-Id: I28c0ff4cd19a08014cdd72fec5154497e92aacc9 --- runsc/boot/controller.go | 96 +++++++++++++++++++++++++++++++-- runsc/boot/fs.go | 8 +-- runsc/container/container.go | 13 ++--- runsc/container/multi_container_test.go | 70 ++++++++++++++++++++++++ runsc/sandbox/sandbox.go | 20 +++++++ 5 files changed, 190 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index f5b0d371c..b4594c8b0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -21,8 +21,10 @@ import ( "path" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -30,6 +32,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/state" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/urpc" ) @@ -37,6 +40,10 @@ const ( // ContainerCheckpoint checkpoints a container. ContainerCheckpoint = "containerManager.Checkpoint" + // ContainerDestroy is used to stop a non-root container and free all + // associated resources in the sandbox. + ContainerDestroy = "containerManager.Destroy" + // ContainerEvent is the URPC endpoint for getting stats about the // container used by "runsc events". ContainerEvent = "containerManager.Event" @@ -58,9 +65,6 @@ const ( // ContainerResume unpauses the paused container. ContainerResume = "containerManager.Resume" - // ContainerWaitForLoader blocks until the container's loader has been created. - ContainerWaitForLoader = "containerManager.WaitForLoader" - // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" @@ -72,6 +76,9 @@ const ( // and return its ExitStatus. ContainerWait = "containerManager.Wait" + // ContainerWaitForLoader blocks until the container's loader has been created. + ContainerWaitForLoader = "containerManager.WaitForLoader" + // ContainerWaitPID is used to wait on a process with a certain PID in // the sandbox and return its ExitStatus. ContainerWaitPID = "containerManager.WaitPID" @@ -228,6 +235,89 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return nil } +// Destroy stops a container if it is still running and cleans up its +// filesystem. +func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { + log.Debugf("containerManager.destroy %q", *cid) + cm.l.mu.Lock() + defer cm.l.mu.Unlock() + + if tg, ok := cm.l.containerRootTGs[*cid]; ok { + // Send SIGKILL to threadgroup. + if err := tg.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + Code: arch.SignalInfoUser, + }); err == nil { + // SIGKILL sent. Now wait for it to exit. + log.Debugf("Waiting for container process to exit.") + tg.WaitExited() + log.Debugf("Container process exited.") + } else if err != syserror.ESRCH { + return fmt.Errorf("error sending SIGKILL to container %q: %v", *cid, err) + } + + // Remove the container thread group from the map. + delete(cm.l.containerRootTGs, *cid) + } + + // Clean up the filesystem by unmounting all mounts for this container + // and deleting the container root directory. 
+ + // First get a reference to the container root directory. + mns := cm.l.k.RootMountNamespace() + mnsRoot := mns.Root() + defer mnsRoot.DecRef() + ctx := cm.l.rootProcArgs.NewContext(cm.l.k) + containerRoot := path.Join(ChildContainersDir, *cid) + containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) + if err == syserror.ENOENT { + // Container must have been destroyed already. That's fine. + return nil + } + if err != nil { + return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) + } + defer containerRootDirent.DecRef() + + // Iterate through all submounts and unmount them. We unmount lazily by + // setting detach=true, so we can unmount in any order. + for _, m := range containerRootDirent.Inode.MountSource.Submounts() { + root := m.Root() + defer root.DecRef() + + // Do a best-effort unmount by flushing the refs and unmount + // with "detach only = true". + log.Debugf("Unmounting container submount %q", root.BaseName()) + m.FlushDirentRefs() + if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) + } + } + + // Unmount the container root itself. + log.Debugf("Unmounting container root %q", containerRoot) + containerRootDirent.Inode.MountSource.FlushDirentRefs() + if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) + } + + // Get a reference to the parent directory and remove the root + // container directory. + containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) + } + defer containersDirDirent.DecRef() + log.Debugf("Deleting container root %q", containerRoot) + if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, *cid); err != nil { + return fmt.Errorf("error removing directory %q: %v", containerRoot, err) + } + + // We made it! + log.Debugf("Destroyed container %q", *cid) + return nil +} + // ExecArgs contains arguments to Execute. type ExecArgs struct { control.ExecArgs diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 59ae5faae..110f67de8 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -49,9 +49,9 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" - // childContainersDir is the directory where child container root + // ChildContainersDir is the directory where child container root // filesystems are mounted. - childContainersDir = "/__runsc_containers__" + ChildContainersDir = "/__runsc_containers__" // Filesystems that runsc supports. bind = "bind" @@ -89,7 +89,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec // each child container. mounts = append(mounts, specs.Mount{ Type: tmpfs, - Destination: childContainersDir, + Destination: ChildContainersDir, }) } fds := &fdDispenser{fds: ioFDs} @@ -639,7 +639,7 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // Make directories for submounts within the container. 
rootDir := mns.Root() defer rootDir.DecRef() - containerRoot := filepath.Join(childContainersDir, cid) + containerRoot := filepath.Join(ChildContainersDir, cid) mkdirAll(ctx, mns, containerRoot) // Mount the container's root filesystem to the newly created diff --git a/runsc/container/container.go b/runsc/container/container.go index 3be88066c..a2582611a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -544,16 +544,9 @@ func (c *Container) save() error { // to stop. If any of them doesn't stop before timeout, an error is returned. func (c *Container) stop() error { if c.Sandbox != nil && c.Sandbox.IsRunning() { - log.Debugf("Killing container %q", c.ID) - if c.Sandbox.IsRootContainer(c.ID) { - if err := c.Sandbox.Destroy(); err != nil { - return fmt.Errorf("error destroying sandbox %q: %v", c.Sandbox.ID, err) - } - } else { - if err := c.Signal(syscall.SIGKILL); err != nil { - // The container may already be stopped, log the error. - log.Warningf("Error sending signal %d to container %q: %v", syscall.SIGKILL, c.ID, err) - } + log.Debugf("Destroying container %q", c.ID) + if err := c.Sandbox.DestroyContainer(c.ID); err != nil { + return fmt.Errorf("error destroying container %q: %v", c.ID, err) } } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index d6418efb6..0df587e30 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -17,6 +17,7 @@ package container import ( "io/ioutil" "os" + "path" "path/filepath" "strings" "sync" @@ -25,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -431,3 +433,71 @@ func TestMultiContainerSignal(t *testing.T) { } } } + +// TestMultiContainerDestroy checks that container are properly cleaned-up when +// they are destroyed. +func TestMultiContainerDestroy(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Two containers that will run for a long time. We will + // destroy the second one. + specs, ids := createSpecs([]string{"sleep", "100"}, []string{"sleep", "100"}) + + // Setup the containers. + var containers []*Container + for i, spec := range specs { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Exec in the root container to check for the existence of the + // second containers root filesystem directory. 
+ contDir := path.Join(boot.ChildContainersDir, containers[1].ID) + args := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-d", contDir}, + } + if ws, err := containers[0].executeSync(args); err != nil { + t.Fatalf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("exec 'test -f %q' got exit status %d, wanted 0", contDir, ws.ExitStatus()) + } + + // Destory the second container. + if err := containers[1].Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + + // Now the container dir should be gone. + if ws, err := containers[0].executeSync(args); err != nil { + t.Fatalf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("exec 'test -f %q' got exit status 0, wanted non-zero", contDir) + } + + // Check that cont.Destroy is safe to call multiple times. + if err := containers[1].Destroy(); err != nil { + t.Errorf("error destroying container: %v", err) + } + } +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f58d111bf..75739255d 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -666,6 +666,26 @@ func (s *Sandbox) Stacks() (string, error) { return stacks, nil } +// DestroyContainer destroys the given container. If it is the root container, +// then the entire sandbox is destroyed. +func (s *Sandbox) DestroyContainer(cid string) error { + if s.IsRootContainer(cid) { + log.Debugf("Destroying root container %q by destroying sandbox", cid) + return s.Destroy() + } + + log.Debugf("Destroying container %q in sandbox %q", cid, s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + if err := conn.Call(boot.ContainerDestroy, &cid, nil); err != nil { + return fmt.Errorf("error destroying container %q: %v", cid, err) + } + return nil +} + func (s *Sandbox) waitForStopped() error { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() -- cgit v1.2.3 From ffb5fdd69021713e88ec965e77487b7fc28bc104 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 19 Sep 2018 22:19:10 -0700 Subject: runsc: Fix stdin/stdout/stderr in multi-container mode. The issue with the previous change was that the stdin/stdout/stderr passed to the sentry were dup'd by host.ImportFile. This left a dangling FD that by never closing caused containerd to timeout waiting on container stop. PiperOrigin-RevId: 213753032 Change-Id: Ia5e4c0565c42c8610d3b59f65599a5643b0901e4 --- runsc/boot/controller.go | 12 +++++++----- runsc/boot/fds.go | 14 +++++++++----- runsc/boot/fs.go | 14 +++++++------- runsc/boot/loader.go | 27 +++++++++++++++++++++------ runsc/sandbox/sandbox.go | 9 ++++++--- 5 files changed, 50 insertions(+), 26 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index b4594c8b0..ddba117c6 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -193,8 +193,10 @@ type StartArgs struct { // CID is the ID of the container to start. CID string - // FilePayload contains the file descriptor over which the sandbox will - // request files from its root filesystem. + // FilePayload contains, in order: + // * stdin, stdout, and stderr. + // * the file descriptor over which the sandbox will + // request files from its root filesystem. 
urpc.FilePayload } @@ -222,8 +224,8 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) } - if len(args.FilePayload.Files) == 0 { - return fmt.Errorf("start arguments must contain at least one file for the container root") + if len(args.FilePayload.Files) < 4 { + return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") } err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) @@ -408,7 +410,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - fds := &fdDispenser{fds: cm.l.ioFDs} + fds := &fdDispenser{fds: cm.l.goferFDs} renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) if err != nil { return fmt.Errorf("error creating RestoreEnvironment: %v", err) diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 9de5a78b1..92d641b68 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "syscall" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -28,15 +27,20 @@ import ( // createFDMap creates an fd map that contains stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host fd. -func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) { +// Upon success, createFDMap dups then closes stdioFDs. +func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { + if len(stdioFDs) != 3 { + return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) + } + fdm := k.NewFDMap() defer fdm.DecRef() // Maps sandbox fd to host fd. fdMap := map[int]int{ - 0: syscall.Stdin, - 1: syscall.Stdout, - 2: syscall.Stderr, + 0: stdioFDs[0], + 1: stdioFDs[1], + 2: stdioFDs[2], } mounter := fs.FileOwnerFromContext(ctx) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 110f67de8..a97a4a3da 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -82,7 +82,7 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. -func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { +func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int) (*fs.MountNamespace, error) { mounts := compileMounts(spec) if conf.MultiContainer { // Create a tmpfs mount where we create and mount a root filesystem for @@ -92,7 +92,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec Destination: ChildContainersDir, }) } - fds := &fdDispenser{fds: ioFDs} + fds := &fdDispenser{fds: goferFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) @@ -587,14 +587,14 @@ func subtargets(root string, mnts []specs.Mount) []string { } // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. 
-// procArgs are passed by reference and the FDMap field is modified. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { +// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs. +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host // fd. - fdm, err := createFDMap(ctx, k, ls, console) + fdm, err := createFDMap(ctx, k, ls, console, stdioFDs) if err != nil { return fmt.Errorf("error importing fds: %v", err) } @@ -618,7 +618,7 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe mns := k.RootMountNamespace() if mns == nil { // Create the virtual filesystem. - mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) + mns, err := createMountNamespace(ctx, rootCtx, spec, conf, goferFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } @@ -630,7 +630,7 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // Create the container's root filesystem mount. log.Infof("Creating new process in child container.") - fds := &fdDispenser{fds: append([]int{}, ioFDs...)} + fds := &fdDispenser{fds: append([]int{}, goferFDs...)} rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) if err != nil { return fmt.Errorf("error creating filesystem for container: %v", err) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f906c9f95..e47eced18 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -78,8 +78,11 @@ type Loader struct { watchdog *watchdog.Watchdog - // ioFDs are the FDs that attach the sandbox to the gofers. - ioFDs []int + // stdioFDs contains stdin, stdout, and stderr. + stdioFDs []int + + // goferFDs are the FDs that attach the sandbox to the gofers. + goferFDs []int // spec is the base configuration for the root container. spec *specs.Spec @@ -139,7 +142,7 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. 
-func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int, console bool) (*Loader, error) { +func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { if err := usage.Init(); err != nil { return nil, fmt.Errorf("Error setting up memory usage: %v", err) } @@ -278,7 +281,8 @@ func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int conf: conf, console: console, watchdog: watchdog, - ioFDs: ioFDs, + stdioFDs: []int{syscall.Stdin, syscall.Stdout, syscall.Stderr}, + goferFDs: goferFDs, spec: spec, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, @@ -390,7 +394,8 @@ func (l *Loader) run() error { &l.rootProcArgs, l.spec, l.conf, - l.ioFDs, + l.stdioFDs, + l.goferFDs, l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, @@ -474,11 +479,14 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config ioFDs = append(ioFDs, fd) } + stdioFDs := ioFDs[:3] + goferFDs := ioFDs[3:] if err := setFileSystemForProcess( &procArgs, spec, conf, - ioFDs, + stdioFDs, + goferFDs, false, creds, procArgs.Limits, @@ -487,6 +495,13 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("failed to create new process: %v", err) } + // setFileSystemForProcess dup'd stdioFDs, so we can close them. + for i, fd := range stdioFDs { + if err := syscall.Close(fd); err != nil { + return fmt.Errorf("failed to close stdioFD #%d: %v", i, fd) + } + } + ctx := procArgs.NewContext(l.k) mns := k.RootMountNamespace() if err := setExecutablePath(ctx, mns, &procArgs); err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 75739255d..07a6bf388 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -101,8 +101,8 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { } // Start starts running a non-root container inside the sandbox. -func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles []*os.File) error { - for _, f := range ioFiles { +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error { + for _, f := range goferFiles { defer f.Close() } @@ -113,12 +113,15 @@ func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles } defer sandboxConn.Close() + // The payload must container stdin/stdout/stderr followed by gofer + // files. + files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...) // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: ioFiles}, + FilePayload: urpc.FilePayload{Files: files}, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) -- cgit v1.2.3 From 9464b82a067df93101e77bd51800364671d7f032 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 20 Sep 2018 11:21:59 -0700 Subject: runsc: Fix a bug that `runsc wait` doesn't work after container exits. 
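Together with the follow-up change below that allows waiting more than once, the intended contract is that Container.Wait keeps succeeding after the container has exited. A minimal sketch of that contract, assuming a started *container.Container as constructed in the tests in this series; the waitAfterExit helper name is illustrative only.

package example

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/runsc/container"
)

// waitAfterExit demonstrates the behavior targeted by this fix and the
// follow-up below: waiting on a container that has already exited should
// succeed, and repeated waits should keep returning the same status.
func waitAfterExit(c *container.Container) error {
	first, err := c.Wait()
	if err != nil {
		return fmt.Errorf("wait failed: %v", err)
	}
	second, err := c.Wait()
	if err != nil {
		return fmt.Errorf("wait on a stopped container should not fail: %v", err)
	}
	if first != second {
		return fmt.Errorf("exit status changed between waits: %v vs %v", first, second)
	}
	return nil
}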
PiperOrigin-RevId: 213849165 Change-Id: I5120b2f568850c0c42a08e8706e7f8653ef1bd94 --- runsc/container/container.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index a2582611a..32f2dd31a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -139,6 +139,7 @@ func Load(rootDir, id string) (*Container, error) { if !c.Sandbox.IsRunning() { // Sandbox no longer exists, so this container definitely does not exist. c.changeStatus(Stopped) + c.Sandbox = nil } else if c.Status == Running { // Container state should reflect the actual state of // the application, so we don't consider gofer process @@ -682,7 +683,6 @@ func (c *Container) changeStatus(s Status) { if c.Status != Created && c.Status != Running && c.Status != Stopped { panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) } - c.Sandbox = nil default: panic(fmt.Sprintf("invalid new state: %v", s)) -- cgit v1.2.3 From cbaec4d61454f7426d14b44bf25c67282251453c Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 20 Sep 2018 14:36:38 -0700 Subject: Wait for all async fs operations to complete before returning from Destroy. Destroy flushes dirent references, which triggers many async close operations. We must wait for those to finish before returning from Destroy, otherwise we may kill the gofer, causing a cascade of failing RPCs and leading to an inconsistent FS state. PiperOrigin-RevId: 213884637 Change-Id: Id054b47fc0f97adc5e596d747c08d3b97a1d1f71 --- runsc/boot/controller.go | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ddba117c6..7d7803e92 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -315,6 +315,13 @@ func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { return fmt.Errorf("error removing directory %q: %v", containerRoot, err) } + // Flushing dirent references triggers many async close operations. We + // must wait for those to complete before returning, otherwise the + // caller may kill the gofer before they complete, causing a cascade of + // failing RPCs. + log.Infof("Waiting for async filesystem operations to complete") + fs.AsyncBarrier() + // We made it! log.Debugf("Destroyed container %q", *cid) return nil -- cgit v1.2.3 From 8a938a3f9df631667c5f9e5d4a2185207e492a0d Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 20 Sep 2018 16:58:36 -0700 Subject: runsc: allow `runsc wait` on a container for multiple times. PiperOrigin-RevId: 213908919 Change-Id: I74eff99a5360bb03511b946f4cb5658bb5fc40c7 --- runsc/boot/loader.go | 7 ------- runsc/container/multi_container_test.go | 8 ++++---- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index e47eced18..5867eec96 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -573,15 +573,8 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // If the thread either has already exited or exits during waiting, // consider the container exited. - // TODO: Multiple calls to waitContainer() should return - // the same exit status. 
ws := l.wait(tg) *waitStatus = ws - - l.mu.Lock() - defer l.mu.Unlock() - delete(l.containerRootTGs, cid) - return nil } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 0df587e30..2867aa3b9 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -152,8 +152,8 @@ func TestMultiContainerWait(t *testing.T) { } else if es := ws.ExitStatus(); es != 0 { t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) } - if _, err := c.Wait(); err == nil { - t.Errorf("wait for stopped process %s should fail", c.Spec.Process.Args) + if _, err := c.Wait(); err != nil { + t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) } }(containers[1]) } @@ -239,8 +239,8 @@ func TestExecWait(t *testing.T) { } else if es := ws.ExitStatus(); es != 0 { t.Fatalf("process %s exited with non-zero status %d", containers[1].Spec.Process.Args, es) } - if _, err := containers[1].Wait(); err == nil { - t.Fatalf("wait for stopped process %s should fail", containers[1].Spec.Process.Args) + if _, err := containers[1].Wait(); err != nil { + t.Fatalf("wait for stopped container %s shouldn't fail: %v", containers[1].Spec.Process.Args, err) } // Execute another process in the first container. -- cgit v1.2.3 From b63c4bfe02d1b88eb12d75d0c7051a006d5cbe7d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 20 Sep 2018 18:53:02 -0700 Subject: Set Sandbox.Chroot so it gets cleaned up upon destruction I've made several attempts to create a test, but the lack of permission from the test user makes it nearly impossible to test anything useful. PiperOrigin-RevId: 213922174 Change-Id: I5b502ca70cb7a6645f8836f028fb203354b4c625 --- runsc/container/container.go | 75 +++++++++++++++++++++++++------------------- runsc/sandbox/chroot.go | 2 ++ runsc/sandbox/sandbox.go | 14 ++++++--- 3 files changed, 55 insertions(+), 36 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 32f2dd31a..31ab1385a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -95,8 +95,8 @@ type Container struct { // be 0 if the gofer has been killed. GoferPid int `json:"goferPid"` - // Sandbox is the sandbox this container is running in. It will be nil - // if the container is not in state Running or Created. + // Sandbox is the sandbox this container is running in. It's set when the + // container is created and reset when the sandbox is destroyed. Sandbox *sandbox.Sandbox `json:"sandbox"` } @@ -136,14 +136,12 @@ func Load(rootDir, id string) (*Container, error) { // This is inherently racey. if c.Status == Running || c.Status == Created { // Check if the sandbox process is still running. - if !c.Sandbox.IsRunning() { + if !c.isSandboxRunning() { // Sandbox no longer exists, so this container definitely does not exist. c.changeStatus(Stopped) - c.Sandbox = nil } else if c.Status == Running { - // Container state should reflect the actual state of - // the application, so we don't consider gofer process - // here. + // Container state should reflect the actual state of the application, so + // we don't consider gofer process here. if err := c.Signal(syscall.Signal(0)); err != nil { c.changeStatus(Stopped) } @@ -288,8 +286,8 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Start starts running the containerized process inside the sandbox. 
func (c *Container) Start(conf *boot.Config) error { log.Debugf("Start container %q", c.ID) - if c.Status != Created { - return fmt.Errorf("cannot start container in state %s", c.Status) + if err := c.requireStatus("start", Created); err != nil { + return err } // "If any prestart hook fails, the runtime MUST generate an error, @@ -330,11 +328,9 @@ func (c *Container) Start(conf *boot.Config) error { // to restore a container from its state file. func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error { log.Debugf("Restore container %q", c.ID) - - if c.Status != Created { - return fmt.Errorf("cannot restore container in state %s", c.Status) + if err := c.requireStatus("restore", Created); err != nil { + return err } - if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil { return err } @@ -361,8 +357,8 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke // the newly created process. func (c *Container) Execute(args *control.ExecArgs) (int32, error) { log.Debugf("Execute in container %q, args: %+v", c.ID, args) - if c.Status != Created && c.Status != Running { - return 0, fmt.Errorf("cannot exec in container in state %s", c.Status) + if err := c.requireStatus("execute in", Created, Running); err != nil { + return 0, err } return c.Sandbox.Execute(c.ID, args) } @@ -370,8 +366,8 @@ func (c *Container) Execute(args *control.ExecArgs) (int32, error) { // Event returns events for the container. func (c *Container) Event() (*boot.Event, error) { log.Debugf("Getting events for container %q", c.ID) - if c.Status != Running && c.Status != Created { - return nil, fmt.Errorf("cannot get events for container in state: %s", c.Status) + if err := c.requireStatus("get events for", Created, Running, Paused); err != nil { + return nil, err } return c.Sandbox.Event(c.ID) } @@ -379,7 +375,7 @@ func (c *Container) Event() (*boot.Event, error) { // Pid returns the Pid of the sandbox the container is running in, or -1 if the // container is not running. func (c *Container) Pid() int { - if c.Status != Running && c.Status != Created && c.Status != Paused { + if err := c.requireStatus("pid", Created, Running, Paused); err != nil { return -1 } return c.Sandbox.Pid @@ -390,8 +386,8 @@ func (c *Container) Pid() int { // and wait returns immediately. func (c *Container) Wait() (syscall.WaitStatus, error) { log.Debugf("Wait on container %q", c.ID) - if c.Sandbox == nil || !c.Sandbox.IsRunning() { - return 0, fmt.Errorf("container sandbox is not running") + if !c.isSandboxRunning() { + return 0, fmt.Errorf("container is not running") } return c.Sandbox.Wait(c.ID) } @@ -400,8 +396,8 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) - if c.Sandbox == nil || !c.Sandbox.IsRunning() { - return 0, fmt.Errorf("container sandbox is not running") + if !c.isSandboxRunning() { + return 0, fmt.Errorf("container is not running") } return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus) } @@ -410,8 +406,8 @@ func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus // its WaitStatus. 
func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in container %q", pid, c.ID) - if c.Sandbox == nil || !c.Sandbox.IsRunning() { - return 0, fmt.Errorf("container sandbox is not running") + if !c.isSandboxRunning() { + return 0, fmt.Errorf("container is not running") } return c.Sandbox.WaitPID(c.ID, pid, clearStatus) } @@ -421,8 +417,8 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal) error { log.Debugf("Signal container %q: %v", c.ID, sig) - if c.Status == Stopped { - return fmt.Errorf("container sandbox is stopped") + if err := c.requireStatus("running", Running); err != nil { + return err } // TODO: Query the container for its state, then save it. return c.Sandbox.Signal(c.ID, sig) @@ -432,8 +428,8 @@ func (c *Container) Signal(sig syscall.Signal) error { // The statefile will be written to f, the file at the specified image-path. func (c *Container) Checkpoint(f *os.File) error { log.Debugf("Checkpoint container %q", c.ID) - if c.Status == Stopped { - return fmt.Errorf("container sandbox is stopped") + if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil { + return err } return c.Sandbox.Checkpoint(c.ID, f) } @@ -484,8 +480,8 @@ func (c *Container) State() specs.State { // Processes retrieves the list of processes and associated metadata inside a // container. func (c *Container) Processes() ([]*control.Process, error) { - if c.Status != Running && c.Status != Paused { - return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", c.ID, c.Status) + if err := c.requireStatus("get processes of", Running, Paused); err != nil { + return nil, err } return c.Sandbox.Processes(c.ID) } @@ -544,11 +540,13 @@ func (c *Container) save() error { // root containers), and waits for the container or sandbox and the gofer // to stop. If any of them doesn't stop before timeout, an error is returned. func (c *Container) stop() error { - if c.Sandbox != nil && c.Sandbox.IsRunning() { + if c.Sandbox != nil { log.Debugf("Destroying container %q", c.ID) if err := c.Sandbox.DestroyContainer(c.ID); err != nil { return fmt.Errorf("error destroying container %q: %v", c.ID, err) } + // Only set sandbox to nil after it has been told to destroy the container. + c.Sandbox = nil } // Try killing gofer if it does not exit with container. 
@@ -567,7 +565,7 @@ func (c *Container) waitForStopped() error { defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if c.Sandbox != nil && c.Sandbox.IsRunning() { + if c.isSandboxRunning() { if err := c.Signal(syscall.Signal(0)); err == nil { return fmt.Errorf("container is still running") } @@ -689,3 +687,16 @@ func (c *Container) changeStatus(s Status) { } c.Status = s } + +func (c *Container) isSandboxRunning() bool { + return c.Sandbox != nil && c.Sandbox.IsRunning() +} + +func (c *Container) requireStatus(action string, statuses ...Status) error { + for _, s := range statuses { + if c.Status == s { + return nil + } + } + return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status) +} diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go index 749bf3782..30a4bae35 100644 --- a/runsc/sandbox/chroot.go +++ b/runsc/sandbox/chroot.go @@ -74,6 +74,8 @@ func setUpChroot() (string, error) { // tearDownChroot unmounts /proc and /runsc from the chroot before deleting the // directory. func tearDownChroot(chroot string) error { + log.Debugf("Removing chroot mounts %q", chroot) + // Unmount /proc. proc := filepath.Join(chroot, "proc") if err := syscall.Unmount(proc, 0); err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 07a6bf388..67244c725 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -451,6 +451,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund if err != nil { return fmt.Errorf("error setting up chroot: %v", err) } + s.Chroot = chroot // Remember path so it can cleaned up. cmd.SysProcAttr.Chroot = chroot cmd.Args[0] = "/runsc" cmd.Path = "/runsc" @@ -549,9 +550,9 @@ func (s *Sandbox) IsRootContainer(cid string) bool { return s.ID == cid } -// Destroy frees all resources associated with the sandbox. -// Destroy returns error if any step fails, and the function can be safely retried. -func (s *Sandbox) Destroy() error { +// Destroy frees all resources associated with the sandbox. It fails fast and +// is idempotent. +func (s *Sandbox) destroy() error { log.Debugf("Destroy sandbox %q", s.ID) if s.Pid != 0 { log.Debugf("Killing sandbox %q", s.ID) @@ -674,7 +675,12 @@ func (s *Sandbox) Stacks() (string, error) { func (s *Sandbox) DestroyContainer(cid string) error { if s.IsRootContainer(cid) { log.Debugf("Destroying root container %q by destroying sandbox", cid) - return s.Destroy() + return s.destroy() + } + + if !s.IsRunning() { + // Sandbox isn't running anymore, container is already destroyed. + return nil } log.Debugf("Destroying container %q in sandbox %q", cid, s.ID) -- cgit v1.2.3 From b4321f444727cc64da0b29623764223e48dbfddd Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 21 Sep 2018 11:40:50 -0700 Subject: runsc: Synchronize container metadata changes with a file lock. Each container has associated metadata (particularly the container status) that is manipulated by various runsc commands. This metadata is stored in a file identified by the container id. Different runsc processes may manipulate the same container metadata, and each will read/write to the metadata file. This CL adds a file lock per container which must be held when reading the container metadata file, and when modifying and writing the container metadata. 
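The locking pattern is sketched below, using the same github.com/gofrs/flock package added to WORKSPACE in this change; the directory layout and the meta.lock name mirror the diff, while the withMetadataLock wrapper itself is illustrative.

package example

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/gofrs/flock"
)

// withMetadataLock holds the container's metadata file lock while fn runs,
// so that concurrent runsc invocations cannot read or write meta.json at
// the same time.
func withMetadataLock(containerRootDir string, fn func() error) error {
	if err := os.MkdirAll(containerRootDir, 0711); err != nil {
		return fmt.Errorf("error creating container root directory %q: %v", containerRootDir, err)
	}
	lockFile := filepath.Join(containerRootDir, "meta.lock")
	l := flock.NewFlock(lockFile)
	if err := l.Lock(); err != nil {
		return fmt.Errorf("error acquiring lock on %q: %v", lockFile, err)
	}
	defer l.Unlock()
	return fn()
}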
PiperOrigin-RevId: 214019179 Change-Id: Ice4390ad233bc7f216c9a9a6cf05fb456c9ec0ad --- WORKSPACE | 6 +++ runsc/container/BUILD | 1 + runsc/container/container.go | 119 +++++++++++++++++++++++++++++++++---------- 3 files changed, 99 insertions(+), 27 deletions(-) (limited to 'runsc') diff --git a/WORKSPACE b/WORKSPACE index 88577ae3a..a305bc730 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -22,6 +22,12 @@ go_repository( commit = "66e726b43552c0bab0539b28e640b89fd6862115", ) +go_repository( + name = "com_github_gofrs_flock", + importpath = "github.com/gofrs/flock", + commit = "886344bea0798d02ff3fae16a922be5f6b26cee0" +) + go_repository( name = "com_github_google_go-cmp", importpath = "github.com/google/go-cmp", diff --git a/runsc/container/BUILD b/runsc/container/BUILD index d289e43be..72e2304bf 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -28,6 +28,7 @@ go_library( "//runsc/sandbox", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_gofrs_flock//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/container/container.go b/runsc/container/container.go index 31ab1385a..90dad1c80 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -30,6 +30,7 @@ import ( "time" "github.com/cenkalti/backoff" + "github.com/gofrs/flock" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" @@ -38,9 +39,16 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) -// metadataFilename is the name of the metadata file relative to the container -// root directory that holds sandbox metadata. -const metadataFilename = "meta.json" +const ( + // metadataFilename is the name of the metadata file relative to the + // container root directory that holds sandbox metadata. + metadataFilename = "meta.json" + + // metadataLockFilename is the name of a lock file in the container + // root directory that is used to prevent concurrent modifications to + // the container state and metadata. + metadataLockFilename = "meta.lock" +) // validateID validates the container id. func validateID(id string) error { @@ -116,6 +124,15 @@ func Load(rootDir, id string) (*Container, error) { return nil, err } + // Lock the container metadata to prevent other runsc instances from + // writing to it while we are reading it. + unlock, err := lockContainerMetadata(cRoot) + if err != nil { + return nil, err + } + defer unlock() + + // Read the container metadata file and create a new Container from it. metaFile := filepath.Join(cRoot, metadataFilename) metaBytes, err := ioutil.ReadFile(metaFile) if err != nil { @@ -204,9 +221,19 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } + // Lock the container metadata file to prevent concurrent creations of + // containers with the same id. containerRoot := filepath.Join(conf.RootDir, id) - if _, err := os.Stat(containerRoot); err == nil { - return nil, fmt.Errorf("container with id %q already exists: %q", id, containerRoot) + unlock, err := lockContainerMetadata(containerRoot) + if err != nil { + return nil, err + } + defer unlock() + + // Check if the container already exists by looking for the metadata + // file. 
+ if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil { + return nil, fmt.Errorf("container with id %q already exists", id) } else if !os.IsNotExist(err) { return nil, fmt.Errorf("error looking for existing container in %q: %v", containerRoot, err) } @@ -286,6 +313,11 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Start starts running the containerized process inside the sandbox. func (c *Container) Start(conf *boot.Config) error { log.Debugf("Start container %q", c.ID) + unlock, err := c.lock() + if err != nil { + return err + } + defer unlock() if err := c.requireStatus("start", Created); err != nil { return err } @@ -328,9 +360,15 @@ func (c *Container) Start(conf *boot.Config) error { // to restore a container from its state file. func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error { log.Debugf("Restore container %q", c.ID) + unlock, err := c.lock() + if err != nil { + return err + } + defer unlock() if err := c.requireStatus("restore", Created); err != nil { return err } + if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil { return err } @@ -438,32 +476,41 @@ func (c *Container) Checkpoint(f *os.File) error { // The call only succeeds if the container's status is created or running. func (c *Container) Pause() error { log.Debugf("Pausing container %q", c.ID) - switch c.Status { - case Created, Running: - if err := c.Sandbox.Pause(c.ID); err != nil { - return fmt.Errorf("error pausing container: %v", err) - } - c.changeStatus(Paused) - return c.save() - default: - return fmt.Errorf("container %q not created or running, not pausing", c.ID) + unlock, err := c.lock() + if err != nil { + return err + } + defer unlock() + + if c.Status != Created && c.Status != Running { + return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status) } + + if err := c.Sandbox.Pause(c.ID); err != nil { + return fmt.Errorf("error pausing container: %v", err) + } + c.changeStatus(Paused) + return c.save() } // Resume unpauses the container and its kernel. // The call only succeeds if the container's status is paused. func (c *Container) Resume() error { log.Debugf("Resuming container %q", c.ID) - switch c.Status { - case Paused: - if err := c.Sandbox.Resume(c.ID); err != nil { - return fmt.Errorf("error resuming container: %v", err) - } - c.changeStatus(Running) - return c.save() - default: - return fmt.Errorf("container %q not paused, not resuming", c.ID) + unlock, err := c.lock() + if err != nil { + return err + } + defer unlock() + + if c.Status != Paused { + return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) + } + if err := c.Sandbox.Resume(c.ID); err != nil { + return fmt.Errorf("error resuming container: %v", err) } + c.changeStatus(Running) + return c.save() } // State returns the metadata of the container. @@ -520,16 +567,15 @@ func (c *Container) Destroy() error { } // save saves the container metadata to a file. +// +// Precondition: container must be locked with container.lock(). 
func (c *Container) save() error { log.Debugf("Save container %q", c.ID) - if err := os.MkdirAll(c.Root, 0711); err != nil { - return fmt.Errorf("error creating container root directory %q: %v", c.Root, err) - } + metaFile := filepath.Join(c.Root, metadataFilename) meta, err := json.Marshal(c) if err != nil { return fmt.Errorf("error marshaling container metadata: %v", err) } - metaFile := filepath.Join(c.Root, metadataFilename) if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { return fmt.Errorf("error writing container metadata: %v", err) } @@ -700,3 +746,22 @@ func (c *Container) requireStatus(action string, statuses ...Status) error { } return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status) } + +// lock takes a file lock on the container metadata lock file. +func (c *Container) lock() (func() error, error) { + return lockContainerMetadata(filepath.Join(c.Root, c.ID)) +} + +// lockContainerMetadata takes a file lock on the metadata lock file in the +// given container root directory. +func lockContainerMetadata(containerRootDir string) (func() error, error) { + if err := os.MkdirAll(containerRootDir, 0711); err != nil { + return nil, fmt.Errorf("error creating container root directory %q: %v", containerRootDir, err) + } + f := filepath.Join(containerRootDir, metadataLockFilename) + l := flock.NewFlock(f) + if err := l.Lock(); err != nil { + return nil, fmt.Errorf("error acquiring lock on container lock file %q: %v", f, err) + } + return l.Unlock, nil +} -- cgit v1.2.3 From d260e808f478e5c5b96d574d49f315f3823aa385 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 21 Sep 2018 13:53:32 -0700 Subject: The "action" in container.Signal should be "signal". PiperOrigin-RevId: 214038776 Change-Id: I4ad212540ec4ef4fb5ab5fdcb7f0865c4f746895 --- runsc/container/container.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 90dad1c80..a1b31d861 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -455,7 +455,7 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal) error { log.Debugf("Signal container %q: %v", c.ID, sig) - if err := c.requireStatus("running", Running); err != nil { + if err := c.requireStatus("signal", Running); err != nil { return err } // TODO: Query the container for its state, then save it. -- cgit v1.2.3 From 7ce13ebcadc764c5f69215f072b53d3843bec679 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 21 Sep 2018 14:05:46 -0700 Subject: Run gofmt -s on everything PiperOrigin-RevId: 214040901 Change-Id: I74d79497a053da3624921ad2b7c5193ca4a87942 --- pkg/seccomp/seccomp.go | 2 +- pkg/seccomp/seccomp_rules.go | 2 +- pkg/seccomp/seccomp_test.go | 10 +++++----- runsc/container/container_test.go | 12 +++++------- runsc/fsgofer/filter/config.go | 6 +++--- runsc/test/testutil/testutil.go | 2 +- 6 files changed, 16 insertions(+), 18 deletions(-) (limited to 'runsc') diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index cd6b0b4bc..49da3c775 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -98,7 +98,7 @@ func buildProgram(rules SyscallRules, kill bool) ([]linux.BPFInstruction, error) // buildIndex builds a BST to quickly search through all syscalls that are whitelisted. 
func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error { syscalls := []uintptr{} - for sysno, _ := range rules { + for sysno := range rules { syscalls = append(syscalls, sysno) } diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index 4b99792fd..9215e5c90 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -120,7 +120,7 @@ func (sr SyscallRules) Merge(rules SyscallRules) { sr[sysno] = append(sr[sysno], Rule{}) } if len(rs) == 0 { - rs = []Rule{Rule{}} + rs = []Rule{{}} } sr[sysno] = append(sr[sysno], rs...) } else { diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 9f9507228..42cf85c03 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -368,19 +368,19 @@ func TestMerge(t *testing.T) { name: "empty both", main: nil, merge: nil, - want: []Rule{Rule{}, Rule{}}, + want: []Rule{{}, {}}, }, { name: "empty main", main: nil, - merge: []Rule{Rule{}}, - want: []Rule{Rule{}, Rule{}}, + merge: []Rule{{}}, + want: []Rule{{}, {}}, }, { name: "empty merge", - main: []Rule{Rule{}}, + main: []Rule{{}}, merge: nil, - want: []Rule{Rule{}, Rule{}}, + want: []Rule{{}, {}}, }, } { t.Run(tst.name, func(t *testing.T) { diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 996d80a89..083054877 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -735,13 +735,11 @@ func TestUnixDomainSockets(t *testing.T) { UID: uint32(os.Getuid()), GID: uint32(os.Getgid()), } - spec.Mounts = []specs.Mount{ - specs.Mount{ - Type: "bind", - Destination: dir, - Source: dir, - }, - } + spec.Mounts = []specs.Mount{{ + Type: "bind", + Destination: dir, + Source: dir, + }} rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 0a1c63753..35698f21f 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -65,15 +65,15 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FCHMOD: {}, syscall.SYS_FCHOWNAT: {}, syscall.SYS_FCNTL: []seccomp.Rule{ - seccomp.Rule{ + { seccomp.AllowAny{}, seccomp.AllowValue(syscall.F_GETFL), }, - seccomp.Rule{ + { seccomp.AllowAny{}, seccomp.AllowValue(syscall.F_SETFL), }, - seccomp.Rule{ + { seccomp.AllowAny{}, seccomp.AllowValue(syscall.F_GETFD), }, diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 2e7f95912..37927f395 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -134,7 +134,7 @@ func NewSpecWithArgs(args ...string) *specs.Spec { // This creates a writable mount inside the root. Also, when tmpdir points // to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs // inside the sentry. - specs.Mount{ + { Type: "bind", Destination: TmpDir(), Source: TmpDir(), -- cgit v1.2.3 From d489336784f12e1b6f92d65f53679c1226b58668 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 24 Sep 2018 17:21:16 -0700 Subject: runsc: All non-root bind mounts should be shared. This CL changes the semantics of the "--file-access" flag so that it only affects the root filesystem. The default remains "exclusive" which is the common use case, as neither Docker nor K8s supports sharing the root. Keeping the root fs as "exclusive" means that the fs-intensive work done during application startup will mostly be cacheable, and thus faster. Non-root bind mounts will always be shared. 
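In practice the two modes differ only in the 9P cache option passed to the mount; a sketch mirroring p9MountOptions from the diff below, where the standalone p9Options function is illustrative and fd is the gofer FD for the mount.

package example

import "strconv"

// p9Options builds the 9P mount options for a gofer-backed mount. shared
// selects the remote-revalidating cache policy used for shared access
// (and, after this change, for all non-root bind mounts).
func p9Options(fd int, shared bool) []string {
	opts := []string{
		"trans=fd",
		"rfdno=" + strconv.Itoa(fd),
		"wfdno=" + strconv.Itoa(fd),
		"privateunixsocket=true",
	}
	if shared {
		opts = append(opts, "cache=remote_revalidating")
	}
	return opts
}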
This CL also removes some redundant FSAccessType validations. We validate this flag in main(), so we can assume it is valid afterwards. PiperOrigin-RevId: 214359936 Change-Id: I7e75d7bf52dbd7fa834d0aacd4034868314f3b51 --- runsc/boot/fs.go | 38 ++++++++++++++------------------------ runsc/main.go | 2 +- 2 files changed, 15 insertions(+), 25 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index a97a4a3da..22d5f621c 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -204,19 +204,13 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f err error ) - switch conf.FileAccess { - case FileAccessShared, FileAccessExclusive: - fd := fds.remove() - log.Infof("Mounting root over 9P, ioFD: %d", fd) - hostFS := mustFindFilesystem("9p") - opts := p9MountOptions(conf, fd) - rootInode, err = hostFS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) - if err != nil { - return nil, fmt.Errorf("failed to generate root mount point: %v", err) - } - - default: - return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess) + fd := fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + hostFS := mustFindFilesystem("9p") + opts := p9MountOptions(fd, conf.FileAccess) + rootInode, err = hostFS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) + if err != nil { + return nil, fmt.Errorf("failed to generate root mount point: %v", err) } // We need to overlay the root on top of a ramfs with stub directories @@ -282,14 +276,10 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") case bind: - switch conf.FileAccess { - case FileAccessShared, FileAccessExclusive: - fd := fds.remove() - fsName = "9p" - opts = p9MountOptions(conf, fd) - default: - err = fmt.Errorf("invalid file access type: %v", conf.FileAccess) - } + fd := fds.remove() + fsName = "9p" + // Non-root bind mounts are always shared. + opts = p9MountOptions(fd, FileAccessShared) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly @@ -407,14 +397,14 @@ func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { } // p9MountOptions creates a slice of options for a p9 mount. -func p9MountOptions(conf *Config, fd int) []string { +func p9MountOptions(fd int, fa FileAccessType) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), "wfdno=" + strconv.Itoa(fd), "privateunixsocket=true", } - if conf.FileAccess == FileAccessShared { + if fa == FileAccessShared { opts = append(opts, "cache=remote_revalidating") } return opts @@ -500,7 +490,7 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) // Add root mount. fd := fds.remove() - opts := p9MountOptions(conf, fd) + opts := p9MountOptions(fd, conf.FileAccess) mf := fs.MountSourceFlags{} if spec.Root.Readonly { diff --git a/runsc/main.go b/runsc/main.go index 44d30768f..624db5f40 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -58,7 +58,7 @@ var ( // Flags that control sandbox runtime behavior. platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. 
Using network inside the sandbox is more secure because it's isolated from the host network.") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use: exclusive (default), shared. Setting 'shared' will disable caches and should be used if external modifications to the filesystem are expected.") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") -- cgit v1.2.3 From a003e041c86198122af7e37cc171517f977dde6a Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Wed, 26 Sep 2018 17:40:01 -0700 Subject: runsc: fix pid file race condition in exec detach mode. PiperOrigin-RevId: 214700295 Change-Id: I73d8490572eebe5da584af91914650d1953aeb91 --- runsc/cmd/exec.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 957c4f0ff..28229dbcf 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -247,10 +247,14 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // '--process' file is deleted as soon as this process returns and the child // may fail to read it. ready := func() (bool, error) { - _, err := os.Stat(pidFile) + pidb, err := ioutil.ReadFile(pidFile) if err == nil { - // File appeared, we're done! - return true, nil + // File appeared, check whether pid is fully written. + pid, err := strconv.Atoi(string(pidb)) + if err != nil { + return false, nil + } + return pid == cmd.Process.Pid, nil } if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT { return false, err -- cgit v1.2.3 From 6910ff36434f4bc5aa8c6b3094b617c7c92a9803 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 27 Sep 2018 08:57:32 -0700 Subject: Move uds_test_app to common test_app This was done so it's easier to add more functionality to this file for other tests. 
PiperOrigin-RevId: 214782043 Change-Id: I1f38b9ee1219b3ce7b789044ada8e52bdc1e6279 --- runsc/container/BUILD | 15 +++-- runsc/container/container_test.go | 6 +- runsc/container/test_app.go | 116 ++++++++++++++++++++++++++++++++++++++ runsc/container/uds_test_app.go | 83 --------------------------- 4 files changed, 126 insertions(+), 94 deletions(-) create mode 100644 runsc/container/test_app.go delete mode 100644 runsc/container/uds_test_app.go (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 72e2304bf..d72d05c13 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -2,13 +2,6 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") -go_binary( - name = "uds_test_app", - srcs = [ - "uds_test_app.go", - ], -) - go_library( name = "container", srcs = [ @@ -42,7 +35,7 @@ go_test( "multi_container_test.go", ], data = [ - ":uds_test_app", + ":test_app", "//runsc", ], embed = [":container"], @@ -64,3 +57,9 @@ go_test( "@org_golang_x_sys//unix:go_default_library", ], ) + +go_binary( + name = "test_app", + srcs = ["test_app.go"], + deps = ["@com_github_google_subcommands//:go_default_library"], +) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 083054877..efa598202 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -722,15 +722,15 @@ func TestUnixDomainSockets(t *testing.T) { } defer outputFile.Close() - app, err := testutil.FindFile("runsc/container/uds_test_app") + app, err := testutil.FindFile("runsc/container/test_app") if err != nil { - t.Fatal("error finding uds_test_app:", err) + t.Fatal("error finding test_app:", err) } socketPath := filepath.Join(dir, "uds_socket") defer os.Remove(socketPath) - spec := testutil.NewSpecWithArgs(app, "--file", outputPath, "--socket", socketPath) + spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath) spec.Process.User = specs.User{ UID: uint32(os.Getuid()), GID: uint32(os.Getgid()), diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go new file mode 100644 index 000000000..768293cf9 --- /dev/null +++ b/runsc/container/test_app.go @@ -0,0 +1,116 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary test_app is like a swiss knife for tests that need to run anything +// inside the sandbox. New functionality can be added with new commands. +package main + +import ( + "context" + "fmt" + "log" + "net" + "os" + "strconv" + "time" + + "flag" + "github.com/google/subcommands" +) + +func main() { + subcommands.Register(subcommands.HelpCommand(), "") + subcommands.Register(subcommands.FlagsCommand(), "") + subcommands.Register(new(uds), "") + + flag.Parse() + + exitCode := subcommands.Execute(context.Background()) + os.Exit(int(exitCode)) +} + +type uds struct { + fileName string + socketPath string +} + +// Name implements subcommands.Command.Name. 
+func (*uds) Name() string { + return "uds" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*uds) Synopsis() string { + return "creates unix domain socket client and server. Client sends a constant flow of sequential numbers. Server prints them to --file" +} + +// Usage implements subcommands.Command.Usage. +func (*uds) Usage() string { + return "uds <flags>" +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *uds) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.fileName, "file", "", "name of output file") + f.StringVar(&c.socketPath, "socket", "", "path to socket") +} + +// Execute implements subcommands.Command.Execute. +func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if c.fileName == "" || c.socketPath == "" { + log.Fatalf("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath) + return subcommands.ExitFailure + } + outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666) + if err != nil { + log.Fatal("error opening output file:", err) + } + + defer os.Remove(c.socketPath) + + listener, err := net.Listen("unix", c.socketPath) + if err != nil { + log.Fatalf("error listening on socket %q: %v", c.socketPath, err) + } + + go server(listener, outputFile) + for i := 0; ; i++ { + conn, err := net.Dial("unix", c.socketPath) + if err != nil { + log.Fatal("error dialing:", err) + } + if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { + log.Fatal("error writing:", err) + } + conn.Close() + time.Sleep(100 * time.Millisecond) + } +} + +func server(listener net.Listener, out *os.File) { + buf := make([]byte, 16) + + for { + c, err := listener.Accept() + if err != nil { + log.Fatal("error accepting connection:", err) + } + nr, err := c.Read(buf) + if err != nil { + log.Fatal("error reading from buf:", err) + } + data := buf[0:nr] + fmt.Fprint(out, string(data)+"\n") + } +} diff --git a/runsc/container/uds_test_app.go b/runsc/container/uds_test_app.go deleted file mode 100644 index bef98ac66..000000000 --- a/runsc/container/uds_test_app.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary uds-test-app opens a socket and reads a series of numbers -// which are then written to an output file.
-package main - -import ( - "flag" - "fmt" - "log" - "net" - "os" - "strconv" - "time" -) - -var ( - fileName = flag.String("file", "", "name of output file") - socketPath = flag.String("socket", "", "path to socket") -) - -func server(listener net.Listener, f *os.File) { - buf := make([]byte, 16) - - for { - c, err := listener.Accept() - if err != nil { - log.Fatal("error accepting connection:", err) - } - nr, err := c.Read(buf) - if err != nil { - log.Fatal("error reading from buf:", err) - } - data := buf[0:nr] - fmt.Fprintf(f, string(data)+"\n") - } -} - -func main() { - flag.Parse() - if *fileName == "" || *socketPath == "" { - log.Fatalf("Flags cannot be empty, given: fileName=%s, socketPath=%s", *fileName, *socketPath) - } - outputFile, err := os.OpenFile(*fileName, os.O_WRONLY|os.O_CREATE, 0666) - if err != nil { - log.Fatal("error opening output file:", err) - } - - socket := *socketPath - defer os.Remove(socket) - - listener, err := net.Listen("unix", socket) - if err != nil { - log.Fatal("error listening on socket:", err) - } - - go server(listener, outputFile) - for i := 0; ; i++ { - - conn, err := net.Dial("unix", socket) - if err != nil { - log.Fatal("error dialing:", err) - } - if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { - log.Fatal("error writing:", err) - } - conn.Close() - time.Sleep(100 * time.Millisecond) - } - -} -- cgit v1.2.3 From b514ab05897bca53c1d4f71c912f2977b3134daf Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 27 Sep 2018 10:25:19 -0700 Subject: Refactor 'runsc boot' to take container ID as argument This makes the flow slightly simpler (no need to call Loader.SetRootContainer). And this is required change to tag tasks with container ID inside the Sentry. PiperOrigin-RevId: 214795210 Change-Id: I6ff4af12e73bb07157f7058bb15fd5bb88760884 --- runsc/boot/controller.go | 1 - runsc/boot/loader.go | 12 +++++++----- runsc/boot/loader_test.go | 3 +-- runsc/cmd/boot.go | 6 +++--- runsc/sandbox/sandbox.go | 3 +++ 5 files changed, 14 insertions(+), 11 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 7d7803e92..bc33e028a 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -171,7 +171,6 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { if err := <-cm.startResultChan; err != nil { return fmt.Errorf("failed to start sandbox: %v", err) } - cm.l.setRootContainerID(*cid) return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 5867eec96..9a5d649ab 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -142,7 +142,7 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. 
-func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { +func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { if err := usage.Init(); err != nil { return nil, fmt.Errorf("Error setting up memory usage: %v", err) } @@ -286,6 +286,9 @@ func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs [] spec: spec, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, + sandboxID: id, + containerRootTGs: make(map[string]*kernel.ThreadGroup), + execProcesses: make(map[execID]*kernel.ThreadGroup), } ctrl.manager.l = l return l, nil @@ -420,10 +423,9 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } - if l.execProcesses != nil { - return fmt.Errorf("there shouldn't already be a cache of exec'd processes, but found: %v", l.execProcesses) - } - l.execProcesses = make(map[execID]*kernel.ThreadGroup) + l.mu.Lock() + l.containerRootTGs[l.sandboxID] = l.k.GlobalInit() + l.mu.Unlock() // Start signal forwarding only after an init process is created. l.stopSignalForwarding = l.startSignalForwarding() diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index a8a796445..0b363253d 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -101,7 +101,7 @@ func createLoader() (*Loader, func(), error) { return nil, nil, err } - l, err := New(spec, conf, fd, -1 /* device fd */, []int{sandEnd}, false) + l, err := New("foo", spec, conf, fd, -1 /* device fd */, []int{sandEnd}, false) if err != nil { cleanup() return nil, nil, err @@ -129,7 +129,6 @@ func TestRun(t *testing.T) { }() // Run the container. - l.setRootContainerID("foo") if err := l.Run(); err != nil { t.Errorf("error running container: %v", err) } diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 035147cf1..933ba2d9e 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -69,7 +69,7 @@ func (*Boot) Synopsis() string { // Usage implements subcommands.Command.Usage. func (*Boot) Usage() string { - return `boot [flags]` + return `boot [flags] ` } // SetFlags implements subcommands.Command.SetFlags. @@ -86,7 +86,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. It starts a sandbox in a // waiting state. func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if b.specFD == -1 || b.controllerFD == -1 || f.NArg() != 0 { + if b.specFD == -1 || b.controllerFD == -1 || f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } @@ -138,7 +138,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.console) + l, err := boot.New(f.Arg(0), spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.console) if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 67244c725..c3d90d5f4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -460,6 +460,9 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } } + // Add container as the last argument. + cmd.Args = append(cmd.Args, s.ID) + // Log the fds we are donating to the sandbox process. 
for i, f := range cmd.ExtraFiles { log.Debugf("Donating FD %d: %q", i+3, f.Name()) -- cgit v1.2.3 From 491faac03b2815ca1bc9b5425c1b3f6291468e20 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 27 Sep 2018 15:00:03 -0700 Subject: Implement 'runsc kill --all' In order to implement kill --all correctly, the Sentry needs to track all tasks that belong to a given container. This change introduces ContainerID to the task, that gets inherited by all children. 'kill --all' then iterates over all tasks comparing the ContainerID field to find all processes that need to be signalled. PiperOrigin-RevId: 214841768 Change-Id: I693b2374be8692d88cc441ef13a0ae34abf73ac6 --- pkg/sentry/control/proc.go | 14 ++- pkg/sentry/kernel/kernel.go | 25 +++++ pkg/sentry/kernel/task.go | 12 ++ pkg/sentry/kernel/task_clone.go | 1 + pkg/sentry/kernel/task_start.go | 4 + runsc/boot/controller.go | 29 ++--- runsc/boot/loader.go | 26 +++-- runsc/cmd/kill.go | 15 ++- runsc/container/BUILD | 6 +- runsc/container/container.go | 11 +- runsc/container/container_test.go | 38 +++++-- runsc/container/multi_container_test.go | 190 ++++++++++++++++++++++++++++---- runsc/container/test_app.go | 63 +++++++++++ runsc/sandbox/sandbox.go | 16 ++- 14 files changed, 371 insertions(+), 79 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index b120471cb..106055e86 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -83,6 +83,9 @@ type ExecArgs struct { // FilePayload determines the files to give to the new process. urpc.FilePayload + + // ContainerID is the container for the process being executed. + ContainerID string } // Exec runs a new task. @@ -133,6 +136,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), + ContainerID: args.ContainerID, } if initArgs.Root != nil { // initArgs must hold a reference on Root. This ref is dropped @@ -182,7 +186,7 @@ type PsArgs struct { // Ps provides a process listing for the running kernel. func (proc *Proc) Ps(args *PsArgs, out *string) error { var p []*Process - if e := Processes(proc.Kernel, &p); e != nil { + if e := Processes(proc.Kernel, "", &p); e != nil { return e } if !args.JSON { @@ -258,8 +262,9 @@ func PrintPIDsJSON(pl []*Process) (string, error) { return string(b), nil } -// Processes retrieves information about processes running in the sandbox. -func Processes(k *kernel.Kernel, out *[]*Process) error { +// Processes retrieves information about processes running in the sandbox with +// the given container id. All processes are returned if 'containerID' is empty. +func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ts := k.TaskSet() now := k.RealtimeClock().Now() for _, tg := range ts.Root.ThreadGroups() { @@ -268,6 +273,9 @@ func Processes(k *kernel.Kernel, out *[]*Process) error { if pid == 0 { continue } + if containerID != "" && containerID != tg.Leader().ContainerID() { + continue + } ppid := kernel.ThreadID(0) if tg.Leader().Parent() != nil { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index f71e32ac9..1ace0b501 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -524,6 +524,9 @@ type CreateProcessArgs struct { // Anyone setting Root must donate a reference (i.e. 
increment it) to // keep it alive until it is decremented by CreateProcess. Root *fs.Dirent + + // ContainerID is the container that the process belongs to. + ContainerID string } // NewContext returns a context.Context that represents the task that will be @@ -660,6 +663,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, + ContainerID: args.ContainerID, } t, err := k.tasks.NewTask(config) if err != nil { @@ -818,6 +822,27 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { k.sendExternalSignal(info, context) } +// SendContainerSignal sends the given signal to all processes inside the +// namespace that match the given container ID. +func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + for t := range k.tasks.Root.tids { + if t == t.tg.leader && t.ContainerID() == cid { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + infoCopy := *info + if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + return err + } + } + } + return nil +} + // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2f6f825ac..07ad1614c 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -205,6 +205,13 @@ type Task struct { // k is the Kernel that this task belongs to. The k pointer is immutable. k *Kernel + // containerID has no equivalent in Linux; it's used by runsc to track all + // tasks that belong to a given containers since cgroups aren't implemented. + // It's inherited by the children, is immutable, and may be empty. + // + // NOTE: cgroups can be used to track this when implemented. + containerID string + // mu protects some of the following fields. mu sync.Mutex `state:"nosave"` @@ -678,3 +685,8 @@ func (t *Task) MountNamespace() *fs.MountNamespace { func (t *Task) AbstractSockets() *AbstractSocketNamespace { return t.abstractSockets } + +// ContainerID returns t's container ID. +func (t *Task) ContainerID() string { + return t.containerID +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 46c688b20..130bd652b 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -258,6 +258,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, + ContainerID: t.ContainerID(), } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 6ce99d268..6c8d7d316 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -77,6 +77,9 @@ type TaskConfig struct { // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace + + // ContainerID is the container the new task belongs to. + ContainerID string } // NewTask creates a new task defined by cfg. 
@@ -124,6 +127,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { abstractSockets: cfg.AbstractSocketNamespace, rseqCPU: -1, futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, } t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index bc33e028a..116a8369c 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -174,10 +174,17 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { return nil } +// ProcessesArgs container arguments to Processes method. +type ProcessesArgs struct { + // CID restricts the result to processes belonging to + // the given container. Empty means all. + CID string +} + // Processes retrieves information about processes running in the sandbox. -func (cm *containerManager) Processes(_, out *[]*control.Process) error { +func (cm *containerManager) Processes(args *ProcessesArgs, out *[]*control.Process) error { log.Debugf("containerManager.Processes") - return control.Processes(cm.l.k, out) + return control.Processes(cm.l.k, args.CID, out) } // StartArgs contains arguments to the Start method. @@ -326,19 +333,11 @@ func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { return nil } -// ExecArgs contains arguments to Execute. -type ExecArgs struct { - control.ExecArgs - - // CID is the ID of the container to exec in. - CID string -} - // ExecuteAsync starts running a command on a created or running sandbox. It // returns the pid of the new process. -func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { +func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { log.Debugf("containerManager.ExecuteAsync: %+v", args) - tgid, err := cm.l.executeAsync(&args.ExecArgs, args.CID) + tgid, err := cm.l.executeAsync(args) if err != nil { return err } @@ -503,11 +502,15 @@ type SignalArgs struct { // Signo is the signal to send to the process. Signo int32 + + // All is set when signal should be sent to all processes in the container. + // When false, the signal is sent to the root container process only. + All bool } // Signal sends a signal to the init process of the container. // TODO: Send signal to exec process. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal") - return cm.l.signal(args.CID, args.Signo) + return cm.l.signal(args.CID, args.Signo, args.All) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9a5d649ab..bd6e146fc 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -270,7 +270,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) } - procArgs, err := newProcess(spec, creds, k) + procArgs, err := newProcess(id, spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -295,7 +295,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. 
ls, err := createLimitSet(spec) if err != nil { @@ -314,6 +314,7 @@ func newProcess(spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (ke UTSNamespace: k.RootUTSNamespace(), IPCNamespace: k.RootIPCNamespace(), AbstractSocketNamespace: k.RootAbstractSocketNamespace(), + ContainerID: id, } return procArgs, nil } @@ -465,7 +466,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // TODO New containers should be started in new PID namespaces // when indicated by the spec. - procArgs, err := newProcess(spec, creds, l.k) + procArgs, err := newProcess(cid, spec, creds, l.k) if err != nil { return fmt.Errorf("failed to create new process: %v", err) } @@ -525,14 +526,14 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return nil } -func (l *Loader) executeAsync(args *control.ExecArgs, cid string) (kernel.ThreadID, error) { +func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. l.mu.Lock() - tg, ok := l.containerRootTGs[cid] + tg, ok := l.containerRootTGs[args.ContainerID] l.mu.Unlock() if !ok { - return 0, fmt.Errorf("cannot exec in container %q: no such container", cid) + return 0, fmt.Errorf("cannot exec in container %q: no such container", args.ContainerID) } tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() @@ -552,7 +553,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs, cid string) (kernel.Thread // later. l.mu.Lock() defer l.mu.Unlock() - eid := execID{cid: cid, pid: tgid} + eid := execID{cid: args.ContainerID, pid: tgid} l.execProcesses[eid] = tg log.Debugf("updated execProcesses: %v", l.execProcesses) @@ -671,8 +672,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } -// TODO: Support sending signal to all. -func (l *Loader) signal(cid string, signo int32) error { +func (l *Loader) signal(cid string, signo int32, all bool) error { l.mu.Lock() tg, ok := l.containerRootTGs[cid] l.mu.Unlock() @@ -681,5 +681,13 @@ func (l *Loader) signal(cid string, signo int32) error { } si := arch.SignalInfo{Signo: signo} + if all { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &si) + } return tg.Leader().SendSignal(&si) } diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 6fa5674f1..af709bc71 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -29,7 +29,9 @@ import ( ) // Kill implements subcommands.Command for the "kill" command. -type Kill struct{} +type Kill struct { + all bool +} // Name implements subcommands.Command.Name. func (*Kill) Name() string { @@ -47,15 +49,12 @@ func (*Kill) Usage() string { } // SetFlags implements subcommands.Command.SetFlags. -func (*Kill) SetFlags(f *flag.FlagSet) { - // TODO: Implement this flag. It is defined here just to - // prevent runsc from crashing if it is passed. - var all bool - f.BoolVar(&all, "all", false, "send the specified signal to all processes inside the container") +func (k *Kill) SetFlags(f *flag.FlagSet) { + f.BoolVar(&k.all, "all", false, "send the specified signal to all processes inside the container") } // Execute implements subcommands.Command.Execute. 
-func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { +func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { if f.NArg() == 0 || f.NArg() > 2 { f.Usage() return subcommands.ExitUsageError @@ -83,7 +82,7 @@ func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su } // TODO: Distinguish between already-exited containers and // genuine errors. - if err := c.Signal(sig); err != nil { + if err := c.Signal(sig, k.all); err != nil { Fatalf("%v", err) } return subcommands.ExitSuccess diff --git a/runsc/container/BUILD b/runsc/container/BUILD index d72d05c13..e68fb1e8e 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -53,6 +53,7 @@ go_test( "//runsc/boot", "//runsc/specutils", "//runsc/test/testutil", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -61,5 +62,8 @@ go_test( go_binary( name = "test_app", srcs = ["test_app.go"], - deps = ["@com_github_google_subcommands//:go_default_library"], + deps = [ + "//runsc/test/testutil", + "@com_github_google_subcommands//:go_default_library", + ], ) diff --git a/runsc/container/container.go b/runsc/container/container.go index a1b31d861..44b7dad8a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -159,7 +159,7 @@ func Load(rootDir, id string) (*Container, error) { } else if c.Status == Running { // Container state should reflect the actual state of the application, so // we don't consider gofer process here. - if err := c.Signal(syscall.Signal(0)); err != nil { + if err := c.Signal(syscall.Signal(0), false); err != nil { c.changeStatus(Stopped) } } @@ -398,7 +398,8 @@ func (c *Container) Execute(args *control.ExecArgs) (int32, error) { if err := c.requireStatus("execute in", Created, Running); err != nil { return 0, err } - return c.Sandbox.Execute(c.ID, args) + args.ContainerID = c.ID + return c.Sandbox.Execute(args) } // Event returns events for the container. @@ -453,13 +454,13 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // Signal sends the signal to the container. // Signal returns an error if the container is already stopped. // TODO: Distinguish different error types. -func (c *Container) Signal(sig syscall.Signal) error { +func (c *Container) Signal(sig syscall.Signal, all bool) error { log.Debugf("Signal container %q: %v", c.ID, sig) if err := c.requireStatus("signal", Running); err != nil { return err } // TODO: Query the container for its state, then save it. - return c.Sandbox.Signal(c.ID, sig) + return c.Sandbox.Signal(c.ID, sig, all) } // Checkpoint sends the checkpoint call to the container. 
@@ -612,7 +613,7 @@ func (c *Container) waitForStopped() error { b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { if c.isSandboxRunning() { - if err := c.Signal(syscall.Signal(0)); err == nil { + if err := c.Signal(syscall.Signal(0), false); err == nil { return fmt.Errorf("container is still running") } } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index efa598202..de1e50a3f 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -30,6 +30,7 @@ import ( "testing" "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -49,21 +50,34 @@ func init() { } // waitForProcessList waits for the given process list to show up in the container. -func waitForProcessList(cont *Container, expected []*control.Process) error { - var got []*control.Process - for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { - var err error - got, err = cont.Processes() +func waitForProcessList(cont *Container, want []*control.Process) error { + cb := func() error { + got, err := cont.Processes() if err != nil { - return fmt.Errorf("error getting process data from container: %v", err) + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} } - if procListsEqual(got, expected) { - return nil + if !procListsEqual(got, want) { + return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) } - // Process might not have started, try again... - time.Sleep(10 * time.Millisecond) + return nil + } + return testutil.Poll(cb, 5*time.Second) +} + +func waitForProcessCount(cont *Container, want int) error { + cb := func() error { + pss, err := cont.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + if got := len(pss); got != want { + return fmt.Errorf("wrong process count, got: %d, want: %d", got, want) + } + return nil } - return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(expected)) + return testutil.Poll(cb, 5*time.Second) } // procListsEqual is used to check whether 2 Process lists are equal for all @@ -345,7 +359,7 @@ func TestLifecycle(t *testing.T) { <-ch time.Sleep(100 * time.Millisecond) // Send the container a SIGTERM which will cause it to stop. - if err := c.Signal(syscall.SIGTERM); err != nil { + if err := c.Signal(syscall.SIGTERM, false); err != nil { t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) } // Wait for it to die. diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 2867aa3b9..dc938066b 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -16,6 +16,7 @@ package container import ( "io/ioutil" + "math" "os" "path" "path/filepath" @@ -91,11 +92,16 @@ func TestMultiContainerSanity(t *testing.T) { // Check via ps that multiple processes are running. 
expectedPL := []*control.Process{ {PID: 1, Cmd: "sleep"}, - {PID: 2, Cmd: "sleep"}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } + expectedPL = []*control.Process{ + {PID: 2, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } } } @@ -134,10 +140,9 @@ func TestMultiContainerWait(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, {PID: 2, Cmd: "sleep"}, } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -179,7 +184,10 @@ func TestMultiContainerWait(t *testing.T) { // After Wait returns, ensure that the root container is running and // the child has finished. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) } } @@ -219,17 +227,16 @@ func TestExecWait(t *testing.T) { containers = append(containers, cont) } - // Check via ps that multiple processes are running. + // Check via ps that process is running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, {PID: 2, Cmd: "sleep"}, } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Fatalf("failed to wait for sleep to start: %v", err) } // Wait for the second container to finish. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + if err := waitForProcessCount(containers[1], 0); err != nil { t.Fatalf("failed to wait for second container to stop: %v", err) } @@ -256,7 +263,10 @@ func TestExecWait(t *testing.T) { } // Wait for the exec'd process to exit. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Fatalf("failed to wait for second container to stop: %v", err) } @@ -360,23 +370,25 @@ func TestMultiContainerSignal(t *testing.T) { containers = append(containers, cont) } - // Check via ps that multiple processes are running. + // Check via ps that container 1 process is running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, {PID: 2, Cmd: "sleep"}, } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } // Kill process 2. - if err := containers[1].Signal(syscall.SIGKILL); err != nil { + if err := containers[1].Signal(syscall.SIGKILL, false); err != nil { t.Errorf("failed to kill process 2: %v", err) } // Make sure process 1 is still running. 
- if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -395,18 +407,18 @@ func TestMultiContainerSignal(t *testing.T) { t.Errorf("error waiting for gofer to exit: %v", err) } // Make sure process 1 is still running. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } // Now that process 2 is gone, ensure we get an error trying to // signal it again. - if err := containers[1].Signal(syscall.SIGKILL); err == nil { + if err := containers[1].Signal(syscall.SIGKILL, false); err == nil { t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) } // Kill process 1. - if err := containers[0].Signal(syscall.SIGKILL); err != nil { + if err := containers[0].Signal(syscall.SIGKILL, false); err != nil { t.Errorf("failed to kill process 1: %v", err) } @@ -428,7 +440,7 @@ func TestMultiContainerSignal(t *testing.T) { } // The sentry should be gone, so signaling should yield an error. - if err := containers[0].Signal(syscall.SIGKILL); err == nil { + if err := containers[0].Signal(syscall.SIGKILL, false); err == nil { t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) } } @@ -453,7 +465,6 @@ func TestMultiContainerDestroy(t *testing.T) { // Setup the containers. var containers []*Container for i, spec := range specs { - conf := testutil.TestConfig() bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -501,3 +512,144 @@ func TestMultiContainerDestroy(t *testing.T) { } } } + +func TestMultiContainerProcesses(t *testing.T) { + // Note: use 'while true' to keep 'sh' process around. Otherwise, shell will + // just execve into 'sleep' and both containers will look the same. + specs, ids := createSpecs( + []string{"sleep", "100"}, + []string{"sh", "-c", "while true; do sleep 100; done"}) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + var containers []*Container + for i, spec := range specs { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Check root's container process list doesn't include other containers. + expectedPL0 := []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL0); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + + // Same for the other container. 
+ expectedPL1 := []*control.Process{ + {PID: 2, Cmd: "sh"}, + {PID: 3, PPID: 2, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[1], expectedPL1); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + + // Now exec into the second container and verify it shows up in the container. + args := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "100"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"}) + if err := waitForProcessList(containers[1], expectedPL1); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + // Root container should remain unchanged. + if err := waitForProcessList(containers[0], expectedPL0); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } +} + +// TestMultiContainerKillAll checks that all process that belong to a container +// are killed when SIGKILL is sent to *all* processes in that container. +func TestMultiContainerKillAll(t *testing.T) { + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // First container will remain intact while the second container is killed. + specs, ids := createSpecs( + []string{app, "task-tree", "--depth=2", "--width=2"}, + []string{app, "task-tree", "--depth=4", "--width=2"}) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + var containers []*Container + for i, spec := range specs { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Wait until all processes are created. + rootProcCount := int(math.Pow(2, 3) - 1) + if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatal(err) + } + procCount := int(math.Pow(2, 5) - 1) + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatal(err) + } + + // Exec more processes to ensure signal works for exec'd processes too. + args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "task-tree", "--depth=2", "--width=2"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + procCount += 3 + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatal(err) + } + + // Kill'Em All + containers[1].Signal(syscall.SIGKILL, true) + + // Check that all processes are gone. + if err := waitForProcessCount(containers[1], 0); err != nil { + t.Fatal(err) + } + // Check that root container was not affected. 
+ if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatal(err) + } +} diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index 768293cf9..a99eb97c4 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -22,17 +22,20 @@ import ( "log" "net" "os" + "os/exec" "strconv" "time" "flag" "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) func main() { subcommands.Register(subcommands.HelpCommand(), "") subcommands.Register(subcommands.FlagsCommand(), "") subcommands.Register(new(uds), "") + subcommands.Register(new(taskTree), "") flag.Parse() @@ -114,3 +117,63 @@ func server(listener net.Listener, out *os.File) { fmt.Fprint(out, string(data)+"\n") } } + +type taskTree struct { + depth int + width int +} + +// Name implements subcommands.Command. +func (*taskTree) Name() string { + return "task-tree" +} + +// Synopsis implements subcommands.Command. +func (*taskTree) Synopsis() string { + return "creates a tree of tasks" +} + +// Usage implements subcommands.Command. +func (*taskTree) Usage() string { + return "task-tree " +} + +// SetFlags implements subcommands.Command. +func (c *taskTree) SetFlags(f *flag.FlagSet) { + f.IntVar(&c.depth, "depth", 1, "number of levels to create") + f.IntVar(&c.width, "width", 1, "number of tasks at each level") +} + +// Execute implements subcommands.Command. +func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + + if c.depth == 0 { + log.Printf("Child sleeping, PID: %d\n", os.Getpid()) + for { + time.Sleep(24 * time.Hour) + } + } + log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) + + var cmds []*exec.Cmd + for i := 0; i < c.width; i++ { + cmd := exec.Command( + "/proc/self/exe", c.Name(), + "--depth", strconv.Itoa(c.depth-1), + "--width", strconv.Itoa(c.width)) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Start(); err != nil { + log.Fatal("failed to call self:", err) + } + cmds = append(cmds, cmd) + } + + for _, c := range cmds { + c.Wait() + } + return subcommands.ExitSuccess +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index c3d90d5f4..ef85f175f 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -183,10 +183,9 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { } defer conn.Close() + args := boot.ProcessesArgs{CID: cid} var pl []*control.Process - // TODO: Pass in the container id (cid) here. The sandbox - // should return process info for only that container. - if err := conn.Call(boot.ContainerProcesses, nil, &pl); err != nil { + if err := conn.Call(boot.ContainerProcesses, &args, &pl); err != nil { return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) } return pl, nil @@ -194,19 +193,17 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { // Execute runs the specified command in the container. It returns the pid of // the newly created process. 
-func (s *Sandbox) Execute(cid string, args *control.ExecArgs) (int32, error) { - log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) +func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { + log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) conn, err := s.sandboxConnect() if err != nil { return 0, s.connError(err) } defer conn.Close() - rpcArgs := &boot.ExecArgs{ExecArgs: *args, CID: cid} - // Send a message to the sandbox control server to start the container. var pid int32 - if err := conn.Call(boot.ContainerExecuteAsync, rpcArgs, &pid); err != nil { + if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } return pid, nil @@ -575,7 +572,7 @@ func (s *Sandbox) destroy() error { } // Signal sends the signal to a container in the sandbox. -func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { +func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { log.Debugf("Signal sandbox %q", s.ID) conn, err := s.sandboxConnect() if err != nil { @@ -586,6 +583,7 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { args := boot.SignalArgs{ CID: cid, Signo: int32(sig), + All: all, } if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { return fmt.Errorf("err signaling container %q: %v", cid, err) -- cgit v1.2.3 From b709d239870143102cf4e44b65cc26cea78a6ccb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 27 Sep 2018 18:15:07 -0700 Subject: Forward ioctl(TCSETSF) calls on host ttys to the host kernel. We already forward TCSETS and TCSETSW. TCSETSF is roughly equivalent but discards pending input. The filters were relaxed to allow host ioctls with TCSETSF argument. This fixes programs like "passwd" that prevent user input from being displayed on the terminal. 
Before: root@b8a0240fc836:/# passwd Enter new UNIX password: 123 Retype new UNIX password: 123 passwd: password updated successfully After: root@ae6f5dabe402:/# passwd Enter new UNIX password: Retype new UNIX password: passwd: password updated successfully PiperOrigin-RevId: 214869788 Change-Id: I31b4d1373c1388f7b51d0f2f45ce40aa8e8b0b58 --- pkg/abi/linux/ioctl.go | 1 + pkg/sentry/fs/host/file.go | 2 +- runsc/boot/filter/config.go | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 4d7a2dfd7..1c9dc7b03 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -21,6 +21,7 @@ const ( TCGETS = 0x00005401 TCSETS = 0x00005402 TCSETSW = 0x00005403 + TCSETSF = 0x00005404 TIOCGPGRP = 0x0000540f TIOCSPGRP = 0x00005410 TIOCOUTQ = 0x00005411 diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 8d2463c78..6f469b5cc 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -305,7 +305,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case linux.TCSETS, linux.TCSETSW: + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: var termios linux.Termios if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ AddressSpaceActive: true, diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 0bcc640d5..352c64253 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -147,6 +147,11 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(linux.TCSETS), seccomp.AllowAny{}, /* termios struct */ }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSF), + seccomp.AllowAny{}, /* termios struct */ + }, { seccomp.AllowAny{}, /* fd */ seccomp.AllowValue(linux.TCSETSW), -- cgit v1.2.3 From 1166c088fc51c83af3198e25d5e774103ae976fc Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 27 Sep 2018 22:52:25 -0700 Subject: Move common test code to function PiperOrigin-RevId: 214890335 Change-Id: I42743f0ce46a5a42834133bce2f32d187194fc87 --- runsc/container/container_test.go | 6 +- runsc/container/multi_container_test.go | 256 ++++++++++---------------------- 2 files changed, 80 insertions(+), 182 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index de1e50a3f..c71bcc46d 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -62,7 +62,8 @@ func waitForProcessList(cont *Container, want []*control.Process) error { } return nil } - return testutil.Poll(cb, 5*time.Second) + // Gives plenty of time as tests can run slow under --race. + return testutil.Poll(cb, 30*time.Second) } func waitForProcessCount(cont *Container, want int) error { @@ -77,7 +78,8 @@ func waitForProcessCount(cont *Container, want int) error { } return nil } - return testutil.Poll(cb, 5*time.Second) + // Gives plenty of time as tests can run slow under --race. 
+ return testutil.Poll(cb, 30*time.Second) } // procListsEqual is used to check whether 2 Process lists are equal for all diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index dc938066b..8c98bed22 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -15,6 +15,7 @@ package container import ( + "fmt" "io/ioutil" "math" "os" @@ -56,38 +57,60 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { return specs, ids } +func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + return nil, nil, fmt.Errorf("error creating root dir: %v", err) + } + + var containers []*Container + var bundles []string + cleanup := func() { + for _, c := range containers { + c.Destroy() + } + for _, b := range bundles { + os.RemoveAll(b) + } + os.RemoveAll(rootDir) + } + for i, spec := range specs { + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("error setting up container: %v", err) + } + bundles = append(bundles, bundleDir) + + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("error creating container: %v", err) + } + containers = append(containers, cont) + + if err := cont.Start(conf); err != nil { + cleanup() + return nil, nil, fmt.Errorf("error starting container: %v", err) + } + } + return containers, cleanup, nil +} + // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - // Setup the containers. sleep := []string{"sleep", "100"} specs, ids := createSpecs(sleep, sleep) - var containers []*Container - for i, spec := range specs { - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) } + defer cleanup() // Check via ps that multiple processes are running. expectedPL := []*control.Process{ @@ -106,37 +129,18 @@ func TestMultiContainerSanity(t *testing.T) { } func TestMultiContainerWait(t *testing.T) { - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - // The first container should run the entire duration of the test. cmd1 := []string{"sleep", "100"} // We'll wait on the second container, which is much shorter lived. cmd2 := []string{"sleep", "1"} specs, ids := createSpecs(cmd1, cmd2) - // Setup the containers. 
- var containers []*Container - for i, spec := range specs { - conf := testutil.TestConfig() - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + conf := testutil.TestConfig() + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) } + defer cleanup() // Check via ps that multiple processes are running. expectedPL := []*control.Process{ @@ -206,26 +210,12 @@ func TestExecWait(t *testing.T) { // We'll wait on the second container, which is much shorter lived. cmd2 := []string{"sleep", "1"} specs, ids := createSpecs(cmd1, cmd2) - - // Setup the containers. - var containers []*Container - for i, spec := range specs { - conf := testutil.TestConfig() - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + conf := testutil.TestConfig() + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) } + defer cleanup() // Check via ps that process is running. expectedPL := []*control.Process{ @@ -284,12 +274,6 @@ func TestExecWait(t *testing.T) { // TestMultiContainerMount tests that bind mounts can be used with multiple // containers. func TestMultiContainerMount(t *testing.T) { - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - cmd1 := []string{"sleep", "100"} // 'src != dst' ensures that 'dst' doesn't exist in the host and must be @@ -309,24 +293,12 @@ func TestMultiContainerMount(t *testing.T) { }) // Setup the containers. - var containers []*Container - for i, spec := range sps { - conf := testutil.TestConfig() - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + conf := testutil.TestConfig() + containers, cleanup, err := startContainers(conf, sps, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) } + defer cleanup() ws, err := containers[1].Wait() if err != nil { @@ -343,32 +315,14 @@ func TestMultiContainerSignal(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - // Setup the containers. 
sleep := []string{"sleep", "100"} specs, ids := createSpecs(sleep, sleep) - var containers []*Container - for i, spec := range specs { - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) } + defer cleanup() // Check via ps that container 1 process is running. expectedPL := []*control.Process{ @@ -452,34 +406,14 @@ func TestMultiContainerDestroy(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - // Two containers that will run for a long time. We will // destroy the second one. specs, ids := createSpecs([]string{"sleep", "100"}, []string{"sleep", "100"}) - - // Setup the containers. - var containers []*Container - for i, spec := range specs { - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) } + defer cleanup() // Exec in the root container to check for the existence of the // second containers root filesystem directory. @@ -519,31 +453,12 @@ func TestMultiContainerProcesses(t *testing.T) { specs, ids := createSpecs( []string{"sleep", "100"}, []string{"sh", "-c", "while true; do sleep 100; done"}) - - rootDir, err := testutil.SetupRootDir() + conf := testutil.TestConfig() + containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - var containers []*Container - for i, spec := range specs { - conf := testutil.TestConfig() - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + t.Fatalf("error starting containers: %v", err) } + defer cleanup() // Check root's container process list doesn't include other containers. 
expectedPL0 := []*control.Process{ @@ -592,31 +507,12 @@ func TestMultiContainerKillAll(t *testing.T) { specs, ids := createSpecs( []string{app, "task-tree", "--depth=2", "--width=2"}, []string{app, "task-tree", "--depth=4", "--width=2"}) - - rootDir, err := testutil.SetupRootDir() + conf := testutil.TestConfig() + containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - var containers []*Container - for i, spec := range specs { - conf := testutil.TestConfig() - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - containers = append(containers, cont) + t.Fatalf("error starting containers: %v", err) } + defer cleanup() // Wait until all processes are created. rootProcCount := int(math.Pow(2, 3) - 1) -- cgit v1.2.3 From 6779bd1187e2b0f8692ab8a16d8d1681f0e674c5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 27 Sep 2018 23:54:13 -0700 Subject: Merge Loader.containerRootTGs and execProcess into a single map It's easier to manage a single map with processes that we're interested to track. This will make the next change to clean up the map on destroy easier. PiperOrigin-RevId: 214894210 Change-Id: I099247323a0487cd0767120df47ba786fac0926d --- runsc/boot/controller.go | 23 ++++++++++++----- runsc/boot/loader.go | 66 ++++++++++++++++++++---------------------------- 2 files changed, 43 insertions(+), 46 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 116a8369c..362e74df5 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -250,7 +250,8 @@ func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { cm.l.mu.Lock() defer cm.l.mu.Unlock() - if tg, ok := cm.l.containerRootTGs[*cid]; ok { + key := execID{cid: *cid} + if tg, ok := cm.l.processes[key]; ok { // Send SIGKILL to threadgroup. if err := tg.SendSignal(&arch.SignalInfo{ Signo: int32(linux.SIGKILL), @@ -265,7 +266,7 @@ func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { } // Remove the container thread group from the map. - delete(cm.l.containerRootTGs, *cid) + delete(cm.l.processes, key) } // Clean up the filesystem by unmounting all mounts for this container @@ -379,9 +380,9 @@ type RestoreOpts struct { } // Restore loads a container from a statefile. -// The container's current kernel is destroyed, a restore environment is created, -// and the kernel is recreated with the restore state file. The container then sends the -// signal to start. +// The container's current kernel is destroyed, a restore environment is +// created, and the kernel is recreated with the restore state file. The +// container then sends the signal to start. func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") @@ -455,12 +456,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.rootProcArgs = kernel.CreateProcessArgs{} cm.l.restore = true + // Reinitialize the sandbox ID and processes map. Note that it doesn't + // restore the state of multiple containers, nor exec processes. 
+ cm.l.sandboxID = o.SandboxID + cm.l.mu.Lock() + key := execID{cid: o.SandboxID} + cm.l.processes = map[execID]*kernel.ThreadGroup{key: cm.l.k.GlobalInit()} + cm.l.mu.Unlock() + // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} if err := <-cm.startResultChan; err != nil { return fmt.Errorf("failed to start sandbox: %v", err) } - cm.l.setRootContainerID(o.SandboxID) + return nil } @@ -511,6 +520,6 @@ type SignalArgs struct { // Signal sends a signal to the init process of the container. // TODO: Send signal to exec process. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { - log.Debugf("containerManager.Signal") + log.Debugf("containerManager.Signal %q %d, all: %t", args.CID, args.Signo, args.All) return cm.l.signal(args.CID, args.Signo, args.All) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index bd6e146fc..52c251812 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -104,26 +104,18 @@ type Loader struct { // sandboxID is the ID for the whole sandbox. sandboxID string - // mu guards containerRootTGs and execProcesses. + // mu guards processes. mu sync.Mutex - // containerRootTGs maps container IDs to their root processes. It - // can be used to determine which process to manipulate when clients - // call methods on particular containers. + // processes maps containers root process and invocation of exec. Root + // processes are keyed with container ID and pid=0, while exec invocations + // have the corresponding pid set. // - // containerRootTGs is guarded by mu. + // processes is guardded by mu. // // TODO: When containers are removed via `runsc delete`, - // containerRootTGs should be cleaned up. - containerRootTGs map[string]*kernel.ThreadGroup - - // execProcesses maps each invocation of exec to the process it spawns. - // - // execProcesses is guardded by mu. - // - // TODO: When containers are removed via `runsc delete`, - // execProcesses should be cleaned up. - execProcesses map[execID]*kernel.ThreadGroup + // processes should be cleaned up. + processes map[execID]*kernel.ThreadGroup } // execID uniquely identifies a sentry process. @@ -287,8 +279,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, sandboxID: id, - containerRootTGs: make(map[string]*kernel.ThreadGroup), - execProcesses: make(map[execID]*kernel.ThreadGroup), + processes: make(map[execID]*kernel.ThreadGroup), } ctrl.manager.l = l return l, nil @@ -425,7 +416,8 @@ func (l *Loader) run() error { } l.mu.Lock() - l.containerRootTGs[l.sandboxID] = l.k.GlobalInit() + key := execID{cid: l.sandboxID} + l.processes[key] = l.k.GlobalInit() l.mu.Unlock() // Start signal forwarding only after an init process is created. @@ -521,7 +513,8 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGs[cid] = tg + key := execID{cid: cid} + l.processes[key] = tg return nil } @@ -530,7 +523,8 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. 
l.mu.Lock() - tg, ok := l.containerRootTGs[args.ContainerID] + rootKey := execID{cid: args.ContainerID} + tg, ok := l.processes[rootKey] l.mu.Unlock() if !ok { return 0, fmt.Errorf("cannot exec in container %q: no such container", args.ContainerID) @@ -549,13 +543,13 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("error executing: %+v: %v", args, err) } - // Insert the process into execProcesses so that we can wait on it + // Insert the process into processes so that we can wait on it // later. l.mu.Lock() defer l.mu.Unlock() eid := execID{cid: args.ContainerID, pid: tgid} - l.execProcesses[eid] = tg - log.Debugf("updated execProcesses: %v", l.execProcesses) + l.processes[eid] = tg + log.Debugf("updated processes: %v", l.processes) return tgid, nil } @@ -567,12 +561,12 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. l.mu.Lock() - tg, ok := l.containerRootTGs[cid] + key := execID{cid: cid} + tg, ok := l.processes[key] + l.mu.Unlock() if !ok { - defer l.mu.Unlock() - return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGs) + return fmt.Errorf("can't find process for container %q in %v", cid, l.processes) } - l.mu.Unlock() // If the thread either has already exited or exits during waiting, // consider the container exited. @@ -590,10 +584,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai }*/ // If the process was started via runsc exec, it will have an - // entry in l.execProcesses. + // entry in l.processes. l.mu.Lock() eid := execID{cid: cid, pid: tgid} - tg, ok := l.execProcesses[eid] + tg, ok := l.processes[eid] l.mu.Unlock() if ok { ws := l.wait(tg) @@ -601,8 +595,8 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai if clearStatus { // Remove tg from the cache. l.mu.Lock() - delete(l.execProcesses, eid) - log.Debugf("updated execProcesses (removal): %v", l.execProcesses) + delete(l.processes, eid) + log.Debugf("updated processes (removal): %v", l.processes) l.mu.Unlock() } return nil @@ -626,13 +620,6 @@ func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { return tg.ExitStatus().Status() } -func (l *Loader) setRootContainerID(cid string) { - l.mu.Lock() - defer l.mu.Unlock() - l.containerRootTGs = map[string]*kernel.ThreadGroup{cid: l.k.GlobalInit()} - l.sandboxID = cid -} - // WaitForStartSignal waits for a start signal from the control server. func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan @@ -674,7 +661,8 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { func (l *Loader) signal(cid string, signo int32, all bool) error { l.mu.Lock() - tg, ok := l.containerRootTGs[cid] + key := execID{cid: cid} + tg, ok := l.processes[key] l.mu.Unlock() if !ok { return fmt.Errorf("failed to signal container %q: no such container", cid) -- cgit v1.2.3 From cf226d48ce8c49409049e03ed405366db9fc2a04 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Sep 2018 09:43:13 -0700 Subject: Switch to root in userns when CAP_SYS_CHROOT is also missing Some tests check current capabilities and re-run the tests as root inside userns if required capabibilities are missing. It was checking for CAP_SYS_ADMIN only, CAP_SYS_CHROOT is also required now. 
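A self-contained sketch of the generalized check this change introduces (the hasAllCaps name and the main wrapper are illustrative only; the patch's actual helper is specutils.HasCapabilities, shown in the diff below):

package main

import (
	"fmt"
	"os"

	"github.com/syndtr/gocapability/capability"
)

// hasAllCaps reports whether the current process holds every capability in cs
// in its effective set.
func hasAllCaps(cs ...capability.Cap) bool {
	caps, err := capability.NewPid2(os.Getpid())
	if err != nil {
		return false
	}
	// Load must be called before the capability sets can be queried.
	if err := caps.Load(); err != nil {
		return false
	}
	for _, c := range cs {
		if !caps.Get(capability.EFFECTIVE, c) {
			return false
		}
	}
	return true
}

func main() {
	// The tests now check for both capabilities before deciding whether to
	// re-execute themselves as root inside a new user namespace.
	fmt.Println(hasAllCaps(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT))
}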
PiperOrigin-RevId: 214949226 Change-Id: Ic81363969fa76c04da408fae8ea7520653266312 --- runsc/cmd/capability.go | 6 +++++- runsc/cmd/cmd.go | 1 - runsc/sandbox/BUILD | 1 + runsc/sandbox/sandbox.go | 7 ++++--- runsc/specutils/namespace.go | 22 +++++++--------------- runsc/test/testutil/BUILD | 1 + runsc/test/testutil/testutil.go | 11 ++++++----- 7 files changed, 24 insertions(+), 25 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go index affbb7ce3..0b18c5481 100644 --- a/runsc/cmd/capability.go +++ b/runsc/cmd/capability.go @@ -60,7 +60,11 @@ func applyCaps(caps *specs.LinuxCapabilities) error { newCaps.Set(c, set...) } - return newCaps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS) + if err := newCaps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS); err != nil { + return err + } + log.Infof("Capabilities applied: %+v", newCaps) + return nil } func getCaps(which capability.CapType, caps *specs.LinuxCapabilities) []string { diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index 44ebd7165..2937ae1c4 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -85,7 +85,6 @@ func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error { return err } - log.Infof("Capabilities applied: %+v", caps) log.Infof("Execve %q again, bye!", binPath) syscall.Exec(binPath, args, []string{}) panic("unreachable") diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 7ae19ff35..09965dcc0 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -25,6 +25,7 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", ], ) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ef85f175f..d288be1d2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -26,6 +26,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/pkg/control/client" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" @@ -415,7 +416,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // as user nobody. if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) - } else if specutils.CanSetUIDGID() { + } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { // Map nobody in the new namespace to nobody in the parent namespace. const nobody = 65534 cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{{ @@ -442,7 +443,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // bind-mount the executable inside it. if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode without chroot. 
This is only safe in tests!") - } else if specutils.HasCapSysAdmin() { + } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { log.Infof("Sandbox will be started in minimal chroot") chroot, err := setUpChroot() if err != nil { @@ -453,7 +454,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args[0] = "/runsc" cmd.Path = "/runsc" } else { - return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") + return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN and CAP_SYS_CHROOT") } } diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 48a199a77..00293d45b 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -204,8 +204,8 @@ func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { } } -// CanSetUIDGID returns true if the user has SETUID and SETGID capabilities. -func CanSetUIDGID() bool { +// HasCapabilities returns true if the user has all capabilties in 'cs'. +func HasCapabilities(cs ...capability.Cap) bool { caps, err := capability.NewPid2(os.Getpid()) if err != nil { return false @@ -213,18 +213,10 @@ func CanSetUIDGID() bool { if err := caps.Load(); err != nil { return false } - return caps.Get(capability.EFFECTIVE, capability.CAP_SETUID) && - caps.Get(capability.EFFECTIVE, capability.CAP_SETGID) -} - -// HasCapSysAdmin returns true if the user has CAP_SYS_ADMIN capability. -func HasCapSysAdmin() bool { - caps, err := capability.NewPid2(os.Getpid()) - if err != nil { - return false - } - if err := caps.Load(); err != nil { - return false + for _, c := range cs { + if !caps.Get(capability.EFFECTIVE, c) { + return false + } } - return caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) + return true } diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 03ab3c4ac..ca91e07ff 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -18,5 +18,6 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 37927f395..706db74a7 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -32,6 +32,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -234,12 +235,12 @@ func WaitForHTTP(port int, timeout time.Duration) error { return Poll(cb, timeout) } -// RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create -// a new user namespace and reexecute the test as root inside of the namespace. -// This functionr returns when it's running as root. If it needs to create -// another process, it will exit from there and not return. +// RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If +// need it will create a new user namespace and reexecute the test as root +// inside of the namespace. This functionr returns when it's running as root. If +// it needs to create another process, it will exit from there and not return. 
func RunAsRoot() { - if specutils.HasCapSysAdmin() { + if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { return } -- cgit v1.2.3 From fb65b0b471621b430969fe1c3009bee68209bf67 Mon Sep 17 00:00:00 2001 From: Googler Date: Fri, 28 Sep 2018 12:16:47 -0700 Subject: Change tcpip.Route.Mask to tcpip.AddressMask. PiperOrigin-RevId: 214975659 Change-Id: I7bd31a2c54f03ff52203109da312e4206701c44c --- pkg/dhcp/dhcp_test.go | 2 +- pkg/tcpip/adapters/gonet/gonet_test.go | 4 ++-- pkg/tcpip/network/ipv6/icmp_test.go | 4 ++-- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/tcpip_test.go | 2 +- runsc/boot/network.go | 8 +++++++- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'runsc') diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index a187c5c2a..d60e3752b 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -57,7 +57,7 @@ func createStack(t *testing.T) *stack.Stack { s.SetRouteTable([]tcpip.Route{{ Destination: tcpip.Address(strings.Repeat("\x00", 4)), - Mask: tcpip.Address(strings.Repeat("\x00", 4)), + Mask: tcpip.AddressMask(strings.Repeat("\x00", 4)), Gateway: "", NIC: nicid, }}) diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 86a82f21d..79b7c77ee 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -68,7 +68,7 @@ func newLoopbackStack() (*stack.Stack, *tcpip.Error) { // IPv4 { Destination: tcpip.Address(strings.Repeat("\x00", 4)), - Mask: tcpip.Address(strings.Repeat("\x00", 4)), + Mask: tcpip.AddressMask(strings.Repeat("\x00", 4)), Gateway: "", NIC: NICID, }, @@ -76,7 +76,7 @@ func newLoopbackStack() (*stack.Stack, *tcpip.Error) { // IPv6 { Destination: tcpip.Address(strings.Repeat("\x00", 16)), - Mask: tcpip.Address(strings.Repeat("\x00", 16)), + Mask: tcpip.AddressMask(strings.Repeat("\x00", 16)), Gateway: "", NIC: NICID, }, diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index b8e53c13e..fabbdc8c7 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -108,14 +108,14 @@ func newTestContext(t *testing.T) *testContext { c.s0.SetRouteTable( []tcpip.Route{{ Destination: lladdr1, - Mask: tcpip.Address(strings.Repeat("\xff", 16)), + Mask: tcpip.AddressMask(strings.Repeat("\xff", 16)), NIC: 1, }}, ) c.s1.SetRouteTable( []tcpip.Route{{ Destination: lladdr0, - Mask: tcpip.Address(strings.Repeat("\xff", 16)), + Mask: tcpip.AddressMask(strings.Repeat("\xff", 16)), NIC: 1, }}, ) diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index a4b28a7a3..910d1257f 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -150,7 +150,7 @@ func main() { s.SetRouteTable([]tcpip.Route{ { Destination: tcpip.Address(strings.Repeat("\x00", len(addr))), - Mask: tcpip.Address(strings.Repeat("\x00", len(addr))), + Mask: tcpip.AddressMask(strings.Repeat("\x00", len(addr))), Gateway: "", NIC: 1, }, diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 5f210cdd0..f5b5ec86b 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -490,7 +490,7 @@ type Route struct { // Mask specifies which bits of the Destination and the target address // must match for this row to be viable. - Mask Address + Mask AddressMask // Gateway is the gateway to be used if this row is viable. 
Gateway Address diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go index 9b20c74c6..d283f71c7 100644 --- a/pkg/tcpip/tcpip_test.go +++ b/pkg/tcpip/tcpip_test.go @@ -123,7 +123,7 @@ func TestSubnetCreation(t *testing.T) { func TestRouteMatch(t *testing.T) { tests := []struct { d Address - m Address + m AddressMask a Address want bool }{ diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0e43c91be..6a2678ac9 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -86,7 +86,7 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route { return tcpip.Route{ Destination: ipToAddress(r.Destination), Gateway: ipToAddress(r.Gateway), - Mask: ipToAddress(net.IP(r.Mask)), + Mask: ipToAddressMask(net.IP(r.Mask)), NIC: id, } } @@ -203,6 +203,12 @@ func ipToAddress(ip net.IP) tcpip.Address { return addr } +// ipToAddressMask converts IP to tcpip.AddressMask, ignoring the protocol. +func ipToAddressMask(ip net.IP) tcpip.AddressMask { + _, addr := ipToAddressAndProto(ip) + return tcpip.AddressMask(addr) +} + // generateRndMac returns a random local MAC address. // Copied from eth_random_addr() (include/linux/etherdevice.h) func generateRndMac() net.HardwareAddr { -- cgit v1.2.3 From 2496d9b4b6343154525f73e9583a4a60bebcfa30 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Sep 2018 12:20:56 -0700 Subject: Make runsc kill and delete more conformant to the "spec" PiperOrigin-RevId: 214976251 Change-Id: I631348c3886f41f63d0e77e7c4f21b3ede2ab521 --- runsc/boot/controller.go | 89 +-------------------------------- runsc/boot/fs.go | 67 ++++++++++++++++++++++++- runsc/boot/loader.go | 67 +++++++++++++++++++------ runsc/cmd/boot.go | 2 + runsc/container/container.go | 22 ++++++-- runsc/container/multi_container_test.go | 49 ++++++++++++++---- runsc/container/test_app.go | 70 ++++++++++++++++++++++++-- runsc/sandbox/sandbox.go | 3 +- 8 files changed, 248 insertions(+), 121 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 362e74df5..98356e8b7 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -21,10 +21,8 @@ import ( "path" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -32,7 +30,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/state" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" - "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/urpc" ) @@ -247,91 +244,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // filesystem. func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { log.Debugf("containerManager.destroy %q", *cid) - cm.l.mu.Lock() - defer cm.l.mu.Unlock() - - key := execID{cid: *cid} - if tg, ok := cm.l.processes[key]; ok { - // Send SIGKILL to threadgroup. - if err := tg.SendSignal(&arch.SignalInfo{ - Signo: int32(linux.SIGKILL), - Code: arch.SignalInfoUser, - }); err == nil { - // SIGKILL sent. Now wait for it to exit. 
- log.Debugf("Waiting for container process to exit.") - tg.WaitExited() - log.Debugf("Container process exited.") - } else if err != syserror.ESRCH { - return fmt.Errorf("error sending SIGKILL to container %q: %v", *cid, err) - } - - // Remove the container thread group from the map. - delete(cm.l.processes, key) - } - - // Clean up the filesystem by unmounting all mounts for this container - // and deleting the container root directory. - - // First get a reference to the container root directory. - mns := cm.l.k.RootMountNamespace() - mnsRoot := mns.Root() - defer mnsRoot.DecRef() - ctx := cm.l.rootProcArgs.NewContext(cm.l.k) - containerRoot := path.Join(ChildContainersDir, *cid) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) - if err == syserror.ENOENT { - // Container must have been destroyed already. That's fine. - return nil - } - if err != nil { - return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) - } - defer containerRootDirent.DecRef() - - // Iterate through all submounts and unmount them. We unmount lazily by - // setting detach=true, so we can unmount in any order. - for _, m := range containerRootDirent.Inode.MountSource.Submounts() { - root := m.Root() - defer root.DecRef() - - // Do a best-effort unmount by flushing the refs and unmount - // with "detach only = true". - log.Debugf("Unmounting container submount %q", root.BaseName()) - m.FlushDirentRefs() - if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { - return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) - } - } - - // Unmount the container root itself. - log.Debugf("Unmounting container root %q", containerRoot) - containerRootDirent.Inode.MountSource.FlushDirentRefs() - if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { - return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) - } - - // Get a reference to the parent directory and remove the root - // container directory. - containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) - if err != nil { - return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) - } - defer containersDirDirent.DecRef() - log.Debugf("Deleting container root %q", containerRoot) - if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, *cid); err != nil { - return fmt.Errorf("error removing directory %q: %v", containerRoot, err) - } - - // Flushing dirent references triggers many async close operations. We - // must wait for those to complete before returning, otherwise the - // caller may kill the gofer before they complete, causing a cascade of - // failing RPCs. - log.Infof("Waiting for async filesystem operations to complete") - fs.AsyncBarrier() - - // We made it! - log.Debugf("Destroyed container %q", *cid) - return nil + return cm.l.destroyContainer(*cid) } // ExecuteAsync starts running a command on a created or running sandbox. It diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 22d5f621c..9e8fea7e1 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,6 +16,7 @@ package boot import ( "fmt" + "path" "path/filepath" "strconv" "strings" @@ -576,9 +577,9 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -// setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. 
+// setupContainerFS is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { +func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If @@ -676,3 +677,65 @@ func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *ke procArgs.Filename = f return nil } + +// destroyContainerFS cleans up the filesystem by unmounting all mounts for the +// given container and deleting the container root directory. +func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error { + // First get a reference to the container root directory. + mns := k.RootMountNamespace() + mnsRoot := mns.Root() + defer mnsRoot.DecRef() + containerRoot := path.Join(ChildContainersDir, cid) + containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) + if err == syserror.ENOENT { + // Container must have been destroyed already. That's fine. + return nil + } + if err != nil { + return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) + } + defer containerRootDirent.DecRef() + + // Iterate through all submounts and unmount them. We unmount lazily by + // setting detach=true, so we can unmount in any order. + for _, m := range containerRootDirent.Inode.MountSource.Submounts() { + root := m.Root() + defer root.DecRef() + + // Do a best-effort unmount by flushing the refs and unmount + // with "detach only = true". + log.Debugf("Unmounting container submount %q", root.BaseName()) + m.FlushDirentRefs() + if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) + } + } + + // Unmount the container root itself. + log.Debugf("Unmounting container root %q", containerRoot) + containerRootDirent.Inode.MountSource.FlushDirentRefs() + if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) + } + + // Get a reference to the parent directory and remove the root + // container directory. + containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) + } + defer containersDirDirent.DecRef() + log.Debugf("Deleting container root %q", containerRoot) + if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil { + return fmt.Errorf("error removing directory %q: %v", containerRoot, err) + } + + // Flushing dirent references triggers many async close operations. We + // must wait for those to complete before returning, otherwise the + // caller may kill the gofer before they complete, causing a cascade of + // failing RPCs. 
+ log.Infof("Waiting for async filesystem operations to complete") + fs.AsyncBarrier() + + return nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 52c251812..1e2a12280 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -112,9 +112,6 @@ type Loader struct { // have the corresponding pid set. // // processes is guardded by mu. - // - // TODO: When containers are removed via `runsc delete`, - // processes should be cleaned up. processes map[execID]*kernel.ThreadGroup } @@ -385,7 +382,7 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { - if err := setFileSystemForProcess( + if err := setupContainerFS( &l.rootProcArgs, l.spec, l.conf, @@ -476,7 +473,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config stdioFDs := ioFDs[:3] goferFDs := ioFDs[3:] - if err := setFileSystemForProcess( + if err := setupContainerFS( &procArgs, spec, conf, @@ -519,6 +516,34 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return nil } +// destroyContainer stops a container if it is still running and cleans up its +// filesystem. +func (l *Loader) destroyContainer(cid string) error { + // First kill and wait for all processes in the container. + if err := l.signal(cid, int32(linux.SIGKILL), true /*all*/); err != nil { + return fmt.Errorf("failed to SIGKILL all container processes: %v", err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + // Remove all container thread groups from the map. + for key := range l.processes { + if key.cid == cid { + delete(l.processes, key) + } + } + + ctx := l.rootProcArgs.NewContext(l.k) + if err := destroyContainerFS(ctx, cid, l.k); err != nil { + return fmt.Errorf("failed to destroy filesystem for container %q: %v", cid, err) + } + + // We made it! + log.Debugf("Container destroyed %q", cid) + return nil +} + func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. @@ -669,13 +694,27 @@ func (l *Loader) signal(cid string, signo int32, all bool) error { } si := arch.SignalInfo{Signo: signo} - if all { - // Pause the kernel to prevent new processes from being created while - // the signal is delivered. This prevents process leaks when SIGKILL is - // sent to the entire container. - l.k.Pause() - defer l.k.Unpause() - return l.k.SendContainerSignal(cid, &si) - } - return tg.Leader().SendSignal(&si) + if !all { + return tg.Leader().SendSignal(&si) + } + + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + if err := l.k.SendContainerSignal(cid, &si); err != nil { + l.k.Unpause() + return err + } + l.k.Unpause() + + // If killing all processes, wait for them to exit. + if all && linux.Signal(signo) == linux.SIGKILL { + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + } + return nil } diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 933ba2d9e..82e534479 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -142,6 +142,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error creating loader: %v", err) } + // Fatalf exits the process and doesn't run defers. 
'l' must be destroyed + // explicitly! // Notify other processes the loader has been created. l.NotifyLoaderCreated() diff --git a/runsc/container/container.go b/runsc/container/container.go index 44b7dad8a..e65800b8d 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -72,6 +72,21 @@ func validateID(id string) error { // Containers must write their metadata files after any change to their internal // states. The entire container directory is deleted when the container is // destroyed. +// +// When the container is stopped, all processes that belong to the container +// must be stopped before Destroy() returns. containerd makes roughly the +// following calls to stop a container: +// - First it attempts to kill the container process with +// 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a +// separate thread, it's waiting on the container. As soon as the wait +// returns, it moves on to the next step: +// - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to +// the container. 'kill --all SIGKILL' waits for all processes before +// returning. +// - Containerd waits for stdin, stdout and stderr to drain and be closed. +// - It calls 'runsc delete'. runc implementation kills --all SIGKILL once +// again just to be sure, waits, and then proceeds with remaining teardown. +// type Container struct { // ID is the container ID. ID string `json:"id"` @@ -451,7 +466,8 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er return c.Sandbox.WaitPID(c.ID, pid, clearStatus) } -// Signal sends the signal to the container. +// Signal sends the signal to the container. If all is true and signal is +// SIGKILL, then waits for all processes to exit before returning. // Signal returns an error if the container is already stopped. // TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal, all bool) error { @@ -534,8 +550,8 @@ func (c *Container) Processes() ([]*control.Process, error) { return c.Sandbox.Processes(c.ID) } -// Destroy frees all resources associated with the container. It fails fast and -// is idempotent. +// Destroy stops all processes and frees all resources associated with the +// container. It fails fast and is idempotent. func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 8c98bed22..3d7385a82 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -25,6 +25,7 @@ import ( "sync" "syscall" "testing" + "time" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/sentry/control" @@ -403,12 +404,18 @@ func TestMultiContainerSignal(t *testing.T) { // TestMultiContainerDestroy checks that container are properly cleaned-up when // they are destroyed. func TestMultiContainerDestroy(t *testing.T) { + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) - // Two containers that will run for a long time. We will - // destroy the second one. - specs, ids := createSpecs([]string{"sleep", "100"}, []string{"sleep", "100"}) + // First container will remain intact while the second container is killed. 
+ specs, ids := createSpecs( + []string{app, "reaper"}, + []string{app, "fork-bomb"}) containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -416,26 +423,48 @@ func TestMultiContainerDestroy(t *testing.T) { defer cleanup() // Exec in the root container to check for the existence of the - // second containers root filesystem directory. + // second container's root filesystem directory. contDir := path.Join(boot.ChildContainersDir, containers[1].ID) - args := &control.ExecArgs{ + dirArgs := &control.ExecArgs{ Filename: "/usr/bin/test", Argv: []string{"test", "-d", contDir}, } - if ws, err := containers[0].executeSync(args); err != nil { - t.Fatalf("error executing %+v: %v", args, err) + if ws, err := containers[0].executeSync(dirArgs); err != nil { + t.Fatalf("error executing %+v: %v", dirArgs, err) } else if ws.ExitStatus() != 0 { t.Errorf("exec 'test -f %q' got exit status %d, wanted 0", contDir, ws.ExitStatus()) } - // Destory the second container. + // Exec more processes to ensure signal all works for exec'd processes too. + args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "fork-bomb"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + + // Let it brew... + time.Sleep(500 * time.Millisecond) + if err := containers[1].Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) } + // Check that destroy killed all processes belonging to the container and + // waited for them to exit before returning. + pss, err := containers[0].Sandbox.Processes("") + if err != nil { + t.Fatalf("error getting process data from sandbox: %v", err) + } + expectedPL := []*control.Process{{PID: 1, Cmd: "test_app"}} + if !procListsEqual(pss, expectedPL) { + t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL)) + } + // Now the container dir should be gone. - if ws, err := containers[0].executeSync(args); err != nil { - t.Fatalf("error executing %+v: %v", args, err) + if ws, err := containers[0].executeSync(dirArgs); err != nil { + t.Fatalf("error executing %+v: %v", dirArgs, err) } else if ws.ExitStatus() == 0 { t.Errorf("exec 'test -f %q' got exit status 0, wanted non-zero", contDir) } diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index a99eb97c4..f69cfdf83 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -36,6 +36,8 @@ func main() { subcommands.Register(subcommands.FlagsCommand(), "") subcommands.Register(new(uds), "") subcommands.Register(new(taskTree), "") + subcommands.Register(new(forkBomb), "") + subcommands.Register(new(reaper), "") flag.Parse() @@ -151,9 +153,7 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa if c.depth == 0 { log.Printf("Child sleeping, PID: %d\n", os.Getpid()) - for { - time.Sleep(24 * time.Hour) - } + select {} } log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) @@ -177,3 +177,67 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa } return subcommands.ExitSuccess } + +type forkBomb struct { + delay time.Duration +} + +// Name implements subcommands.Command. +func (*forkBomb) Name() string { + return "fork-bomb" +} + +// Synopsis implements subcommands.Command. +func (*forkBomb) Synopsis() string { + return "creates child process until the end of times" +} + +// Usage implements subcommands.Command. 
+func (*forkBomb) Usage() string { + return "fork-bomb " +} + +// SetFlags implements subcommands.Command. +func (c *forkBomb) SetFlags(f *flag.FlagSet) { + f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child") +} + +// Execute implements subcommands.Command. +func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + time.Sleep(c.delay) + + cmd := exec.Command("/proc/self/exe", c.Name()) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + log.Fatal("failed to call self:", err) + } + return subcommands.ExitSuccess +} + +type reaper struct{} + +// Name implements subcommands.Command. +func (*reaper) Name() string { + return "reaper" +} + +// Synopsis implements subcommands.Command. +func (*reaper) Synopsis() string { + return "reaps all children in a loop" +} + +// Usage implements subcommands.Command. +func (*reaper) Usage() string { + return "reaper " +} + +// SetFlags implements subcommands.Command. +func (*reaper) SetFlags(*flag.FlagSet) {} + +// Execute implements subcommands.Command. +func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + select {} +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index d288be1d2..4111b1a60 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -572,7 +572,8 @@ func (s *Sandbox) destroy() error { return nil } -// Signal sends the signal to a container in the sandbox. +// Signal sends the signal to a container in the sandbox. If all is true and +// signal is SIGKILL, then waits for all processes to exit before returning. func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { log.Debugf("Signal sandbox %q", s.ID) conn, err := s.sandboxConnect() -- cgit v1.2.3 From 49ff81a42b51a3fa2ee139e1e86179fa0c427a86 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Sep 2018 15:51:36 -0700 Subject: Add ruby image tests PiperOrigin-RevId: 215009066 Change-Id: I54ab920fa649cf4d0817f7cb8ea76f9126523330 --- runsc/test/image/BUILD | 2 ++ runsc/test/image/image_test.go | 49 ++++++++++++++++++++++++++++++++++++++++++ runsc/test/image/ruby.rb | 23 ++++++++++++++++++++ runsc/test/image/ruby.sh | 20 +++++++++++++++++ runsc/test/testutil/docker.go | 17 ++++++++++----- 5 files changed, 106 insertions(+), 5 deletions(-) create mode 100644 runsc/test/image/ruby.rb create mode 100644 runsc/test/image/ruby.sh (limited to 'runsc') diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index 5854eec12..c41161d50 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -11,6 +11,8 @@ go_test( data = [ "latin10k.txt", "mysql.sql", + "ruby.rb", + "ruby.sh", ], embed = [":image"], tags = [ diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 13fd8f1ee..5f90ca9d2 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -30,6 +30,7 @@ import ( "io/ioutil" "net/http" "os" + "path/filepath" "strings" "testing" "time" @@ -256,6 +257,54 @@ func TestTomcat(t *testing.T) { } } +func TestRuby(t *testing.T) { + if err := testutil.Pull("ruby"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("ruby-test") + + dir, err := testutil.PrepareFiles("ruby.rb", "ruby.sh") + if err != nil { + t.Fatalf("PrepareFiles() failed: %v", err) + } + if err := os.Chmod(filepath.Join(dir, "ruby.sh"), 0333); 
err != nil { + t.Fatalf("os.Chmod(%q, 0333) failed: %v", dir, err) + } + + if _, err := d.Run("-p", "8080", "-v", testutil.MountArg(dir, "/src:ro"), "ruby", "/src/ruby.sh"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + // Find where port 8080 is mapped to. + port, err := d.FindPort(8080) + if err != nil { + t.Fatalf("docker.FindPort(8080) failed: %v", err) + } + + // Wait until it's up and running, 'gem install' can take some time. + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + t.Fatalf("WaitForHTTP() timeout: %v", err) + } + + // Ensure that content is being served. + url := fmt.Sprintf("http://localhost:%d", port) + resp, err := http.Get(url) + if err != nil { + t.Errorf("error reaching http server: %v", err) + } + if want := http.StatusOK; resp.StatusCode != want { + t.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want) + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatalf("error reading body: %v", err) + } + if got, want := string(body), "Hello World"; !strings.Contains(got, want) { + t.Errorf("invalid body content, got: %q, want: %q", got, want) + } +} + func MainTest(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/image/ruby.rb b/runsc/test/image/ruby.rb new file mode 100644 index 000000000..ae5de3419 --- /dev/null +++ b/runsc/test/image/ruby.rb @@ -0,0 +1,23 @@ +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +require 'sinatra' + +set :bind, "0.0.0.0" +set :port, 8080 + +get '/' do + 'Hello World' +end + diff --git a/runsc/test/image/ruby.sh b/runsc/test/image/ruby.sh new file mode 100644 index 000000000..54be2c931 --- /dev/null +++ b/runsc/test/image/ruby.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +gem install sinatra +ruby /src/ruby.rb diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index c73bb0406..d0446df4e 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -162,6 +162,11 @@ func (d *Docker) Run(args ...string) (string, error) { return do(a...) } +// Logs calls 'docker logs'. +func (d *Docker) Logs() (string, error) { + return do("logs", d.Name) +} + // Exec calls 'docker exec' with the arguments provided. 
func (d *Docker) Exec(args ...string) (string, error) { a := []string{"exec", d.Name} @@ -193,12 +198,14 @@ func (d *Docker) Remove() error { return nil } -// CleanUp kills and deletes the container. -func (d *Docker) CleanUp() error { +// CleanUp kills and deletes the container (best effort). +func (d *Docker) CleanUp() { if _, err := do("kill", d.Name); err != nil { - return fmt.Errorf("error killing container %q: %v", d.Name, err) + log.Printf("error killing container %q: %v", d.Name, err) + } + if err := d.Remove(); err != nil { + log.Print(err) } - return d.Remove() } // FindPort returns the host port that is mapped to 'sandboxPort'. This calls @@ -223,7 +230,7 @@ func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, e var out string for exp := time.Now().Add(timeout); time.Now().Before(exp); { var err error - out, err = do("logs", d.Name) + out, err = d.Logs() if err != nil { return "", err } -- cgit v1.2.3 From f21dde566641ee9d80730cc04f16d75df8b05036 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Fri, 28 Sep 2018 15:51:51 -0700 Subject: runsc: allow `kill --all` when container is in stopped state. PiperOrigin-RevId: 215009105 Change-Id: I1ab12eddf7694c4db98f6dafca9dae352a33f7c4 --- runsc/container/container.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index e65800b8d..e09ed9347 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -472,9 +472,17 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal, all bool) error { log.Debugf("Signal container %q: %v", c.ID, sig) - if err := c.requireStatus("signal", Running); err != nil { + // Signaling container in Stopped state is allowed. When all=false, + // an error will be returned anyway; when all=true, this allows + // sending signal to other processes inside the container even + // after the init process exits. This is especially useful for + // container cleanup. + if err := c.requireStatus("signal", Running, Stopped); err != nil { return err } + if !c.isSandboxRunning() { + return fmt.Errorf("container is not running") + } // TODO: Query the container for its state, then save it. return c.Sandbox.Signal(c.ID, sig, all) } -- cgit v1.2.3 From cfdd418fe23880cad88639596c1171cbe7ad6ffb Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Sep 2018 17:47:22 -0700 Subject: Made a few changes to make testutil.Docker easier to use PiperOrigin-RevId: 215023376 Change-Id: I139569bd15c013e5dd0f60d0c98a64eaa0ba9e8e --- runsc/test/image/image_test.go | 20 +++++++++-------- runsc/test/integration/exec_test.go | 2 +- runsc/test/integration/integration_test.go | 6 ++--- runsc/test/testutil/docker.go | 35 +++++++++++++++++++++++++----- 4 files changed, 44 insertions(+), 19 deletions(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 5f90ca9d2..1696de6f1 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -40,7 +40,7 @@ import ( func TestHelloWorld(t *testing.T) { d := testutil.MakeDocker("hello-test") - if _, err := d.Run("hello-world"); err != nil { + if err := d.Run("hello-world"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -94,7 +94,8 @@ func TestHttpd(t *testing.T) { } // Start the container. 
- if _, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/local/apache2/htdocs:ro"), "httpd"); err != nil { + mountArg := testutil.MountArg(dir, "/usr/local/apache2/htdocs", testutil.ReadOnly) + if err := d.Run("-p", "80", mountArg, "httpd"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -127,7 +128,8 @@ func TestNginx(t *testing.T) { } // Start the container. - if _, err := d.Run("-p", "80", "-v", testutil.MountArg(dir, "/usr/share/nginx/html:ro"), "nginx"); err != nil { + mountArg := testutil.MountArg(dir, "/usr/share/nginx/html", testutil.ReadOnly) + if err := d.Run("-p", "80", mountArg, "nginx"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -155,7 +157,7 @@ func TestMysql(t *testing.T) { d := testutil.MakeDocker("mysql-test") // Start the container. - if _, err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { + if err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -174,12 +176,12 @@ func TestMysql(t *testing.T) { // Tell mysql client to connect to the server and execute the file in verbose // mode to verify the output. args := []string{ - "--link", testutil.LinkArg(&d, "mysql"), - "-v", testutil.MountArg(dir, "/sql"), + testutil.LinkArg(&d, "mysql"), + testutil.MountArg(dir, "/sql", testutil.ReadWrite), "mysql", "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql", } - if _, err := client.Run(args...); err != nil { + if err := client.Run(args...); err != nil { t.Fatalf("docker run failed: %v", err) } defer client.CleanUp() @@ -198,7 +200,7 @@ func TestPythonHello(t *testing.T) { t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("python-hello-test") - if _, err := d.Run("-p", "8080", "google/python-hello"); err != nil { + if err := d.Run("-p", "8080", "google/python-hello"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -230,7 +232,7 @@ func TestTomcat(t *testing.T) { t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("tomcat-test") - if _, err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { + if err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 6053ecd1c..910c36597 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -40,7 +40,7 @@ func TestExecCapabilities(t *testing.T) { d := testutil.MakeDocker("exec-test") // Start the container. 
- if _, err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil { + if err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 59df5bd7c..457b5fbf5 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -98,8 +98,8 @@ func TestPauseResume(t *testing.T) { t.Fatal("docker pull failed:", err) } d := testutil.MakeDocker("pause-resume-test") - if out, err := d.Run("-p", "8080", "google/python-hello"); err != nil { - t.Fatalf("docker run failed: %v\nout: %s", err, out) + if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -157,7 +157,7 @@ func TestConnectToSelf(t *testing.T) { // Creates server that replies "server" and exists. Sleeps at the end because // 'docker exec' gets killed if the init process exists before it can finish. - if _, err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil { + if err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil { t.Fatal("docker run failed:", err) } defer d.CleanUp() diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index d0446df4e..7f5909987 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -65,14 +65,35 @@ func EnsureSupportedDockerVersion() { } } +// MountMode describes if the mount should be ro or rw. +type MountMode int + +const ( + // ReadOnly is what the name says. + ReadOnly MountMode = iota + // ReadWrite is what the name says. + ReadWrite +) + +// String returns the mount mode argument for this MountMode. +func (m MountMode) String() string { + switch m { + case ReadOnly: + return "ro" + case ReadWrite: + return "rw" + } + panic(fmt.Sprintf("invalid mode: %d", m)) +} + // MountArg formats the volume argument to mount in the container. -func MountArg(source, target string) string { - return fmt.Sprintf("%s:%s", source, target) +func MountArg(source, target string, mode MountMode) string { + return fmt.Sprintf("-v=%s:%s:%v", source, target, mode) } // LinkArg formats the link argument. func LinkArg(source *Docker, target string) string { - return fmt.Sprintf("%s:%s", source.Name, target) + return fmt.Sprintf("--link=%s:%s", source.Name, target) } // PrepareFiles creates temp directory to copy files there. The sandbox doesn't @@ -155,11 +176,13 @@ func (d *Docker) Stop() error { return nil } -// Run calls 'docker run' with the arguments provided. -func (d *Docker) Run(args ...string) (string, error) { +// Run calls 'docker run' with the arguments provided. The container starts +// running in the backgroud and the call returns immediately. +func (d *Docker) Run(args ...string) error { a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"} a = append(a, args...) - return do(a...) + _, err := do(a...) + return err } // Logs calls 'docker logs'. 
-- cgit v1.2.3 From 50c283b9f56bb7200938d9e207355f05f79f0d17 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Sep 2018 18:14:59 -0700 Subject: Add test for 'signall --all' with stopped container PiperOrigin-RevId: 215025517 Change-Id: I04b9d8022b3d9dfe279e466ddb91310b9860b9af --- runsc/container/multi_container_test.go | 120 ++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 44 deletions(-) (limited to 'runsc') diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 3d7385a82..e5f7daf60 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -527,54 +527,86 @@ func TestMultiContainerProcesses(t *testing.T) { // TestMultiContainerKillAll checks that all process that belong to a container // are killed when SIGKILL is sent to *all* processes in that container. func TestMultiContainerKillAll(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app") - if err != nil { - t.Fatal("error finding test_app:", err) - } + for _, tc := range []struct { + killContainer bool + }{ + {killContainer: true}, + {killContainer: false}, + } { + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } - // First container will remain intact while the second container is killed. - specs, ids := createSpecs( - []string{app, "task-tree", "--depth=2", "--width=2"}, - []string{app, "task-tree", "--depth=4", "--width=2"}) - conf := testutil.TestConfig() - containers, cleanup, err := startContainers(conf, specs, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + // First container will remain intact while the second container is killed. + specs, ids := createSpecs( + []string{app, "task-tree", "--depth=2", "--width=2"}, + []string{app, "task-tree", "--depth=4", "--width=2"}) + conf := testutil.TestConfig() + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - // Wait until all processes are created. - rootProcCount := int(math.Pow(2, 3) - 1) - if err := waitForProcessCount(containers[0], rootProcCount); err != nil { - t.Fatal(err) - } - procCount := int(math.Pow(2, 5) - 1) - if err := waitForProcessCount(containers[1], procCount); err != nil { - t.Fatal(err) - } + // Wait until all processes are created. + rootProcCount := int(math.Pow(2, 3) - 1) + if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatal(err) + } + procCount := int(math.Pow(2, 5) - 1) + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatal(err) + } - // Exec more processes to ensure signal works for exec'd processes too. - args := &control.ExecArgs{ - Filename: app, - Argv: []string{app, "task-tree", "--depth=2", "--width=2"}, - } - if _, err := containers[1].Execute(args); err != nil { - t.Fatalf("error exec'ing: %v", err) - } - procCount += 3 - if err := waitForProcessCount(containers[1], procCount); err != nil { - t.Fatal(err) - } + // Exec more processes to ensure signal works for exec'd processes too. 
+ args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "task-tree", "--depth=2", "--width=2"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + procCount += 3 + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatal(err) + } + + if tc.killContainer { + // First kill the init process to make the container be stopped with + // processes still running inside. + containers[1].Signal(syscall.SIGKILL, false) + op := func() error { + c, err := Load(conf.RootDir, ids[1]) + if err != nil { + return err + } + if c.Status != Stopped { + return fmt.Errorf("container is not stopped") + } + return nil + } + if err := testutil.Poll(op, 5*time.Second); err != nil { + t.Fatalf("container did not stop %q: %v", containers[1].ID, err) + } + } - // Kill'Em All - containers[1].Signal(syscall.SIGKILL, true) + c, err := Load(conf.RootDir, ids[1]) + if err != nil { + t.Fatalf("failed to load child container %q: %v", c.ID, err) + } + // Kill'Em All + if err := c.Signal(syscall.SIGKILL, true); err != nil { + t.Fatalf("failed to send SIGKILL to container %q: %v", c.ID, err) + } - // Check that all processes are gone. - if err := waitForProcessCount(containers[1], 0); err != nil { - t.Fatal(err) - } - // Check that root container was not affected. - if err := waitForProcessCount(containers[0], rootProcCount); err != nil { - t.Fatal(err) + // Check that all processes are gone. + if err := waitForProcessCount(containers[1], 0); err != nil { + t.Fatal(err) + } + // Check that root container was not affected. + if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatal(err) + } } } -- cgit v1.2.3 From 9c7eb13079e65100b69b41536a51d2433b05637b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Sun, 30 Sep 2018 22:21:34 -0700 Subject: Removed duplicate/stale TODOs PiperOrigin-RevId: 215162121 Change-Id: I35f06ac3235cf31c9e8a158dcf6261a7ded6c4c4 --- runsc/cmd/kill.go | 2 -- runsc/container/container.go | 1 - 2 files changed, 3 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index af709bc71..dcb2988e3 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -80,8 +80,6 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("%v", err) } - // TODO: Distinguish between already-exited containers and - // genuine errors. if err := c.Signal(sig, k.all); err != nil { Fatalf("%v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index e09ed9347..b39d6bf12 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -483,7 +483,6 @@ func (c *Container) Signal(sig syscall.Signal, all bool) error { if !c.isSandboxRunning() { return fmt.Errorf("container is not running") } - // TODO: Query the container for its state, then save it. return c.Sandbox.Signal(c.ID, sig, all) } -- cgit v1.2.3 From 43e6aff50e23763d12c71b054f100fd91da46736 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Sun, 30 Sep 2018 23:22:13 -0700 Subject: Don't fail if Root is readonly and is not a mount point This makes runsc more friendly to run without docker or K8s. 
PiperOrigin-RevId: 215165586 Change-Id: Id45a9fc24a3c09b1645f60dbaf70e64711a7a4cd --- runsc/container/container_test.go | 21 ++++++++++++++ runsc/container/fs.go | 59 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index c71bcc46d..aebfb2878 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1556,6 +1556,27 @@ func TestGoferExits(t *testing.T) { } } +func TestRootNotMount(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/true") + + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatalf("failure to create tmp dir: %v", err) + } + spec.Root.Path = root + spec.Root.Readonly = true + spec.Mounts = []specs.Mount{ + {Destination: "/bin", Source: "/bin", Type: "bind", Options: []string{"ro"}}, + {Destination: "/lib", Source: "/lib", Type: "bind", Options: []string{"ro"}}, + {Destination: "/lib64", Source: "/lib64", Type: "bind", Options: []string{"ro"}}, + } + + conf := testutil.TestConfig() + if err := run(spec, conf); err != nil { + t.Fatalf("error running sandbox: %v", err) + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/fs.go b/runsc/container/fs.go index a3c5772ba..59edd9488 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -15,6 +15,7 @@ package container import ( + "bufio" "fmt" "os" "path/filepath" @@ -100,18 +101,72 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { } } - // Remount root as readonly after setup is done, if requested. + // If root is read only, check if it needs to be remounted as readonly. if spec.Root.Readonly { + isMountPoint, readonly, err := mountInfo(spec.Root.Path) + if err != nil { + return err + } + if readonly { + return nil + } + if !isMountPoint { + // Readonly root is not a mount point nor read-only. Can't do much other + // than just logging a warning. The gofer will prevent files to be open + // in write mode. + log.Warningf("Mount where root is located is not read-only and cannot be changed: %q", spec.Root.Path) + return nil + } + + // If root is a mount point but not read-only, we can change mount options + // to make it read-only for extra safety. log.Infof("Remounting root as readonly: %q", spec.Root.Path) flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) src := spec.Root.Path if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { - return fmt.Errorf("failed to remount root as readonly with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) + return fmt.Errorf("failed to remount root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) } } return nil } +// mountInfo returns whether the path is a mount point and whether the mount +// that path belongs to is read-only. +func mountInfo(path string) (bool, bool, error) { + // Mounts are listed by their real paths. 
+ realPath, err := filepath.EvalSymlinks(path) + if err != nil { + return false, false, err + } + f, err := os.Open("/proc/mounts") + if err != nil { + return false, false, err + } + scanner := bufio.NewScanner(f) + + var mountPoint string + var readonly bool + for scanner.Scan() { + line := scanner.Text() + parts := strings.Split(line, " ") + if len(parts) < 4 { + return false, false, fmt.Errorf("invalid /proc/mounts line format %q", line) + } + mp := parts[1] + opts := strings.Split(parts[3], ",") + + // Find the closest submount to the path. + if strings.Contains(realPath, mp) && len(mp) > len(mountPoint) { + mountPoint = mp + readonly = specutils.ContainsStr(opts, "ro") + } + } + if err := scanner.Err(); err != nil { + return false, false, err + } + return mountPoint == realPath, readonly, nil +} + // destroyFS unmounts mounts done by runsc under `spec.Root.Path`. This // recovers the container rootfs into the original state. func destroyFS(spec *specs.Spec) error { -- cgit v1.2.3 From a2ad8fef136b31989bfcd2f40003f6113aebaf1d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 1 Oct 2018 10:29:45 -0700 Subject: Make multi-container the default mode for runsc And remove multicontainer option. PiperOrigin-RevId: 215236981 Change-Id: I9fd1d963d987e421e63d5817f91a25c819ced6cb --- runsc/boot/config.go | 5 ----- runsc/boot/fs.go | 16 ++++++++-------- runsc/boot/loader.go | 2 -- runsc/container/container.go | 4 ++-- runsc/main.go | 2 -- runsc/sandbox/network.go | 29 ----------------------------- runsc/test/testutil/testutil.go | 13 ++++++------- 7 files changed, 16 insertions(+), 55 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 01da535af..cd977c8a5 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -193,10 +193,6 @@ type Config struct { // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool - // MultiContainer enables multiple containers support inside one sandbox. - // TODO: Remove this when multiple container is fully supported. - MultiContainer bool - // SpecFile is the file containing the OCI spec. SpecFile string @@ -224,7 +220,6 @@ func (c *Config) ToFlags() []string { "--debug-log-dir=" + c.DebugLogDir, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), - "--multi-container=" + strconv.FormatBool(c.MultiContainer), "--network=" + c.Network.String(), "--log-packets=" + strconv.FormatBool(c.LogPackets), "--platform=" + c.Platform.String(), diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 9e8fea7e1..42e011beb 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -85,14 +85,14 @@ func (f *fdDispenser) empty() bool { // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int) (*fs.MountNamespace, error) { mounts := compileMounts(spec) - if conf.MultiContainer { - // Create a tmpfs mount where we create and mount a root filesystem for - // each child container. - mounts = append(mounts, specs.Mount{ - Type: tmpfs, - Destination: ChildContainersDir, - }) - } + + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. 
+ mounts = append(mounts, specs.Mount{ + Type: tmpfs, + Destination: ChildContainersDir, + }) + fds := &fdDispenser{fds: goferFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 1e2a12280..9fa9b51a0 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -579,8 +579,6 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return tgid, nil } -// TODO: Per-container namespaces must be supported for -pid. - // waitContainer waits for the root process of a container to exit. func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for diff --git a/runsc/container/container.go b/runsc/container/container.go index b39d6bf12..be833c03d 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -267,7 +267,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // started in an existing sandbox, we must do so. The metadata will // indicate the ID of the sandbox, which is the same as the ID of the // init container in the sandbox. - if specutils.ShouldCreateSandbox(spec) || !conf.MultiContainer { + if specutils.ShouldCreateSandbox(spec) { log.Debugf("Creating new sandbox for container %q", id) ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) if err != nil { @@ -345,7 +345,7 @@ func (c *Container) Start(conf *boot.Config) error { } } - if specutils.ShouldCreateSandbox(c.Spec) || !conf.MultiContainer { + if specutils.ShouldCreateSandbox(c.Spec) { if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil { return err } diff --git a/runsc/main.go b/runsc/main.go index 624db5f40..2a18c4b9e 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -60,7 +60,6 @@ var ( network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") ) @@ -140,7 +139,6 @@ func main() { Platform: platformType, Strace: *strace, StraceLogSize: *straceLogSize, - MultiContainer: *multiContainer, WatchdogAction: wa, PanicSignal: *panicSignal, } diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 60cbbfcdb..86a52c6ae 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -57,35 +57,6 @@ const ( func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { log.Infof("Setting up network") - if !conf.MultiContainer { - // HACK! - // - // When kubernetes starts a pod, it first creates a sandbox with an - // application that just pauses forever. 
Later, when a container is - // added to the pod, kubernetes will create another sandbox with a - // config that corresponds to the containerized application, and add it - // to the same namespaces as the pause sandbox. - // - // Running a second sandbox currently breaks because the two sandboxes - // have the same network namespace and configuration, and try to create - // a tap device on the same host device which fails. - // - // Runsc will eventually need to detect that this container is meant to - // be run in the same sandbox as the pausing application, and somehow - // make that happen. - // - // For now the following HACK disables networking for the "pause" - // sandbox, allowing the second sandbox to start up successfully. - // - // TODO: Remove this once multiple containers per sandbox - // is properly supported. - if spec.Annotations[crioContainerTypeAnnotation] == "sandbox" || - spec.Annotations[containerdContainerTypeAnnotation] == "sandbox" { - log.Warningf("HACK: Disabling network") - conf.Network = boot.NetworkNone - } - } - switch conf.Network { case boot.NetworkNone: log.Infof("Network is disabled, create loopback interface only") diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 706db74a7..07d66e469 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -104,13 +104,12 @@ func FindFile(path string) (string, error) { // TestConfig return the default configuration to use in tests. func TestConfig() *boot.Config { return &boot.Config{ - Debug: true, - LogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - Strace: true, - MultiContainer: true, - FileAccess: boot.FileAccessExclusive, + Debug: true, + LogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + Strace: true, + FileAccess: boot.FileAccessExclusive, TestOnlyAllowRunAsCurrentUserWithoutChroot: true, } } -- cgit v1.2.3 From d185552e79e19bd25cdcf64c123712086c48ae58 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 1 Oct 2018 13:55:58 -0700 Subject: Fix ruby image tests. PiperOrigin-RevId: 215274663 Change-Id: I051721f459084db3aa608432831170cd47ae7df0 --- runsc/test/image/image_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 1696de6f1..5048ffdd7 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -273,7 +273,7 @@ func TestRuby(t *testing.T) { t.Fatalf("os.Chmod(%q, 0333) failed: %v", dir, err) } - if _, err := d.Run("-p", "8080", "-v", testutil.MountArg(dir, "/src:ro"), "ruby", "/src/ruby.sh"); err != nil { + if err := d.Run("-p", "8080", testutil.MountArg(dir, "/src", testutil.ReadOnly), "ruby", "/src/ruby.sh"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() -- cgit v1.2.3 From f1c01ed88666ea81d8f5cef7931153a9951a6e64 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 1 Oct 2018 22:05:41 -0700 Subject: runsc: Support job control signals in "exec -it". Terminal support in runsc relies on host tty file descriptors that are imported into the sandbox. Application tty ioctls are sent directly to the host fd. However, those host tty ioctls are associated in the host kernel with a host process (in this case runsc), and the host kernel intercepts job control characters like ^C and send signals to the host process. Thus, typing ^C into a "runsc exec" shell will send a SIGINT to the runsc process. 
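In other words, the job-control keystrokes land on the wrong process: the host kernel signals runsc instead of the workload. A minimal host-side sketch of the remedy described next is shown here, catching every catchable signal and handing it off for forwarding; forwardToSandbox is a hypothetical stand-in for the urpc call this change introduces, and the small channel buffer is only for illustration.

package main

import (
	"log"
	"os"
	"os/signal"
	"syscall"
)

// forwardToSandbox is a placeholder for the RPC that delivers the signal to
// the container's foreground process group inside the sandbox.
func forwardToSandbox(sig syscall.Signal) {
	log.Printf("forwarding %v to the sandbox", sig)
}

func main() {
	ch := make(chan os.Signal, 1)
	// With no explicit list, Notify relays every incoming signal that can
	// be caught (SIGKILL and SIGSTOP never reach user-space handlers).
	signal.Notify(ch)
	for sig := range ch {
		if s, ok := sig.(syscall.Signal); ok && s != syscall.SIGCHLD {
			forwardToSandbox(s)
		}
	}
}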
This change makes "runsc exec" handle all signals, and forward them into the sandbox via the "ContainerSignal" urpc method. Since the "runsc exec" is associated with a particular container process in the sandbox, the signal must be associated with the same container process. One big difficulty is that the signal should not necessarily be sent to the sandbox process started by "exec", but instead must be sent to the foreground process group for the tty. For example, we may exec "bash", and from bash call "sleep 100". A ^C at this point should SIGINT sleep, not bash. To handle this, tty files inside the sandbox must keep track of their foreground process group, which is set/get via ioctls. When an incoming ContainerSignal urpc comes in, we look up the foreground process group via the tty file. Unfortunately, this means we have to expose and cache the tty file in the Loader. Note that "runsc exec" now handles signals properly, but "runs run" does not. That will come in a later CL, as this one is complex enough already. Example: root@:/usr/local/apache2# sleep 100 ^C root@:/usr/local/apache2# sleep 100 ^Z [1]+ Stopped sleep 100 root@:/usr/local/apache2# fg sleep 100 ^C root@:/usr/local/apache2# PiperOrigin-RevId: 215334554 Change-Id: I53cdce39653027908510a5ba8d08c49f9cf24f39 --- pkg/sentry/control/proc.go | 78 ++++++++++---- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/file.go | 144 +++++-------------------- pkg/sentry/fs/host/inode.go | 18 ++-- pkg/sentry/fs/host/tty.go | 185 ++++++++++++++++++++++++++++++++ pkg/sentry/kernel/sessions.go | 5 + runsc/boot/controller.go | 46 ++++++-- runsc/boot/fds.go | 8 +- runsc/boot/loader.go | 108 ++++++++++++++----- runsc/cmd/exec.go | 13 ++- runsc/container/BUILD | 2 + runsc/container/container.go | 51 ++++++--- runsc/container/container_test.go | 117 ++++++++++++++++++++ runsc/container/multi_container_test.go | 7 +- runsc/sandbox/sandbox.go | 49 ++++++--- runsc/test/integration/exec_test.go | 55 ++++++++++ runsc/test/testutil/BUILD | 1 + runsc/test/testutil/docker.go | 21 ++++ runsc/test/testutil/testutil.go | 36 +++++++ 19 files changed, 732 insertions(+), 213 deletions(-) create mode 100644 pkg/sentry/fs/host/tty.go (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index faf1168bb..0ba730c1e 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -78,7 +78,7 @@ type ExecArgs struct { Capabilities *auth.TaskCapabilities // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host - // pty fd. + // pty FD. StdioIsPty bool // FilePayload determines the files to give to the new process. @@ -90,7 +90,7 @@ type ExecArgs struct { // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { - newTG, _, err := proc.execAsync(args) + newTG, _, _, err := proc.execAsync(args) if err != nil { return err } @@ -103,18 +103,27 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. -func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { return proc.execAsync(args) } // execAsync runs a new task, but doesn't wait for it to finish. It returns the -// newly created thread group and its PID. 
-func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { +// newly created thread group and its PID. If the stdio FDs are TTYs, then a +// TTYFileOperations that wraps the TTY is also returned. +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { // Import file descriptors. l := limits.NewLimitSet() fdm := proc.Kernel.NewFDMap() defer fdm.DecRef() + // No matter what happens, we should close all files in the FilePayload + // before returning. Any files that are imported will be duped. + defer func() { + for _, f := range args.FilePayload.Files { + f.Close() + } + }() + creds := auth.NewUserCredentials( args.KUID, args.KGID, @@ -150,31 +159,62 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI paths := fs.GetPath(initArgs.Envv) f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, 0, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } mounter := fs.FileOwnerFromContext(ctx) - for appFD, f := range args.FilePayload.Files { - enableIoctl := args.StdioIsPty && appFD <= 2 - // Import the given file FD. This dups the FD as well. - file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) - if err != nil { - return nil, 0, err + var ttyFile *fs.File + for appFD, hostFile := range args.FilePayload.Files { + var appFile *fs.File + + if args.StdioIsPty && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, true /* isTTY */) + if err != nil { + return nil, 0, nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will + // use it for the other stdio FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three + // stdio FDs must point to the same fs.File in + // order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + // Import the file as a regular host file. + var err error + appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, false /* isTTY */) + if err != nil { + return nil, 0, nil, err + } + defer appFile.DecRef() } - defer file.DecRef() - - // We're done with this file. - f.Close() - if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { - return nil, 0, err + // Add the file to the FD map. + if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + return nil, 0, nil, err } } - return proc.Kernel.CreateProcess(initArgs) + tg, tid, err := proc.Kernel.CreateProcess(initArgs) + if err != nil { + return nil, 0, nil, err + } + + if ttyFile == nil { + return tg, tid, nil, nil + } + return tg, tid, ttyFile.FileOperations.(*host.TTYFileOperations), nil } // PsArgs is the set of arguments to ps. 
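The proc.go change above imports all three stdio descriptors as a single TTY-backed file precisely so that they agree on one foreground process group. For readers less familiar with that piece of job control, here is a standalone Go sketch, independent of the sentry internals, showing the same idea on a host terminal: a child whose process group is made the terminal's foreground group receives ^C while the parent does not. It assumes the standard library's documented Setpgid/Foreground/Ctty fields of syscall.SysProcAttr and is a sketch rather than production code.

package main

import (
	"log"
	"os"
	"os/exec"
	"syscall"
)

func main() {
	// Use the controlling terminal of this process.
	tty, err := os.OpenFile("/dev/tty", os.O_RDWR, 0)
	if err != nil {
		log.Fatalf("open /dev/tty: %v", err)
	}
	defer tty.Close()

	cmd := exec.Command("sleep", "100")
	cmd.Stdin, cmd.Stdout, cmd.Stderr = tty, tty, tty
	cmd.SysProcAttr = &syscall.SysProcAttr{
		// Put the child in its own process group and make that group the
		// terminal's foreground group; ^C then delivers SIGINT to the
		// child's group rather than to this process.
		Setpgid:    true,
		Foreground: true,
		Ctty:       int(tty.Fd()),
	}
	if err := cmd.Run(); err != nil {
		log.Printf("child exited: %v", err)
	}
}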
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index d1a6eaf6e..c34f1c26b 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -17,6 +17,7 @@ go_library( "socket.go", "socket_state.go", "socket_unsafe.go", + "tty.go", "util.go", "util_unsafe.go", ], diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 6f469b5cc..22a5d9f12 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -18,15 +18,12 @@ import ( "fmt" "syscall" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/secio" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -39,6 +36,7 @@ import ( // // +stateify savable type fileOperations struct { + fsutil.NoIoctl `state:"nosave"` fsutil.NoopRelease `state:"nosave"` // iops are the Inode operations for this file. @@ -49,49 +47,49 @@ type fileOperations struct { // dirCursor is the directory cursor. dirCursor string - - // allowIoctl determines whether ioctls should be passed through to the - // host. - allowIoctl bool } // fileOperations implements fs.FileOperations. var _ fs.FileOperations = (*fileOperations)(nil) // NewFile creates a new File backed by the provided host file descriptor. If -// NewFile succeeds, ownership of the fd is transferred to the returned File. +// NewFile succeeds, ownership of the FD is transferred to the returned File. // // The returned File cannot be saved, since there is no guarantee that the same -// fd will exist or represent the same file at time of restore. If such a +// FD will exist or represent the same file at time of restore. If such a // guarantee does exist, use ImportFile instead. func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { return newFileFromDonatedFD(ctx, fd, mounter, false, false) } // ImportFile creates a new File backed by the provided host file descriptor. -// Unlike NewFile, the file descriptor used by the File is duped from fd to -// ensure that later changes to fd are not reflected by the fs.File. +// Unlike NewFile, the file descriptor used by the File is duped from FD to +// ensure that later changes to FD are not reflected by the fs.File. // -// If the returned file is saved, it will be restored by re-importing the fd +// If the returned file is saved, it will be restored by re-importing the FD // originally passed to ImportFile. It is the restorer's responsibility to -// ensure that the fd represents the same file. -func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, allowIoctl bool) (*fs.File, error) { - return newFileFromDonatedFD(ctx, fd, mounter, true, allowIoctl) +// ensure that the FD represents the same file. +func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY) } -// newFileFromDonatedFD returns an fs.File from a donated fd. If the fd is +// newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is // saveable, then saveable is true. 
-func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, allowIoctl bool) (*fs.File, error) { +func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) { var s syscall.Stat_t if err := syscall.Fstat(donated, &s); err != nil { return nil, err } + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } switch s.Mode & syscall.S_IFMT { case syscall.S_IFSOCK: - flags, err := fileFlagsFromDonatedFD(donated) - if err != nil { - return nil, err + if isTTY { + return nil, fmt.Errorf("cannot import host socket as TTY") } + s, err := newSocket(ctx, donated, saveable) if err != nil { return nil, err @@ -101,10 +99,6 @@ func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner }) return s, nil default: - flags, err := fileFlagsFromDonatedFD(donated) - if err != nil { - return nil, err - } msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */) inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */) if err != nil { @@ -116,14 +110,18 @@ func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner dirent := fs.NewDirent(inode, name) defer dirent.DecRef() - return newFile(ctx, dirent, flags, iops, allowIoctl), nil + if isTTY { + return newTTYFile(ctx, dirent, flags, iops), nil + } + + return newFile(ctx, dirent, flags, iops), nil } } func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0) if errno != 0 { - log.Warningf("Failed to get file flags for donated fd %d (errno=%d)", donated, errno) + log.Warningf("Failed to get file flags for donated FD %d (errno=%d)", donated, errno) return fs.FileFlags{}, syscall.EIO } accmode := flags & syscall.O_ACCMODE @@ -138,17 +136,14 @@ func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { } // newFile returns a new fs.File. -func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations, allowIoctl bool) *fs.File { +func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { if !iops.ReturnsWouldBlock() { // Allow reading/writing at an arbitrary offset for files // that support it. flags.Pread = true flags.Pwrite = true } - return fs.NewFile(ctx, dirent, flags, &fileOperations{ - iops: iops, - allowIoctl: allowIoctl, - }) + return fs.NewFile(ctx, dirent, flags, &fileOperations{iops: iops}) } // EventRegister implements waiter.Waitable.EventRegister. @@ -269,7 +264,7 @@ func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, func (f *fileOperations) Flush(context.Context, *fs.File) error { // This is a no-op because flushing the resource backing this // file would mean closing it. We can't do that because other - // open files may depend on the backing host fd. + // open files may depend on the backing host FD. return nil } @@ -285,88 +280,3 @@ func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) } - -// Ioctl implements fs.FileOperations.Iocotl. 
-func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - if !f.allowIoctl { - return 0, syserror.ENOTTY - } - // Ignore arg[0]. This is the real FD: - fd := f.iops.fileState.FD() - ioctl := args[1].Uint64() - switch ioctl { - case linux.TCGETS: - termios, err := ioctlGetTermios(fd) - if err != nil { - return 0, err - } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - - case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: - var termios linux.Termios - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, err - } - err := ioctlSetTermios(fd, ioctl, &termios) - return 0, err - - case linux.TIOCGPGRP: - // Args: pid_t *argp - // When successful, equivalent to *argp = tcgetpgrp(fd). - // Get the process group ID of the foreground process group on - // this terminal. - - t := kernel.TaskFromContext(ctx) - if t == nil { - panic(fmt.Sprintf("cannot get thread group from context %v", ctx)) - } - tid := t.ThreadID() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tid, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - - case linux.TIOCSPGRP: - // Args: const pid_t *argp - // Equivalent to tcsetpgrp(fd, *argp). - // Set the foreground process group ID of this terminal. - - // Not much we can do with this one at the moment, so we just - // lie and pretend everything is great. Bash and Sh seem fine - // with this. - log.Warningf("Ignoring application ioctl(TIOCSPGRP) call") - return 0, nil - - case linux.TIOCGWINSZ: - // Args: struct winsize *argp - // Get window size. - winsize, err := ioctlGetWinsize(fd) - if err != nil { - return 0, err - } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - - case linux.TIOCSWINSZ: - // Args: const struct winsize *argp - // Set window size. - var winsize linux.Winsize - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, err - } - err := ioctlSetWinsize(fd, &winsize) - return 0, err - - default: - return 0, syserror.ENOTTY - } -} diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index e7254fa7d..c2e8ba62f 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -73,7 +73,7 @@ type inodeFileState struct { // Common file system state. mops *superOperations `state:"wait"` - // descriptor is the backing host fd. + // descriptor is the backing host FD. descriptor *descriptor `state:"wait"` // Event queue for blocking operations. @@ -167,7 +167,7 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err // inodeOperations implements fs.InodeOperations. var _ fs.InodeOperations = (*inodeOperations)(nil) -// newInode returns a new fs.Inode backed by the host fd. +// newInode returns a new fs.Inode backed by the host FD. func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) { // Retrieve metadata. var s syscall.Stat_t @@ -212,8 +212,8 @@ func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { return i.cachingInodeOps } -// ReturnsWouldBlock returns true if this host fd can return EWOULDBLOCK -// for operations that would block. 
+// ReturnsWouldBlock returns true if this host FD can return EWOULDBLOCK for +// operations that would block. func (i *inodeOperations) ReturnsWouldBlock() bool { return i.fileState.descriptor.wouldBlock } @@ -226,7 +226,7 @@ func (i *inodeOperations) Release(context.Context) { // Lookup implements fs.InodeOperations.Lookup. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - // Get a new fd relative to i at name. + // Get a new FD relative to i at name. fd, err := open(i, name) if err != nil { if err == syserror.ENOENT { @@ -321,7 +321,7 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.Bound // GetFile implements fs.InodeOperations.GetFile. func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return newFile(ctx, d, flags, i, false), nil + return newFile(ctx, d, flags, i), nil } // canMap returns true if this fs.Inode can be memory mapped. @@ -362,7 +362,7 @@ func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) err func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool { // Can we use host kernel metadata caches? if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { - // Then just change the timestamps on the fd, the host + // Then just change the timestamps on the FD, the host // will synchronize the metadata update with any host // inode and page cache. return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil @@ -375,7 +375,7 @@ func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { // Can we use host kernel metadata caches? if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { - // Then just change the timestamps on the fd, the host + // Then just change the timestamps on the FD, the host // will synchronize the metadata update with any host // inode and page cache. return setTimestamps(i.fileState.FD(), ts) @@ -388,7 +388,7 @@ func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { // Is the file not memory-mappable? if !canMap(inode) { - // Then just change the file size on the fd, the host + // Then just change the file size on the FD, the host // will synchronize the metadata update with any host // inode and page cache. return syscall.Ftruncate(i.fileState.FD(), size) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go new file mode 100644 index 000000000..ad1323610 --- /dev/null +++ b/pkg/sentry/fs/host/tty.go @@ -0,0 +1,185 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TTYFileOperations implements fs.FileOperations for a host file descriptor +// that wraps a TTY FD. +// +// +stateify savable +type TTYFileOperations struct { + fileOperations + + // mu protects the fields below. + mu sync.Mutex + + // FGProcessGroup is the foreground process group this TTY. Will be + // nil if not set or if this file has been released. + fgProcessGroup *kernel.ProcessGroup +} + +// newTTYFile returns a new fs.File that wraps a TTY FD. +func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { + return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{ + fileOperations: fileOperations{iops: iops}, + }) +} + +// ForegroundProcessGroup returns the foreground process for the TTY. This will +// be nil if the foreground process has not been set or if the file has been +// released. +func (t *TTYFileOperations) ForegroundProcessGroup() *kernel.ProcessGroup { + t.mu.Lock() + defer t.mu.Unlock() + return t.fgProcessGroup +} + +// Release implements fs.FileOperations.Release. +func (t *TTYFileOperations) Release() { + t.mu.Lock() + t.fgProcessGroup = nil + t.mu.Unlock() + + t.fileOperations.Release() +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Ignore arg[0]. This is the real FD: + fd := t.fileOperations.iops.fileState.FD() + ioctl := args[1].Uint64() + switch ioctl { + case linux.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + return 0, err + + case linux.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on + // this terminal. + + t.mu.Lock() + defer t.mu.Unlock() + + if t.fgProcessGroup == nil { + // No process group has been set yet. Let's just lie + // and tell it the process group from the current task. + // The app is probably going to set it to something + // else very soon anyways. + t.fgProcessGroup = kernel.TaskFromContext(ctx).ThreadGroup().ProcessGroup() + } + + // Map the ProcessGroup into a ProcessGroupID in the task's PID + // namespace. + pgID := kernel.TaskFromContext(ctx).ThreadGroup().PIDNamespace().IDOfProcessGroup(t.fgProcessGroup) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. 
+ + var pgID kernel.ProcessGroupID + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + // pgID must be non-negative. + if pgID < 0 { + return 0, syserror.EINVAL + } + + // Process group with pgID must exist in this PID namespace. + task := kernel.TaskFromContext(ctx) + pidns := task.PIDNamespace() + pg := pidns.ProcessGroupWithID(pgID) + if pg == nil { + return 0, syserror.ESRCH + } + + // Process group must be in same session as calling task's + // process group. + curSession := task.ThreadGroup().ProcessGroup().Session() + curSessionID := pidns.IDOfSession(curSession) + if pidns.IDOfSession(pg.Session()) != curSessionID { + return 0, syserror.EPERM + } + + t.mu.Lock() + t.fgProcessGroup = pg + t.mu.Unlock() + return 0, nil + + case linux.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := ioctlGetWinsize(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. + var winsize linux.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetWinsize(fd, &winsize) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index cf4e18805..b44d218d9 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -219,6 +219,11 @@ func (pg *ProcessGroup) handleOrphan() { return } +// Session returns the process group's session without taking a reference. +func (pg *ProcessGroup) Session() *Session { + return pg.session +} + // CreateSession creates a new Session, with the ThreadGroup as the leader. // // EPERM may be returned if either the given ThreadGroup is already a Session diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 98356e8b7..eaeb9e2d8 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -65,6 +65,10 @@ const ( // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" + // ContainerSignalProcess is used to send a signal to a particular + // process in a container. + ContainerSignalProcess = "containerManager.SignalProcess" + // ContainerStart is the URPC endpoint for running a non-root container // within a sandbox. ContainerStart = "containerManager.Start" @@ -92,7 +96,7 @@ const ( SandboxStacks = "debug.Stacks" ) -// ControlSocketAddr generates an abstract unix socket name for the given id. +// ControlSocketAddr generates an abstract unix socket name for the given ID. func ControlSocketAddr(id string) string { return fmt.Sprintf("\x00runsc-sandbox.%s", id) } @@ -248,7 +252,7 @@ func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { } // ExecuteAsync starts running a command on a created or running sandbox. It -// returns the pid of the new process. +// returns the PID of the new process. func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { log.Debugf("containerManager.ExecuteAsync: %+v", args) tgid, err := cm.l.executeAsync(args) @@ -373,8 +377,12 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // restore the state of multiple containers, nor exec processes. 
cm.l.sandboxID = o.SandboxID cm.l.mu.Lock() - key := execID{cid: o.SandboxID} - cm.l.processes = map[execID]*kernel.ThreadGroup{key: cm.l.k.GlobalInit()} + eid := execID{cid: o.SandboxID} + cm.l.processes = map[execID]*execProcess{ + eid: &execProcess{ + tg: cm.l.k.GlobalInit(), + }, + } cm.l.mu.Unlock() // Tell the root container to start and wait for the result. @@ -419,7 +427,7 @@ func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error // SignalArgs are arguments to the Signal method. type SignalArgs struct { - // CID is the container id. + // CID is the container ID. CID string // Signo is the signal to send to the process. @@ -430,9 +438,31 @@ type SignalArgs struct { All bool } -// Signal sends a signal to the init process of the container. -// TODO: Send signal to exec process. +// Signal sends a signal to the root process of the container. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal %q %d, all: %t", args.CID, args.Signo, args.All) - return cm.l.signal(args.CID, args.Signo, args.All) + return cm.l.signalContainer(args.CID, args.Signo, args.All) +} + +// SignalProcessArgs are arguments to the Signal method. +type SignalProcessArgs struct { + // CID is the container ID. + CID string + + // PID is the process ID in the given container that will be signaled. + PID int32 + + // Signo is the signal to send to the process. + Signo int32 + + // SendToForegroundProcess indicates that the signal should be sent to + // the foreground process group in the session that PID belongs to. + // This is only valid if the process is attached to a host TTY. + SendToForegroundProcess bool +} + +// SignalProcess sends a signal to a particular process in the container. +func (cm *containerManager) SignalProcess(args *SignalProcessArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal: %+v", args) + return cm.l.signalProcess(args.CID, args.PID, args.Signo, args.SendToForegroundProcess) } diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 92d641b68..a5a6ba8af 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -25,8 +25,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/limits" ) -// createFDMap creates an fd map that contains stdin, stdout, and stderr. If -// console is true, then ioctl calls will be passed through to the host fd. +// createFDMap creates an FD map that contains stdin, stdout, and stderr. If +// console is true, then ioctl calls will be passed through to the host FD. // Upon success, createFDMap dups then closes stdioFDs. func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { if len(stdioFDs) != 3 { @@ -36,7 +36,7 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons fdm := k.NewFDMap() defer fdm.DecRef() - // Maps sandbox fd to host fd. + // Maps sandbox FD to host FD. 
fdMap := map[int]int{ 0: stdioFDs[0], 1: stdioFDs[1], @@ -45,7 +45,7 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons mounter := fs.FileOwnerFromContext(ctx) for sfd, hfd := range fdMap { - file, err := host.ImportFile(ctx, hfd, mounter, console /* allow ioctls */) + file, err := host.ImportFile(ctx, hfd, mounter, console /* isTTY */) if err != nil { return nil, fmt.Errorf("failed to import fd %d: %v", hfd, err) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9fa9b51a0..766a2e968 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -112,7 +113,7 @@ type Loader struct { // have the corresponding pid set. // // processes is guardded by mu. - processes map[execID]*kernel.ThreadGroup + processes map[execID]*execProcess } // execID uniquely identifies a sentry process. @@ -121,6 +122,14 @@ type execID struct { pid kernel.ThreadID } +// execProcess contains the thread group and host TTY of a sentry process. +type execProcess struct { + tg *kernel.ThreadGroup + + // tty will be nil if the process is not attached to a terminal. + tty *host.TTYFileOperations +} + func init() { // Initialize the random number generator. rand.Seed(gtime.Now().UnixNano()) @@ -276,7 +285,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, sandboxID: id, - processes: make(map[execID]*kernel.ThreadGroup), + processes: make(map[execID]*execProcess), } ctrl.manager.l = l return l, nil @@ -330,7 +339,7 @@ func createPlatform(conf *Config, deviceFD int) (platform.Platform, error) { case PlatformKVM: log.Infof("Platform: kvm") if deviceFD < 0 { - return nil, fmt.Errorf("kvm device fd must be provided") + return nil, fmt.Errorf("kvm device FD must be provided") } return kvm.New(os.NewFile(uintptr(deviceFD), "kvm device")) default: @@ -413,8 +422,8 @@ func (l *Loader) run() error { } l.mu.Lock() - key := execID{cid: l.sandboxID} - l.processes[key] = l.k.GlobalInit() + eid := execID{cid: l.sandboxID} + l.processes[eid] = &execProcess{tg: l.k.GlobalInit()} l.mu.Unlock() // Start signal forwarding only after an init process is created. @@ -510,8 +519,8 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config l.mu.Lock() defer l.mu.Unlock() - key := execID{cid: cid} - l.processes[key] = tg + eid := execID{cid: cid} + l.processes[eid] = &execProcess{tg: tg} return nil } @@ -520,7 +529,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // filesystem. func (l *Loader) destroyContainer(cid string) error { // First kill and wait for all processes in the container. - if err := l.signal(cid, int32(linux.SIGKILL), true /*all*/); err != nil { + if err := l.signalContainer(cid, int32(linux.SIGKILL), true /*all*/); err != nil { return fmt.Errorf("failed to SIGKILL all container processes: %v", err) } @@ -549,12 +558,12 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // process with the same Root. 
l.mu.Lock() rootKey := execID{cid: args.ContainerID} - tg, ok := l.processes[rootKey] + ep, ok := l.processes[rootKey] l.mu.Unlock() if !ok { return 0, fmt.Errorf("cannot exec in container %q: no such container", args.ContainerID) } - tg.Leader().WithMuLocked(func(t *kernel.Task) { + ep.tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() }) if args.Root != nil { @@ -563,7 +572,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Start the process. proc := control.Proc{Kernel: l.k} - tg, tgid, err := control.ExecAsync(&proc, args) + tg, tgid, ttyFile, err := control.ExecAsync(&proc, args) if err != nil { return 0, fmt.Errorf("error executing: %+v: %v", args, err) } @@ -573,7 +582,10 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { l.mu.Lock() defer l.mu.Unlock() eid := execID{cid: args.ContainerID, pid: tgid} - l.processes[eid] = tg + l.processes[eid] = &execProcess{ + tg: tg, + tty: ttyFile, + } log.Debugf("updated processes: %v", l.processes) return tgid, nil @@ -584,8 +596,8 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. l.mu.Lock() - key := execID{cid: cid} - tg, ok := l.processes[key] + eid := execID{cid: cid} + ep, ok := l.processes[eid] l.mu.Unlock() if !ok { return fmt.Errorf("can't find process for container %q in %v", cid, l.processes) @@ -593,7 +605,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // If the thread either has already exited or exits during waiting, // consider the container exited. - ws := l.wait(tg) + ws := l.wait(ep.tg) *waitStatus = ws return nil } @@ -610,10 +622,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai // entry in l.processes. l.mu.Lock() eid := execID{cid: cid, pid: tgid} - tg, ok := l.processes[eid] + ep, ok := l.processes[eid] l.mu.Unlock() if ok { - ws := l.wait(tg) + ws := l.wait(ep.tg) *waitStatus = ws if clearStatus { // Remove tg from the cache. @@ -626,8 +638,8 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai } // This process wasn't created by runsc exec or start, so just find it - // by pid and hope it hasn't exited yet. - tg = l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) + // by PID and hope it hasn't exited yet. + tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) if tg == nil { return fmt.Errorf("no thread group with ID %d", tgid) } @@ -682,18 +694,66 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } -func (l *Loader) signal(cid string, signo int32, all bool) error { +// signalProcess sends a signal to the process with the given PID. If +// sendToFGProcess is true, then the signal will be sent to the foreground +// process group in the same session that PID belongs to. 
+func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess bool) error { + si := arch.SignalInfo{Signo: signo} + + if pid <= 0 { + return fmt.Errorf("failed to signal container %q PID %d: PID must be positive", cid, pid) + } + + eid := execID{ + cid: cid, + pid: kernel.ThreadID(pid), + } l.mu.Lock() - key := execID{cid: cid} - tg, ok := l.processes[key] + ep, ok := l.processes[eid] l.mu.Unlock() + if !ok { - return fmt.Errorf("failed to signal container %q: no such container", cid) + return fmt.Errorf("failed to signal container %q PID %d: no such PID", cid, pid) + } + + if !sendToFGProcess { + // Send signal directly to exec process. + return ep.tg.SendSignal(&si) } + // Lookup foreground process group from the TTY for the given process, + // and send the signal to it. + if ep.tty == nil { + return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, pid) + } + pg := ep.tty.ForegroundProcessGroup() + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) + return ep.tg.SendSignal(&si) + } + + // Send the signal. + return pg.Originator().SendSignal(&si) +} + +// signalContainer sends a signal to the root container process, or to all +// processes in the container if all is true. +func (l *Loader) signalContainer(cid string, signo int32, all bool) error { si := arch.SignalInfo{Signo: signo} + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + ep, ok := l.processes[eid] + if !ok { + return fmt.Errorf("failed to signal container %q: no such container", cid) + } + if !all { - return tg.Leader().SendSignal(&si) + return ep.tg.SendSignal(&si) } // Pause the kernel to prevent new processes from being created while diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 28229dbcf..336edf3f6 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -158,6 +158,13 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("error getting processes for container: %v", err) } + if e.StdioIsPty { + // Forward signals sent to this process to the foreground + // process in the sandbox. + stopForwarding := c.ForwardSignals(pid, true /* fgProcess */) + defer stopForwarding() + } + // Write the sandbox-internal pid if required. if ex.internalPidFile != "" { pidStr := []byte(strconv.Itoa(int(pid))) @@ -216,9 +223,9 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat cmd.Stderr = os.Stderr // If the console control socket file is provided, then create a new - // pty master/slave pair and set the tty on the sandbox process. + // pty master/slave pair and set the TTY on the sandbox process. if ex.consoleSocket != "" { - // Create a new tty pair and send the master on the provided + // Create a new TTY pair and send the master on the provided // socket. tty, err := console.NewWithSocket(ex.consoleSocket) if err != nil { @@ -226,7 +233,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat } defer tty.Close() - // Set stdio to the new tty slave. + // Set stdio to the new TTY slave. 
cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty diff --git a/runsc/container/BUILD b/runsc/container/BUILD index e68fb1e8e..bf8b9a2ab 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -50,10 +50,12 @@ go_test( "//pkg/sentry/control", "//pkg/sentry/kernel/auth", "//pkg/unet", + "//pkg/urpc", "//runsc/boot", "//runsc/specutils", "//runsc/test/testutil", "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/container/container.go b/runsc/container/container.go index be833c03d..4b0037b4e 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -22,6 +22,7 @@ import ( "io/ioutil" "os" "os/exec" + "os/signal" "path/filepath" "regexp" "strconv" @@ -107,14 +108,13 @@ type Container struct { Owner string `json:"owner"` // ConsoleSocket is the path to a unix domain socket that will receive - // the console FD. It is only used during create, so we don't need to - // store it in the metadata. - ConsoleSocket string `json:"-"` + // the console FD. + ConsoleSocket string `json:"consoleSocket"` // Status is the current container Status. Status Status `json:"status"` - // GoferPid is the pid of the gofer running along side the sandbox. May + // GoferPid is the PID of the gofer running along side the sandbox. May // be 0 if the gofer has been killed. GoferPid int `json:"goferPid"` @@ -313,12 +313,12 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } - // Write the pid file. Containerd considers the create complete after + // Write the PID file. Containerd considers the create complete after // this file is created, so it must be the last thing we do. if pidFile != "" { if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.Pid())), 0644); err != nil { c.Destroy() - return nil, fmt.Errorf("error writing pid file: %v", err) + return nil, fmt.Errorf("error writing PID file: %v", err) } } @@ -406,7 +406,7 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke return c.Wait() } -// Execute runs the specified command in the container. It returns the pid of +// Execute runs the specified command in the container. It returns the PID of // the newly created process. func (c *Container) Execute(args *control.ExecArgs) (int32, error) { log.Debugf("Execute in container %q, args: %+v", c.ID, args) @@ -429,7 +429,7 @@ func (c *Container) Event() (*boot.Event, error) { // Pid returns the Pid of the sandbox the container is running in, or -1 if the // container is not running. func (c *Container) Pid() int { - if err := c.requireStatus("pid", Created, Running, Paused); err != nil { + if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { return -1 } return c.Sandbox.Pid @@ -449,7 +449,7 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. 
func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { - log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) + log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("container is not running") } @@ -459,7 +459,7 @@ func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { - log.Debugf("Wait on pid %d in container %q", pid, c.ID) + log.Debugf("Wait on PID %d in container %q", pid, c.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("container is not running") } @@ -483,7 +483,30 @@ func (c *Container) Signal(sig syscall.Signal, all bool) error { if !c.isSandboxRunning() { return fmt.Errorf("container is not running") } - return c.Sandbox.Signal(c.ID, sig, all) + return c.Sandbox.SignalContainer(c.ID, sig, all) +} + +// ForwardSignals forwards all signals received by the current process to the +// container process inside the sandbox. It returns a function that will stop +// forwarding signals. +func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() { + log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh) + go func() { + for s := range sigCh { + log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", s, c.ID, pid, fgProcess) + if err := c.Sandbox.SignalProcess(c.ID, pid, s.(syscall.Signal), fgProcess); err != nil { + log.Warningf("error forwarding signal %d to container %q: %v", s, c.ID, err) + } + } + log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) + }() + + return func() { + signal.Stop(sigCh) + close(sigCh) + } } // Checkpoint sends the checkpoint call to the container. 
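The ForwardSignals helper added in the hunk above relays every signal the `runsc exec` process receives into the sandbox via Sandbox.SignalProcess, and returns a function that stops the relay. Stripped of the container plumbing, the pattern looks roughly like the sketch below; `forwardSignals` and `deliver` are illustrative names for this example only, not part of runsc.

```
// signalforward.go: a trimmed-down sketch of the forwarding pattern used by
// ForwardSignals. deliver stands in for the real Sandbox.SignalProcess call.
package main

import (
	"log"
	"os"
	"os/signal"
	"syscall"
	"time"
)

// forwardSignals relays every signal received by this process to deliver
// until the returned stop function is called.
func forwardSignals(deliver func(sig syscall.Signal) error) func() {
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh) // no arguments: subscribe to all catchable signals
	go func() {
		for s := range sigCh {
			if err := deliver(s.(syscall.Signal)); err != nil {
				log.Printf("error forwarding signal %v: %v", s, err)
			}
		}
	}()
	return func() {
		signal.Stop(sigCh)
		close(sigCh)
	}
}

func main() {
	stop := forwardSignals(func(sig syscall.Signal) error {
		log.Printf("would forward %v to the container process", sig)
		return nil
	})
	defer stop()
	// Send this process SIGTERM or SIGINT to see the forwarding in action.
	time.Sleep(30 * time.Second)
}
```

As in the exec.go hunk earlier in this commit, the caller defers the returned stop function so forwarding ends once the exec command finishes.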
@@ -683,9 +706,9 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if err != nil { return nil, err } - sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) - goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD") defer goferEnd.Close() goferEnds = append(goferEnds, goferEnd) @@ -710,7 +733,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if err := specutils.StartInNS(cmd, nss); err != nil { return nil, err } - log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid return sandEnds, nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index aebfb2878..84b59ffd8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -31,6 +31,7 @@ import ( "time" "github.com/cenkalti/backoff" + "github.com/kr/pty" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -38,6 +39,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1577,6 +1579,121 @@ func TestRootNotMount(t *testing.T) { } } +func TestJobControlSignalExec(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + conf := testutil.TestConfig() + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Create a pty master/slave. The slave will be passed to the exec + // process. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + t.Fatalf("error opening pty: %v", err) + } + defer ptyMaster.Close() + defer ptySlave.Close() + + // Exec bash and attach a terminal. + args := &control.ExecArgs{ + Filename: "/bin/bash", + // Don't let bash execute from profile or rc files, otherwise + // our PID counts get messed up. + Argv: []string{"/bin/bash", "--noprofile", "--norc"}, + // Pass the pty slave as FD 0, 1, and 2. + FilePayload: urpc.FilePayload{ + Files: []*os.File{ptySlave, ptySlave, ptySlave}, + }, + StdioIsPty: true, + } + + pid, err := c.Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if pid != 2 { + t.Fatalf("exec got pid %d, wanted %d", pid, 2) + } + + // Make sure all the processes are running. + expectedPL := []*control.Process{ + // Root container process. + {PID: 1, Cmd: "sleep"}, + // Bash from exec process. + {PID: 2, Cmd: "bash"}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Execute sleep. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for it to start. Sleep's PPID is bash's PID. 
+ expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Send a SIGTERM to the foreground process for the exec PID. Note that + // although we pass in the PID of "bash", it should actually terminate + // "sleep", since that is the foreground process. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + expectedPL = expectedPL[:1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Make sure the process indicates it was killed by a SIGKILL. + ws, err := c.WaitPID(pid, true) + if err != nil { + t.Errorf("waiting on container failed: %v", err) + } + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index e5f7daf60..ab200b75c 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -477,11 +477,12 @@ func TestMultiContainerDestroy(t *testing.T) { } func TestMultiContainerProcesses(t *testing.T) { - // Note: use 'while true' to keep 'sh' process around. Otherwise, shell will - // just execve into 'sleep' and both containers will look the same. + // Note: use curly braces to keep 'sh' process around. Otherwise, shell + // will just execve into 'sleep' and both containers will look the + // same. specs, ids := createSpecs( []string{"sleep", "100"}, - []string{"sh", "-c", "while true; do sleep 100; done"}) + []string{"sh", "-c", "{ sleep 100; }"}) conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 4111b1a60..e4853af69 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -80,7 +80,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // StartRoot starts running the root container process inside the sandbox. 
func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { - log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid) + log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid) conn, err := s.sandboxConnect() if err != nil { return err @@ -107,7 +107,7 @@ func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, goferFi defer f.Close() } - log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) + log.Debugf("Start non-root container sandbox %q, PID: %d", s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { return fmt.Errorf("couldn't connect to sandbox: %v", err) @@ -147,7 +147,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } - // If the platform needs a device fd we must pass it in. + // If the platform needs a device FD we must pass it in. if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { return err } else if deviceFile != nil { @@ -192,7 +192,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { return pl, nil } -// Execute runs the specified command in the container. It returns the pid of +// Execute runs the specified command in the container. It returns the PID of // the newly created process. func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) @@ -239,7 +239,7 @@ func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { } func (s *Sandbox) connError(err error) error { - return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + return fmt.Errorf("error connecting to control server at PID %d: %v", s.Pid, err) } // createSandboxProcess starts the sandbox as a subprocess by running the "boot" @@ -322,7 +322,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } - // If the platform needs a device fd we must pass it in. + // If the platform needs a device FD we must pass it in. if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { return err } else if deviceFile != nil { @@ -338,7 +338,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Stderr = os.Stderr // If the console control socket file is provided, then create a new - // pty master/slave pair and set the tty on the sandbox process. + // pty master/slave pair and set the TTY on the sandbox process. if consoleEnabled { // console.NewWithSocket will send the master on the socket, // and return the slave. @@ -461,7 +461,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Add container as the last argument. cmd.Args = append(cmd.Args, s.ID) - // Log the fds we are donating to the sandbox process. + // Log the FDs we are donating to the sandbox process. for i, f := range cmd.ExtraFiles { log.Debugf("Donating FD %d: %q", i+3, f.Name()) } @@ -472,7 +472,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund return err } s.Pid = cmd.Process.Pid - log.Infof("Sandbox started, pid: %d", s.Pid) + log.Infof("Sandbox started, PID: %d", s.Pid) return nil } @@ -572,9 +572,10 @@ func (s *Sandbox) destroy() error { return nil } -// Signal sends the signal to a container in the sandbox. If all is true and -// signal is SIGKILL, then waits for all processes to exit before returning. 
-func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { +// SignalContainer sends the signal to a container in the sandbox. If all is +// true and signal is SIGKILL, then waits for all processes to exit before +// returning. +func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) error { log.Debugf("Signal sandbox %q", s.ID) conn, err := s.sandboxConnect() if err != nil { @@ -593,6 +594,30 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { return nil } +// SignalProcess sends the signal to a particular process in the container. If +// fgProcess is true, then the signal is sent to the foreground process group +// in the same session that PID belongs to. This is only valid if the process +// is attached to a host TTY. +func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgProcess bool) error { + log.Debugf("Signal sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + args := boot.SignalProcessArgs{ + CID: cid, + Signo: int32(sig), + PID: pid, + SendToForegroundProcess: fgProcess, + } + if err := conn.Call(boot.ContainerSignalProcess, &args, nil); err != nil { + return fmt.Errorf("err signaling container %q PID %d: %v", cid, pid, err) + } + return nil +} + // Checkpoint sends the checkpoint call for a container in the sandbox. // The statefile will be written to f. func (s *Sandbox) Checkpoint(cid string, f *os.File) error { diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 910c36597..ddd088223 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -27,6 +27,7 @@ package integration import ( + "syscall" "testing" "time" @@ -60,3 +61,57 @@ func TestExecCapabilities(t *testing.T) { t.Errorf("wrong capabilities, got: %q, want: %q", got, want) } } + +func TestExecJobControl(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("exec-test") + + // Start the container. + if _, err := d.Run("alpine", "sleep", "1000"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + // Exec 'sh' with an attached pty. + cmd, ptmx, err := d.ExecWithTerminal("sh") + if err != nil { + t.Fatalf("docker exec failed: %v", err) + } + defer ptmx.Close() + + // Call "sleep 100" in the shell. + if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Give shell a few seconds to start executing the sleep. + time.Sleep(2 * time.Second) + + // Send a ^C to the pty, which should kill sleep, but not the shell. + // \x03 is ASCII "end of text", which is the same as ^C. + if _, err := ptmx.Write([]byte{'\x03'}); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // The shell should still be alive at this point. Sleep should have + // exited with code 2+128=130. We'll exit with 10 plus that number, so + // that we can be sure that the shell did not get signalled. + if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Exec process should exit with code 10+130=140. 
+ ps, err := cmd.Process.Wait() + if err != nil { + t.Fatalf("error waiting for exec process: %v", err) + } + ws := ps.Sys().(syscall.WaitStatus) + if !ws.Exited() { + t.Errorf("ws.Exited got false, want true") + } + if got, want := ws.ExitStatus(), 140; got != want { + t.Errorf("ws.ExitedStatus got %d, want %d", got, want) + } +} diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index ca91e07ff..da2535bfa 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -17,6 +17,7 @@ go_library( "//runsc/boot", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", ], diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 7f5909987..55ca353b8 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -26,6 +26,8 @@ import ( "strconv" "strings" "time" + + "github.com/kr/pty" ) func init() { @@ -131,6 +133,17 @@ func do(args ...string) (string, error) { return string(out), nil } +// doWithPty executes docker command with stdio attached to a pty. +func doWithPty(args ...string) (*exec.Cmd, *os.File, error) { + fmt.Printf("Running with pty: docker %s\n", args) + cmd := exec.Command("docker", args...) + ptmx, err := pty.Start(cmd) + if err != nil { + return nil, nil, fmt.Errorf("error executing docker %s with a pty: %v", args, err) + } + return cmd, ptmx, nil +} + // Pull pulls a docker image. This is used in tests to isolate the // time to pull the image off the network from the time to actually // start the container, to avoid timeouts over slow networks. @@ -197,6 +210,14 @@ func (d *Docker) Exec(args ...string) (string, error) { return do(a...) } +// ExecWithTerminal calls 'docker exec -it' with the arguments provided and +// attaches a pty to stdio. +func (d *Docker) ExecWithTerminal(args ...string) (*exec.Cmd, *os.File, error) { + a := []string{"exec", "-it", d.Name} + a = append(a, args...) + return doWithPty(a...) +} + // Pause calls 'docker pause'. func (d *Docker) Pause() error { if _, err := do("pause", d.Name); err != nil { diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 07d66e469..cdc7f78c3 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -16,6 +16,7 @@ package testutil import ( + "bufio" "context" "encoding/json" "fmt" @@ -27,6 +28,8 @@ import ( "os/signal" "path/filepath" "runtime" + "strings" + "sync/atomic" "syscall" "time" @@ -315,3 +318,36 @@ func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { } } } + +// WaitUntilRead reads from the given reader until the wanted string is found +// or until timeout. +func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error { + sc := bufio.NewScanner(r) + if split != nil { + sc.Split(split) + } + // done must be accessed atomically. A value greater than 0 indicates + // that the read loop can exit. 
+ var done uint32 + doneCh := make(chan struct{}) + go func() { + for sc.Scan() { + t := sc.Text() + if strings.Contains(t, want) { + atomic.StoreUint32(&done, 1) + close(doneCh) + break + } + if atomic.LoadUint32(&done) > 0 { + break + } + } + }() + select { + case <-time.After(timeout): + atomic.StoreUint32(&done, 1) + return fmt.Errorf("timeout waiting to read %q", want) + case <-doneCh: + return nil + } +} -- cgit v1.2.3 From cf3dc2f8a55f2499f1385f6c5f47522b41cf25b3 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 2 Oct 2018 11:35:41 -0700 Subject: Fix compilation bug. Docker.Run only returns a single argument. PiperOrigin-RevId: 215427309 Change-Id: I1eebbc628853ca57f79d25e18d4f04dfa5a2a003 --- runsc/test/integration/exec_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index ddd088223..014254aab 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -69,7 +69,7 @@ func TestExecJobControl(t *testing.T) { d := testutil.MakeDocker("exec-test") // Start the container. - if _, err := d.Run("alpine", "sleep", "1000"); err != nil { + if err := d.Run("alpine", "sleep", "1000"); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() -- cgit v1.2.3 From 0a13042d48cb26439aa02a19fe959f93f2f22ac1 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 2 Oct 2018 17:27:09 -0700 Subject: Bump some timeouts in the image tests. PiperOrigin-RevId: 215489101 Change-Id: Iaf96aa8edb1101b70548030c62995841215237d9 --- pkg/eventchannel/BUILD | 2 +- runsc/test/image/image_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index ac2ea869d..9d531ce12 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -15,7 +15,7 @@ go_library( "//pkg/log", "//pkg/unet", "@com_github_golang_protobuf//proto:go_default_library", - "@com_github_golang_protobuf//ptypes:go_default_library", + "@com_github_golang_protobuf//ptypes:go_default_library_gen", ], ) diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 5048ffdd7..71d992115 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -285,7 +285,7 @@ func TestRuby(t *testing.T) { } // Wait until it's up and running, 'gem install' can take some time. 
- if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 1*time.Minute); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } -- cgit v1.2.3 From 77e43adeab4abcd301d76222e0304f551fbcf0cc Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 3 Oct 2018 09:31:53 -0700 Subject: Add TIOCINQ to allowed seccomp when hostinet is used PiperOrigin-RevId: 215574070 Change-Id: Ib36e804adebaf756adb9cbc2752be9789691530b --- runsc/boot/filter/config.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'runsc') diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 352c64253..06c04e3bb 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -335,6 +335,10 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.AllowAny{}, seccomp.AllowValue(syscall.TIOCOUTQ), }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.TIOCINQ), + }, }, syscall.SYS_LISTEN: {}, syscall.SYS_READV: {}, -- cgit v1.2.3 From e215b9970ad82915a8d544b81b3c49d7d84a0eb0 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 3 Oct 2018 10:31:01 -0700 Subject: runsc: Pass root container's stdio via FD. We were previously using the sandbox process's stdio as the root container's stdio. This makes it difficult/impossible to distinguish output application output from sandbox output, such as panics, which are always written to stderr. Also close the console socket when we are done with it. PiperOrigin-RevId: 215585180 Change-Id: I980b8c69bd61a8b8e0a496fd7bc90a06446764e0 --- runsc/boot/loader.go | 6 +++--- runsc/boot/loader_test.go | 3 ++- runsc/cmd/boot.go | 7 ++++++- runsc/console/console.go | 2 ++ runsc/sandbox/sandbox.go | 50 +++++++++++++++++++++++++++++++++++------------ 5 files changed, 50 insertions(+), 18 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 766a2e968..726482bb2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -140,7 +140,7 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. 
-func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { +func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, stdioFDs []int, console bool) (*Loader, error) { if err := usage.Init(); err != nil { return nil, fmt.Errorf("Error setting up memory usage: %v", err) } @@ -279,9 +279,9 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, conf: conf, console: console, watchdog: watchdog, - stdioFDs: []int{syscall.Stdin, syscall.Stdout, syscall.Stderr}, - goferFDs: goferFDs, spec: spec, + goferFDs: goferFDs, + stdioFDs: stdioFDs, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, sandboxID: id, diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 0b363253d..ea8411a8b 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -101,7 +101,8 @@ func createLoader() (*Loader, func(), error) { return nil, nil, err } - l, err := New("foo", spec, conf, fd, -1 /* device fd */, []int{sandEnd}, false) + stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())} + l, err := New("foo", spec, conf, fd, -1 /* device fd */, []int{sandEnd}, stdio, false) if err != nil { cleanup() return nil, nil, err diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 82e534479..c6f78f63f 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -48,6 +48,10 @@ type Boot struct { // ioFDs is the list of FDs used to connect to FS gofers. ioFDs intFlags + // stdioFDs are the fds for stdin, stdout, and stderr. They must be + // provided in that order. + stdioFDs intFlags + // console is set to true if the sandbox should allow terminal ioctl(2) // syscalls. console bool @@ -79,6 +83,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") + f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") } @@ -138,7 +143,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. 
- l, err := boot.New(f.Arg(0), spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.console) + l, err := boot.New(f.Arg(0), spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.stdioFDs.GetArray(), b.console) if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/console/console.go b/runsc/console/console.go index 2f2745b2b..3df184742 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -40,6 +40,7 @@ func NewWithSocket(socketPath string) (*os.File, error) { ptySlave.Close() return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) } + defer conn.Close() uc, ok := conn.(*net.UnixConn) if !ok { ptySlave.Close() @@ -50,6 +51,7 @@ func NewWithSocket(socketPath string) (*os.File, error) { ptySlave.Close() return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) } + defer socket.Close() // Send the master FD over the connection. msg := unix.UnixRights(int(ptyMaster.Fd())) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e4853af69..1ed1ab61d 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -285,9 +285,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // All flags after this must be for the boot command cmd.Args = append(cmd.Args, "boot", "--bundle="+bundleDir) - consoleEnabled := consoleSocket != "" - cmd.Args = append(cmd.Args, "--console="+strconv.FormatBool(consoleEnabled)) - // Create a socket for the control server and donate it to the sandbox. addr := boot.ControlSocketAddr(s.ID) sockFD, err := server.CreateSocket(addr) @@ -332,27 +329,54 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } - // Sandbox stdio defaults to current process stdio. - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - // If the console control socket file is provided, then create a new // pty master/slave pair and set the TTY on the sandbox process. - if consoleEnabled { - // console.NewWithSocket will send the master on the socket, - // and return the slave. + if consoleSocket != "" { + cmd.Args = append(cmd.Args, "--console=true") + + // console.NewWithSocket will send the master on the given + // socket, and return the slave. tty, err := console.NewWithSocket(consoleSocket) if err != nil { return fmt.Errorf("error setting up console with socket %q: %v", consoleSocket, err) } defer tty.Close() + fd := int(tty.Fd()) + + // Set the TTY as a controlling TTY on the sandbox process. + cmd.SysProcAttr.Setctty = true + cmd.SysProcAttr.Ctty = fd + // Ideally we would set the sandbox stdin to this process' + // stdin, but for some reason Docker does not like that (it + // never calls `runsc start`). Instead we set stdio to the + // console TTY, but note that this is distinct from the + // container stdio, which is passed via the flags below. cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty - cmd.SysProcAttr.Setctty = true - cmd.SysProcAttr.Ctty = int(tty.Fd()) + + // Pass the tty as all stdio fds to sandbox. + for i := 0; i < 3; i++ { + cmd.ExtraFiles = append(cmd.ExtraFiles, tty) + cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + } else { + // Connect the sandbox process to this process's stdios. Note + // that this is distinct from the container's stdio, which is + // passed by the flags below. 
+ cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + // If not using a console, pass our current stdio as the + // container stdio via flags. + for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) + nextFD++ + } } // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT -- cgit v1.2.3 From 55d28fb124dc4caa9047c5ca9150c82f073e70b2 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 3 Oct 2018 10:32:35 -0700 Subject: runsc: Dup debug log file to stderr, so sentry panics don't get lost. Docker and containerd do not expose runsc's stderr, so tracking down sentry panics can be painful. If we have a debug log file, we should send panics (and all stderr data) to the log file. PiperOrigin-RevId: 215585559 Change-Id: I3844259ed0cd26e26422bcdb40dded302740b8b6 --- runsc/main.go | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'runsc') diff --git a/runsc/main.go b/runsc/main.go index 2a18c4b9e..7e704a127 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -177,6 +177,10 @@ func main() { if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") + // Dup f to stderr so we capture stack traces on panic. + if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { + cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) + } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } else if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { @@ -187,6 +191,10 @@ func main() { if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) } + // Dup f to stderr so we capture stack traces on panic. + if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { + cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) + } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } -- cgit v1.2.3 From 37e57a903cf0d40e53a97eb3d47036024d7536c3 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 3 Oct 2018 10:46:42 -0700 Subject: Fix arithmetic error in multi_container_test. We add an additional (2^3)-1=7 processes, but the code was only waiting for 3. I switched back to Math.Pow format to make the arithmetic easier to inspect. PiperOrigin-RevId: 215588140 Change-Id: Iccad4d6f977c1bfc5c4b08d3493afe553fe25733 --- runsc/container/multi_container_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index ab200b75c..d23d36c37 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -568,7 +568,8 @@ func TestMultiContainerKillAll(t *testing.T) { if _, err := containers[1].Execute(args); err != nil { t.Fatalf("error exec'ing: %v", err) } - procCount += 3 + // Wait for these new processes to start. + procCount += int(math.Pow(2, 3) - 1) if err := waitForProcessCount(containers[1], procCount); err != nil { t.Fatal(err) } -- cgit v1.2.3 From 7a6412cb0b4e52bf175a77b5f43d4a74547e9798 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 3 Oct 2018 11:48:30 -0700 Subject: runsc: Allow state transition from Creating to Stopped. This can happen if an error is encountered during Create() which causes the container to be destroyed and set to state Stopped. Without this transition, errors during Create get hidden by the later panic. 
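The one-line change in the diff that follows adds Creating to the set of states allowed to move to Stopped. Purely for illustration, the same guard can be expressed as a table of permitted predecessor states; the Status type, its constants, and canTransition below are redeclared locally so the sketch compiles on its own, and are not the actual runsc container package.

```
// A hypothetical table-driven version of the guard in changeStatus; runsc
// itself uses an if-chain that panics on an invalid transition.
package main

import "fmt"

type Status int

const (
	Creating Status = iota
	Created
	Running
	Paused
	Stopped
)

// validPrev lists which states may move into a given state. Only the
// Stopped row from the diff is shown here.
var validPrev = map[Status][]Status{
	Stopped: {Creating, Created, Running, Stopped},
}

func canTransition(from, to Status) bool {
	for _, s := range validPrev[to] {
		if s == from {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(canTransition(Creating, Stopped)) // true after this commit's fix
	fmt.Println(canTransition(Paused, Stopped))   // false, matching the check in the diff
}
```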
PiperOrigin-RevId: 215599193 Change-Id: Icd3f42e12c685cbf042f46b3929bccdf30ad55b0 --- runsc/container/container.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 4b0037b4e..827528349 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -771,7 +771,7 @@ func (c *Container) changeStatus(s Status) { } case Stopped: - if c.Status != Created && c.Status != Running && c.Status != Stopped { + if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped { panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) } -- cgit v1.2.3 From 9f2ba6ac3e7b56d428ef4369a7326dd85f30642d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 3 Oct 2018 14:53:10 -0700 Subject: Automated rollback of changelist 215585559 PiperOrigin-RevId: 215633475 Change-Id: I7bc471e3b9a2c725fb5e15b3bbcba2ee1ea574b1 --- runsc/main.go | 8 -------- 1 file changed, 8 deletions(-) (limited to 'runsc') diff --git a/runsc/main.go b/runsc/main.go index 7e704a127..2a18c4b9e 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -177,10 +177,6 @@ func main() { if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") - // Dup f to stderr so we capture stack traces on panic. - if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { - cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) - } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } else if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { @@ -191,10 +187,6 @@ func main() { if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) } - // Dup f to stderr so we capture stack traces on panic. - if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { - cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) - } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } -- cgit v1.2.3 From 3f46f2e5017106d1569f759b8d19aee6e9827c58 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 3 Oct 2018 20:43:18 -0700 Subject: Fix sandbox chroot Sandbox was setting chroot, but was not chaging the working dir. Added test to ensure this doesn't happen in the future. PiperOrigin-RevId: 215676270 Change-Id: I14352d3de64a4dcb90e50948119dc8328c9c15e1 --- kokoro/run_tests.sh | 23 ++++++- runsc/sandbox/chroot.go | 2 +- runsc/sandbox/sandbox.go | 1 + runsc/test/README.md | 26 ++++++++ runsc/test/image/image_test.go | 14 ++-- runsc/test/integration/integration_test.go | 17 ++--- runsc/test/root/BUILD | 27 ++++++++ runsc/test/root/chroot_test.go | 103 +++++++++++++++++++++++++++++ runsc/test/root/root.go | 16 +++++ runsc/test/testutil/docker.go | 13 ++++ runsc/test/testutil/testutil.go | 4 +- 11 files changed, 222 insertions(+), 24 deletions(-) create mode 100644 runsc/test/README.md create mode 100644 runsc/test/root/BUILD create mode 100644 runsc/test/root/chroot_test.go create mode 100644 runsc/test/root/root.go (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 665d63390..3f8841cee 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -35,6 +35,11 @@ bazel build //... runtime=runsc_test_$((RANDOM)) sudo -n ./runsc/test/install.sh --runtime ${runtime} +# Best effort to uninstall the runtime +uninstallRuntime() { + sudo -n ./runsc/test/install.sh -u --runtime ${runtime} +} + # Run the tests and upload results. 
# # We turn off "-e" flag because we must move the log files even if the test @@ -43,6 +48,7 @@ set +e bazel test --test_output=errors //... exit_code=${?} +# Execute local tests that require docker. if [[ ${exit_code} -eq 0 ]]; then # These names are used to exclude tests not supported in certain # configuration, e.g. save/restore not supported with hostnet. @@ -59,8 +65,21 @@ if [[ ${exit_code} -eq 0 ]]; then done fi -# Best effort to uninstall -sudo -n ./runsc/test/install.sh -u --runtime ${runtime} +# Execute local tests that require superuser. +if [[ ${exit_code} -eq 0 ]]; then + bazel build //runsc/test/root:root_test + root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__) + if [[ ! -f "${root_test}" ]]; then + uninstallRuntime + echo "root_test executable not found" + exit 1 + fi + sudo -n -E RUNSC_RUNTIME=${runtime} ${root_test} + exit_code=${?} +fi + +uninstallRuntime + set -e # Find and rename all test xml and log files so that Sponge can pick them up. diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go index 30a4bae35..35b19a0b1 100644 --- a/runsc/sandbox/chroot.go +++ b/runsc/sandbox/chroot.go @@ -55,7 +55,7 @@ func setUpChroot() (string, error) { log.Infof("Setting up sandbox chroot in %q", chroot) // Mount /proc. - if err := mountInChroot(chroot, "proc", "/proc", "proc", 0); err != nil { + if err := mountInChroot(chroot, "proc", "/proc", "proc", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC); err != nil { return "", fmt.Errorf("error mounting proc in chroot: %v", err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 1ed1ab61d..847417a15 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -475,6 +475,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } s.Chroot = chroot // Remember path so it can cleaned up. cmd.SysProcAttr.Chroot = chroot + cmd.Dir = "/" cmd.Args[0] = "/runsc" cmd.Path = "/runsc" } else { diff --git a/runsc/test/README.md b/runsc/test/README.md new file mode 100644 index 000000000..5929cbeb6 --- /dev/null +++ b/runsc/test/README.md @@ -0,0 +1,26 @@ +# Tests + +The tests defined under this path are verifying functionality beyond what unit +tests can cover, e.g. integration and end to end tests. Due to their nature, +they may need extra setup in the test machine and extra configuration to run. + +- **integration:** defines integration tests that uses `docker run` to test + functionality. +- **image:** basic end to end test for popular images. +- **root:** tests that require to be run as root. +- **testutil:** utilities library to support the tests. + +The following setup steps are required in order to run these tests: + + + `./runsc/test/install.sh [--runtime ]` + +The tests expect the runtime name to be provided in the `RUNSC_RUNTIME` +environment variable (default: `runsc-test`). To run the tests execute: + + +``` +bazel test --test_env=RUNSC_RUNTIME=runsc-test \ + //runsc/test/image:image_test \ + //runsc/test/integration:integration_test +``` diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 71d992115..341bdc1d5 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -12,17 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package image provides end-to-end image tests for runsc. These tests require -// docker and runsc to be installed on the machine. 
To set it up, run: -// -// ./runsc/test/install.sh [--runtime ] -// -// The tests expect the runtime name to be provided in the RUNSC_RUNTIME -// environment variable (default: runsc-test). -// +// Package image provides end-to-end image tests for runsc. + // Each test calls docker commands to start up a container, and tests that it is // behaving properly, like connecting to a port or looking at the output. The // container is killed and deleted at the end. +// +// Setup instruction in runsc/test/README.md. package image import ( @@ -307,7 +303,7 @@ func TestRuby(t *testing.T) { } } -func MainTest(m *testing.M) { +func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) } diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 457b5fbf5..5f24aeed5 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -12,18 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package image provides end-to-end integration tests for runsc. These tests require -// docker and runsc to be installed on the machine. To set it up, run: -// -// ./runsc/test/install.sh [--runtime ] -// -// The tests expect the runtime name to be provided in the RUNSC_RUNTIME -// environment variable (default: runsc-test). +// Package integration provides end-to-end integration tests for runsc. // // Each test calls docker commands to start up a container, and tests that it is -// behaving properly, with various runsc commands. The container is killed and deleted -// at the end. - +// behaving properly, with various runsc commands. The container is killed and +// deleted at the end. +// +// Setup instruction in runsc/test/README.md. package integration import ( @@ -184,7 +179,7 @@ func TestConnectToSelf(t *testing.T) { } } -func MainTest(m *testing.M) { +func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) } diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD new file mode 100644 index 000000000..dbc0f1d26 --- /dev/null +++ b/runsc/test/root/BUILD @@ -0,0 +1,27 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "root", + srcs = ["root.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/test/root", +) + +go_test( + name = "root_test", + size = "small", + srcs = ["chroot_test.go"], + embed = [":root"], + tags = [ + # Requires docker and runsc to be configured before the test runs. + # Also test only runs as root. + "manual", + "local", + ], + deps = [ + "//runsc/specutils", + "//runsc/test/testutil", + "@com_github_syndtr_gocapability//capability:go_default_library", + ], +) diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go new file mode 100644 index 000000000..5c59e7451 --- /dev/null +++ b/runsc/test/root/chroot_test.go @@ -0,0 +1,103 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package root is used for tests that requires sysadmin privileges run. First, +// follow the setup instruction in runsc/test/README.md. To run these test: +// +// bazel build //runsc/test/root:root_test +// root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__) +// sudo RUNSC_RUNTIME=runsc-test ${root_test} +package root + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "reflect" + "sort" + "strconv" + "strings" + "testing" + + "github.com/syndtr/gocapability/capability" + "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// TestChroot verifies that the sandbox is chroot'd and that mounts are cleaned +// up after the sandbox is destroyed. +func TestChroot(t *testing.T) { + d := testutil.MakeDocker("chroot-test") + if err := d.Run("alpine", "sleep", "10000"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + pid, err := d.SandboxPid() + if err != nil { + t.Fatalf("Docker.SandboxPid(): %v", err) + } + + // Check that sandbox is chroot'ed. + chroot, err := filepath.EvalSymlinks(filepath.Join("/proc", strconv.Itoa(pid), "root")) + if err != nil { + t.Fatalf("error resolving /proc//root symlink: %v", err) + } + if want := "/tmp/runsc-sandbox-chroot-"; !strings.HasPrefix(chroot, want) { + t.Errorf("sandbox is not chroot'd, it should be inside: %q, got: %q", want, chroot) + } + + path, err := filepath.EvalSymlinks(filepath.Join("/proc", strconv.Itoa(pid), "cwd")) + if err != nil { + t.Fatalf("error resolving /proc//cwd symlink: %v", err) + } + if chroot != path { + t.Errorf("sandbox current dir is wrong, want: %q, got: %q", chroot, path) + } + + fi, err := ioutil.ReadDir(chroot) + if err != nil { + t.Fatalf("error listing %q: %v", chroot, err) + } + if want, got := 2, len(fi); want != got { + t.Fatalf("chroot dir got %d entries, want %d", want, got) + } + + // chroot dir is prepared by runsc and should contains only the executable + // and /proc. + files := []string{fi[0].Name(), fi[1].Name()} + sort.Strings(files) + if want := []string{"proc", "runsc"}; !reflect.DeepEqual(files, want) { + t.Errorf("chroot got children %v, want %v", files, want) + } + + d.CleanUp() + + // Check that chroot directory was cleaned up. + if _, err := os.Stat(chroot); err == nil || !os.IsNotExist(err) { + t.Errorf("chroot directory %q was not deleted: %v", chroot, err) + } +} + +func TestMain(m *testing.M) { + testutil.EnsureSupportedDockerVersion() + + if !specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_DAC_OVERRIDE) { + fmt.Println("Test requires sysadmin privileges to run. Try again with sudo.") + os.Exit(1) + } + + os.Exit(m.Run()) +} diff --git a/runsc/test/root/root.go b/runsc/test/root/root.go new file mode 100644 index 000000000..790f62c29 --- /dev/null +++ b/runsc/test/root/root.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package root is empty. See chroot_test.go for description. +package root diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 55ca353b8..cf61f2c10 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -267,6 +267,19 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) { return port, nil } +// SandboxPid returns the PID to the sandbox process. +func (d *Docker) SandboxPid() (int, error) { + out, err := do("inspect", "-f={{.State.Pid}}", d.Name) + if err != nil { + return -1, fmt.Errorf("error retrieving pid: %v", err) + } + pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + return -1, fmt.Errorf("error parsing pid %q: %v", out, err) + } + return pid, nil +} + // WaitForOutput calls 'docker logs' to retrieve containers output and searches // for the given pattern. func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index cdc7f78c3..b4664995c 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -238,7 +238,7 @@ func WaitForHTTP(port int, timeout time.Duration) error { } // RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If -// need it will create a new user namespace and reexecute the test as root +// needed it will create a new user namespace and re-execute the test as root // inside of the namespace. This functionr returns when it's running as root. If // it needs to create another process, it will exit from there and not return. func RunAsRoot() { @@ -246,6 +246,8 @@ func RunAsRoot() { return } + fmt.Println("*** Re-running test as root in new user namespace ***") + // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run // as root inside that namespace to get it. runtime.LockOSThread() -- cgit v1.2.3 From 4a00ea557c6e60cdd131b2a9866aa3b0bcb9cb2c Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 4 Oct 2018 11:00:40 -0700 Subject: Capture boot panics in debug log. Docker and Containerd both eat the boot processes stderr, making it difficult to track down panics (which are always written to stderr). This CL makes the boot process dup its debug log FD to stderr, so that panics will be captured in the debug log, which is better than nothing. This is the 3rd try at this CL. Previous attempts were foiled because Docker expects the 'create' command to pass its stdio directly to the container, so duping stderr in 'create' caused the applications stderr to go to the log file, which breaks many applications (including our mysql test). I added a new image_test that makes sure stdout and stderr are handled correctly. 
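The commit message above describes the technique at a high level: when the boot process owns its stdio, dup the debug log FD over stderr so that Go panics (which are always written to fd 2) land in the log file instead of being swallowed by Docker or containerd. The following is a minimal standalone sketch of that idea, with a made-up log path and the Linux-specific syscall.Dup2 (not runsc's actual flag handling, which the diff below shows).

```
// panicsink.go: minimal sketch of redirecting stderr into a log file so that
// panic stack traces are captured. The path is an example, not runsc's.
package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	f, err := os.Create("/tmp/debug.log")
	if err != nil {
		fmt.Fprintln(os.Stderr, "open log:", err)
		os.Exit(1)
	}
	// After Dup2, anything written to fd 2 (including the runtime's panic
	// output) goes to the log file. Dup2 is available on linux/amd64;
	// platforms without it would use Dup3 instead.
	if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil {
		fmt.Fprintln(os.Stderr, "dup2:", err)
		os.Exit(1)
	}
	panic("this stack trace ends up in /tmp/debug.log")
}
```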
PiperOrigin-RevId: 215767328 Change-Id: Icebac5a5dcf39b623b79d7a0e2f968e059130059 --- runsc/main.go | 18 +++++++++++++++++- runsc/test/image/image_test.go | 21 +++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/main.go b/runsc/main.go index 2a18c4b9e..16d30f7a0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -175,14 +175,30 @@ func main() { cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat) } + subcommand := flag.CommandLine.Arg(0) if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") + + // Quick sanity check to make sure no other commands get passed + // a log fd (they should use log dir instead). + if subcommand != "boot" { + cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' command, but was passed to %q", subcommand) + } + + // If we are the boot process, then we own our stdio FDs and + // can do what we want with them. Since Docker and Containerd + // both eat boot's stderr, we dup our stderr to the provided + // log FD so that panics will appear in the logs, rather than + // just disappear. + if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { + cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) + } + e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } else if *debugLogDir != "" { if err := os.MkdirAll(*debugLogDir, 0775); err != nil { cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err) } - subcommand := flag.CommandLine.Arg(0) f, err := specutils.DebugLogFile(*debugLogDir, subcommand) if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 341bdc1d5..428f05c04 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -303,6 +303,27 @@ func TestRuby(t *testing.T) { } } +func TestStdio(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("stdio-test") + + wantStdout := "hello stdout" + wantStderr := "bonjour stderr" + cmd := fmt.Sprintf("echo %q; echo %q 1>&2;", wantStdout, wantStderr) + if err := d.Run("alpine", "/bin/sh", "-c", cmd); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + for _, want := range []string{wantStdout, wantStderr} { + if _, err := d.WaitForOutput(want, 5*time.Second); err != nil { + t.Fatalf("docker didn't get output %q : %v", want, err) + } + } +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) -- cgit v1.2.3 From b8048f75daa2ec13059162cb421236f99e5e4a0e Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 8 Oct 2018 17:43:31 -0700 Subject: Uncapitalize error PiperOrigin-RevId: 216281263 Change-Id: Ie0c189e7f5934b77c6302336723bc1181fd2866c --- runsc/boot/loader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 726482bb2..c419b366f 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -142,7 +142,7 @@ func init() { // New also handles setting up a kernel for restoring a container. 
func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, stdioFDs []int, console bool) (*Loader, error) { if err := usage.Init(); err != nil { - return nil, fmt.Errorf("Error setting up memory usage: %v", err) + return nil, fmt.Errorf("error setting up memory usage: %v", err) } // Create kernel and platform. p, err := createPlatform(conf, deviceFD) -- cgit v1.2.3 From ae5122eb87b5b453d51b70cffe2a253333af9eb4 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 8 Oct 2018 20:47:47 -0700 Subject: Job control signals must be sent to all processes in the FG process group. We were previously only sending to the originator of the process group. Integration test was changed to test this behavior. It fails without the corresponding code change. PiperOrigin-RevId: 216297263 Change-Id: I7e41cfd6bdd067f4b9dc215e28f555fb5088916f --- runsc/boot/loader.go | 19 +++++++++++++------ runsc/test/integration/exec_test.go | 9 +++++---- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c419b366f..5716ef217 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -698,8 +698,6 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { // sendToFGProcess is true, then the signal will be sent to the foreground // process group in the same session that PID belongs to. func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess bool) error { - si := arch.SignalInfo{Signo: signo} - if pid <= 0 { return fmt.Errorf("failed to signal container %q PID %d: PID must be positive", cid, pid) } @@ -718,7 +716,7 @@ func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess boo if !sendToFGProcess { // Send signal directly to exec process. - return ep.tg.SendSignal(&si) + return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) } // Lookup foreground process group from the TTY for the given process, @@ -731,11 +729,20 @@ func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess boo // No foreground process group has been set. Signal the // original thread group. log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) - return ep.tg.SendSignal(&si) + return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) } - // Send the signal. - return pg.Originator().SendSignal(&si) + // Send the signal to all processes in the process group. + var lastErr error + for _, tg := range l.k.TaskSet().Root.ThreadGroups() { + if tg.ProcessGroup() != pg { + continue + } + if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { + lastErr = err + } + } + return lastErr } // signalContainer sends a signal to the root container process, or to all diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 014254aab..d08140ad3 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -81,16 +81,17 @@ func TestExecJobControl(t *testing.T) { } defer ptmx.Close() - // Call "sleep 100" in the shell. - if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil { + // Call "sleep 100 | cat" in the shell. We pipe to cat so that there + // will be two processes in the foreground process group. + if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil { t.Fatalf("error writing to pty: %v", err) } // Give shell a few seconds to start executing the sleep. 
time.Sleep(2 * time.Second) - // Send a ^C to the pty, which should kill sleep, but not the shell. - // \x03 is ASCII "end of text", which is the same as ^C. + // Send a ^C to the pty, which should kill sleep and cat, but not the + // shell. \x03 is ASCII "end of text", which is the same as ^C. if _, err := ptmx.Write([]byte{'\x03'}); err != nil { t.Fatalf("error writing to pty: %v", err) } -- cgit v1.2.3 From c36d2ef3733a0619b992f8ddc23b072474b04044 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 9 Oct 2018 15:11:46 -0700 Subject: Add new netstack metrics to the sentry PiperOrigin-RevId: 216431260 Change-Id: Ia6e5c8d506940148d10ff2884cf4440f470e5820 --- pkg/metric/metric.go | 54 ++++++++++++++++++++++++---------- pkg/sentry/socket/epsocket/BUILD | 1 + pkg/sentry/socket/epsocket/epsocket.go | 38 ++++++++++++++++++++++++ runsc/boot/loader.go | 7 +++-- 4 files changed, 83 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 0743612f0..763cd6bc2 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -48,9 +48,6 @@ var ( // TODO: Support metric fields. // type Uint64Metric struct { - // metadata describes the metric. It is immutable. - metadata *pb.MetricMetadata - // value is the actual value of the metric. It must be accessed // atomically. value uint64 @@ -101,24 +98,35 @@ func Disable() { } } -// NewUint64Metric creates a new metric with the given name. +type customUint64Metric struct { + // metadata describes the metric. It is immutable. + metadata *pb.MetricMetadata + + // value returns the current value of the metric. + value func() uint64 +} + +// RegisterCustomUint64Metric registers a metric with the given name. +// +// Register must only be called at init and will return and error if called +// after Initialized. // -// Metrics must be statically defined (i.e., at startup). NewUint64Metric will -// return an error if called after Initialized. +// All metrics must be cumulative, meaning that the return values of value must +// only increase over time. // // Preconditions: // * name must be globally unique. // * Initialize/Disable have not been called. -func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, error) { +func RegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) error { if initialized { - return nil, ErrInitializationDone + return ErrInitializationDone } if _, ok := allMetrics.m[name]; ok { - return nil, ErrNameInUse + return ErrNameInUse } - m := &Uint64Metric{ + allMetrics.m[name] = customUint64Metric{ metadata: &pb.MetricMetadata{ Name: name, Description: description, @@ -126,9 +134,25 @@ func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, Sync: sync, Type: pb.MetricMetadata_UINT64, }, + value: value, + } + return nil +} + +// MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric and panics +// if it returns an error. +func MustRegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) { + if err := RegisterCustomUint64Metric(name, sync, description, value); err != nil { + panic(fmt.Sprintf("Unable to register metric %q: %v", name, err)) } - allMetrics.m[name] = m - return m, nil +} + +// NewUint64Metric creates and registers a new metric with the given name. +// +// Metrics must be statically defined (i.e., at init). 
+func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, error) { + var m Uint64Metric + return &m, RegisterCustomUint64Metric(name, sync, description, m.Value) } // MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an @@ -158,13 +182,13 @@ func (m *Uint64Metric) IncrementBy(v uint64) { // metricSet holds named metrics. type metricSet struct { - m map[string]*Uint64Metric + m map[string]customUint64Metric } // makeMetricSet returns a new metricSet. func makeMetricSet() metricSet { return metricSet{ - m: make(map[string]*Uint64Metric), + m: make(map[string]customUint64Metric), } } @@ -172,7 +196,7 @@ func makeMetricSet() metricSet { func (m *metricSet) Values() metricValues { vals := make(metricValues) for k, v := range m.m { - vals[k] = v.Value() + vals[k] = v.value() } return vals } diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 49af8db85..7f9ea9edc 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/log", + "//pkg/metric", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 550569b4c..c5da18b0e 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -33,6 +33,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -53,6 +54,43 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +func mustCreateMetric(name, description string) *tcpip.StatCounter { + var cm tcpip.StatCounter + metric.MustRegisterCustomUint64Metric(name, false /* sync */, description, cm.Value) + return &cm +} + +// Metrics contains metrics exported by netstack. 
+var Metrics = tcpip.Stats{ + UnknownProtocolRcvdPackets: mustCreateMetric("/netstack/unknown_protocol_received_packets", "Number of packets received by netstack that were for an unknown or unsupported protocol."), + MalformedRcvdPackets: mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."), + DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."), + IP: tcpip.IPStats{ + PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), + InvalidAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), + PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), + PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."), + OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), + }, + TCP: tcpip.TCPStats{ + ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), + PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), + FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), + ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), + InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), + SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), + ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), + ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), + }, + UDP: tcpip.UDPStats{ + PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), + UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), + ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), + MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), + PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent via sendUDP."), + }, +} + const sizeOfInt32 int = 4 var errStackType = syserr.New("expected but did not receive an epsocket.Stack", linux.EINVAL) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 5716ef217..1ad6b09f4 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -683,11 +683,14 @@ func 
newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { // NetworkNone sets up loopback using netstack. netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} - s := &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})} + s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{ + Clock: clock, + Stats: epsocket.Metrics, + })} if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { return nil, fmt.Errorf("failed to enable SACK: %v", err) } - return s, nil + return &s, nil default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) -- cgit v1.2.3 From 20508bafb88d2037ea3b2c8483b191ce72e7ad7e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 9 Oct 2018 21:06:18 -0700 Subject: Add tests to verify gofer is chroot'ed PiperOrigin-RevId: 216472439 Change-Id: Ic4cb86c8e0a9cb022d3ceed9dc5615266c307cf9 --- runsc/cmd/debug.go | 2 +- runsc/cmd/list.go | 2 +- runsc/container/container.go | 8 ++--- runsc/test/root/chroot_test.go | 70 ++++++++++++++++++++++++++++++++++++++++++ runsc/test/testutil/docker.go | 9 ++++++ 5 files changed, 85 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index caa44168b..cb7d81057 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -85,7 +85,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error loading container %q: %v", id, err) } - if candidate.Pid() == d.pid { + if candidate.SandboxPid() == d.pid { c = candidate break } diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index d554bf7cf..4d4a5cb0b 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -94,7 +94,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, c := range containers { fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", c.ID, - c.Pid(), + c.SandboxPid(), c.Status, c.BundleDir, c.CreatedAt.Format(time.RFC3339Nano), diff --git a/runsc/container/container.go b/runsc/container/container.go index 827528349..f0cdee8d3 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -316,7 +316,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Write the PID file. Containerd considers the create complete after // this file is created, so it must be the last thing we do. if pidFile != "" { - if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.Pid())), 0644); err != nil { + if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { c.Destroy() return nil, fmt.Errorf("error writing PID file: %v", err) } @@ -426,9 +426,9 @@ func (c *Container) Event() (*boot.Event, error) { return c.Sandbox.Event(c.ID) } -// Pid returns the Pid of the sandbox the container is running in, or -1 if the +// SandboxPid returns the Pid of the sandbox the container is running in, or -1 if the // container is not running. 
-func (c *Container) Pid() int { +func (c *Container) SandboxPid() int { if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { return -1 } @@ -566,7 +566,7 @@ func (c *Container) State() specs.State { Version: specs.Version, ID: c.ID, Status: c.Status.String(), - Pid: c.Pid(), + Pid: c.SandboxPid(), Bundle: c.BundleDir, } } diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 5c59e7451..8831e6a78 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -24,6 +24,7 @@ import ( "fmt" "io/ioutil" "os" + "os/exec" "path/filepath" "reflect" "sort" @@ -91,6 +92,75 @@ func TestChroot(t *testing.T) { } } +func TestChrootGofer(t *testing.T) { + d := testutil.MakeDocker("chroot-test") + if err := d.Run("alpine", "sleep", "10000"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + // It's tricky to find gofers. Get sandbox PID first, then find parent. From + // parent get all immediate children, remove the sandbox, and everything else + // are gofers. + sandPID, err := d.SandboxPid() + if err != nil { + t.Fatalf("Docker.SandboxPid(): %v", err) + } + + // Find sandbox's parent PID. + cmd := fmt.Sprintf("grep PPid /proc/%d/status | awk '{print $2}'", sandPID) + parent, err := exec.Command("sh", "-c", cmd).CombinedOutput() + if err != nil { + t.Fatalf("failed to fetch runsc (%d) parent PID: %v, out:\n%s", sandPID, err, string(parent)) + } + parentPID, err := strconv.Atoi(strings.TrimSpace(string(parent))) + if err != nil { + t.Fatalf("failed to parse PPID %q: %v", string(parent), err) + } + + // Get all children from parent. + childrenOut, err := exec.Command("/usr/bin/pgrep", "-P", strconv.Itoa(parentPID)).CombinedOutput() + if err != nil { + t.Fatalf("failed to fetch containerd-shim children: %v", err) + } + children := strings.Split(strings.TrimSpace(string(childrenOut)), "\n") + + // This where the root directory is mapped on the host and that's where the + // gofer must have chroot'd to. + root, err := d.RootDirInHost() + if err != nil { + t.Fatalf("Docker.RootDirInHost(): %v", err) + } + + for _, child := range children { + childPID, err := strconv.Atoi(child) + if err != nil { + t.Fatalf("failed to parse child PID %q: %v", child, err) + } + if childPID == sandPID { + // Skip the sandbox, all other immediate children are gofers. + continue + } + + // Check that gofer is chroot'ed. + chroot, err := filepath.EvalSymlinks(filepath.Join("/proc", child, "root")) + if err != nil { + t.Fatalf("error resolving /proc//root symlink: %v", err) + } + if root != chroot { + t.Errorf("gofer chroot is wrong, want: %q, got: %q", root, chroot) + } + + path, err := filepath.EvalSymlinks(filepath.Join("/proc", child, "cwd")) + if err != nil { + t.Fatalf("error resolving /proc//cwd symlink: %v", err) + } + if root != path { + t.Errorf("gofer current dir is wrong, want: %q, got: %q", root, path) + } + } +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index cf61f2c10..d70b4377a 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -280,6 +280,15 @@ func (d *Docker) SandboxPid() (int, error) { return pid, nil } +// RootDirInHost returns where the root directory is mapped on the host. 
+func (d *Docker) RootDirInHost() (string, error) { + out, err := do("inspect", "-f={{.GraphDriver.Data.MergedDir}}", d.Name) + if err != nil { + return "", fmt.Errorf("error retrieving pid: %v", err) + } + return strings.TrimSuffix(string(out), "\n"), nil +} + // WaitForOutput calls 'docker logs' to retrieve containers output and searches // for the given pattern. func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { -- cgit v1.2.3 From 29cd05a7c66ee8061c0e5cf8e94c4e507dcf33e0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 10 Oct 2018 08:59:25 -0700 Subject: Add sandbox to cgroup Sandbox creation uses the limits and reservations configured in the OCI spec and set cgroup options accordinly. Then it puts both the sandbox and gofer processes inside the cgroup. It also allows the cgroup to be pre-configured by the caller. If the cgroup already exists, sandbox and gofer processes will join the cgroup but it will not modify the cgroup with spec limits. PiperOrigin-RevId: 216538209 Change-Id: If2c65ffedf55820baab743a0edcfb091b89c1019 --- runsc/boot/loader.go | 90 ++++--- runsc/boot/loader_test.go | 11 +- runsc/cgroup/BUILD | 24 ++ runsc/cgroup/cgroup.go | 405 +++++++++++++++++++++++++++++ runsc/cgroup/cgroup_test.go | 56 ++++ runsc/cmd/boot.go | 23 +- runsc/container/container.go | 16 +- runsc/sandbox/BUILD | 1 + runsc/sandbox/sandbox.go | 57 +++- runsc/specutils/BUILD | 1 - runsc/specutils/cpu.go | 90 ------- runsc/specutils/specutils.go | 37 +++ runsc/test/integration/BUILD | 4 +- runsc/test/integration/integration_test.go | 84 ++++++ runsc/test/testutil/docker.go | 9 + 15 files changed, 776 insertions(+), 132 deletions(-) create mode 100644 runsc/cgroup/BUILD create mode 100644 runsc/cgroup/cgroup.go create mode 100644 runsc/cgroup/cgroup_test.go delete mode 100644 runsc/specutils/cpu.go (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 1ad6b09f4..dc3c6c3d0 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,6 +20,7 @@ import ( "math/rand" "os" "os/signal" + "runtime" "sync" "sync/atomic" "syscall" @@ -138,14 +139,39 @@ func init() { kernel.RegisterSyscallTable(slinux.AMD64) } +// Args are the arguments for New(). +type Args struct { + // Id is the sandbox ID. + ID string + // Spec is the sandbox specification. + Spec *specs.Spec + // Conf is the system configuration. + Conf *Config + // ControllerFD is the FD to the URPC controller. + ControllerFD int + // DeviceFD is an optional argument that is passed to the platform. + DeviceFD int + // GoferFDs is an array of FDs used to connect with the Gofer. + GoferFDs []int + // StdioFDs is the stdio for the application. + StdioFDs []int + // Console is set to true if using TTY. + Console bool + // NumCPU is the number of CPUs to create inside the sandbox. + NumCPU int + // TotalMem is the initial amount of total memory to report back to the + // container. + TotalMem uint64 +} + // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. -func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, stdioFDs []int, console bool) (*Loader, error) { +func New(args Args) (*Loader, error) { if err := usage.Init(); err != nil { return nil, fmt.Errorf("error setting up memory usage: %v", err) } // Create kernel and platform. 
- p, err := createPlatform(conf, deviceFD) + p, err := createPlatform(args.Conf, args.DeviceFD) if err != nil { return nil, fmt.Errorf("error creating platform: %v", err) } @@ -168,7 +194,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } tk.SetClocks(time.NewCalibratedClocks()) - if err := enableStrace(conf); err != nil { + if err := enableStrace(args.Conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) } @@ -176,35 +202,41 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack, err := newEmptyNetworkStack(conf, k) + networkStack, err := newEmptyNetworkStack(args.Conf, k) if err != nil { return nil, fmt.Errorf("failed to create network: %v", err) } // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) + caps, err := specutils.Capabilities(args.Spec.Process.Capabilities) if err != nil { return nil, fmt.Errorf("error creating capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. - extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) - for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids)) + for _, GID := range args.Spec.Process.User.AdditionalGids { extraKGIDs = append(extraKGIDs, auth.KGID(GID)) } // Create credentials. creds := auth.NewUserCredentials( - auth.KUID(spec.Process.User.UID), - auth.KGID(spec.Process.User.GID), + auth.KUID(args.Spec.Process.User.UID), + auth.KGID(args.Spec.Process.User.GID), extraKGIDs, caps, auth.NewRootUserNamespace()) - // Get CPU numbers from spec. - cpuNum, err := specutils.CalculateCPUNumber(spec) - if err != nil { - return nil, fmt.Errorf("cannot get cpus from spec: %v", err) + if args.NumCPU == 0 { + args.NumCPU = runtime.NumCPU() + } + log.Infof("CPUs: %d", args.NumCPU) + + if args.TotalMem > 0 { + // Adjust the total memory returned by the Sentry so that applications that + // use /proc/meminfo can make allocations based on this limit. + usage.MinimumTotalMemoryBytes = args.TotalMem + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30)) } // Initiate the Kernel object, which is required by the Context passed @@ -214,9 +246,9 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, Timekeeper: tk, RootUserNamespace: creds.UserNamespace, NetworkStack: networkStack, - ApplicationCores: uint(cpuNum), + ApplicationCores: uint(args.NumCPU), Vdso: vdso, - RootUTSNamespace: kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace), + RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, "", creds.UserNamespace), RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), }); err != nil { @@ -224,7 +256,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // Turn on packet logging if enabled. - if conf.LogPackets { + if args.Conf.LogPackets { log.Infof("Packet logging enabled") atomic.StoreUint32(&sniffer.LogPackets, 1) } else { @@ -233,7 +265,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // Create a watchdog. 
- watchdog := watchdog.New(k, watchdog.DefaultTimeout, conf.WatchdogAction) + watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) // Create the control server using the provided FD. // @@ -244,7 +276,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, // misconfigured process will cause an error, and we want the control // server up before that so that we don't time out trying to connect to // it. - ctrl, err := newController(controllerFD, k, watchdog) + ctrl, err := newController(args.ControllerFD, k, watchdog) if err != nil { return nil, fmt.Errorf("error creating control server: %v", err) } @@ -255,20 +287,20 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) } // Ensure that signals received are forwarded to the emulated kernel. - ps := syscall.Signal(conf.PanicSignal) + ps := syscall.Signal(args.Conf.PanicSignal) startSignalForwarding := sighandling.PrepareForwarding(k, ps) - if conf.PanicSignal != -1 { - // Panics if the sentry receives 'conf.PanicSignal'. + if args.Conf.PanicSignal != -1 { + // Panics if the sentry receives 'Config.PanicSignal'. panicChan := make(chan os.Signal, 1) signal.Notify(panicChan, ps) go func() { // S/R-SAFE: causes sentry panic. <-panicChan panic("Signal-induced panic") }() - log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) + log.Infof("Panic signal set to %v(%d)", ps, args.Conf.PanicSignal) } - procArgs, err := newProcess(id, spec, creds, k) + procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -276,15 +308,15 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, l := &Loader{ k: k, ctrl: ctrl, - conf: conf, - console: console, + conf: args.Conf, + console: args.Console, watchdog: watchdog, - spec: spec, - goferFDs: goferFDs, - stdioFDs: stdioFDs, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: args.StdioFDs, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, - sandboxID: id, + sandboxID: args.ID, processes: make(map[execID]*execProcess), } ctrl.manager.l = l diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index ea8411a8b..10efa4427 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -102,7 +102,16 @@ func createLoader() (*Loader, func(), error) { } stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())} - l, err := New("foo", spec, conf, fd, -1 /* device fd */, []int{sandEnd}, stdio, false) + args := Args{ + ID: "foo", + Spec: spec, + Conf: conf, + ControllerFD: fd, + DeviceFD: -1, + GoferFDs: []int{sandEnd}, + StdioFDs: stdio, + } + l, err := New(args) if err != nil { cleanup() return nil, nil, err diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD new file mode 100644 index 000000000..4a535d230 --- /dev/null +++ b/runsc/cgroup/BUILD @@ -0,0 +1,24 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "cgroup", + srcs = ["cgroup.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/cgroup", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/log", + "//runsc/specutils", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) + +go_test( + name = "cgroup_test", + size = "small", + srcs = ["cgroup_test.go"], + embed = [":cgroup"], +) diff 
--git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go new file mode 100644 index 000000000..6a0092be8 --- /dev/null +++ b/runsc/cgroup/cgroup.go @@ -0,0 +1,405 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cgroup provides an interface to read and write configuration to +// cgroup. +package cgroup + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +const ( + cgroupRoot = "/sys/fs/cgroup" +) + +var controllers = map[string]controller{ + "blkio": &blockIO{}, + "cpu": &cpu{}, + "cpuset": &cpuSet{}, + "memory": &memory{}, + "net_cls": &networkClass{}, + "net_prio": &networkPrio{}, + + // These controllers either don't have anything in the OCI spec or is + // irrevalant for a sandbox, e.g. pids. + "devices": &noop{}, + "freezer": &noop{}, + "perf_event": &noop{}, + "pids": &noop{}, + "systemd": &noop{}, +} + +func setOptionalValueInt(path, name string, val *int64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatInt(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint(path, name string, val *uint64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint32(path, name string, val *uint32) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setOptionalValueUint16(path, name string, val *uint16) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setValue(path, name, data string) error { + fullpath := filepath.Join(path, name) + return ioutil.WriteFile(fullpath, []byte(data), 0700) +} + +func getValue(path, name string) (string, error) { + fullpath := filepath.Join(path, name) + out, err := ioutil.ReadFile(fullpath) + if err != nil { + return "", err + } + return string(out), nil +} + +// fillFromAncestor sets the value of a cgroup file from the first ancestor +// that has content. It does nothing if the file in 'path' has already been set. +func fillFromAncestor(path string) (string, error) { + out, err := ioutil.ReadFile(path) + if err != nil { + return "", err + } + val := strings.TrimSpace(string(out)) + if val != "" { + // File is set, stop here. + return val, nil + } + + // File is not set, recurse to parent and then set here. 
+ name := filepath.Base(path) + parent := filepath.Dir(filepath.Dir(path)) + val, err = fillFromAncestor(filepath.Join(parent, name)) + if err != nil { + return "", err + } + if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil { + return "", err + } + return val, nil +} + +func countCpuset(cpuset string) (int, error) { + var count int + for _, p := range strings.Split(cpuset, ",") { + interval := strings.Split(p, "-") + switch len(interval) { + case 1: + if _, err := strconv.Atoi(interval[0]); err != nil { + return 0, err + } + count++ + + case 2: + start, err := strconv.Atoi(interval[0]) + if err != nil { + return 0, err + } + end, err := strconv.Atoi(interval[1]) + if err != nil { + return 0, err + } + if start < 0 || end < 0 || start > end { + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + count += end - start + 1 + + default: + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + } + return count, nil +} + +// Cgroup represents a group inside all controllers. For example: Name='/foo/bar' +// maps to /sys/fs/cgroup//foo/bar on all controllers. +type Cgroup struct { + Name string `json:"name"` + Own bool `json:"own"` +} + +// New creates a new Cgroup instance if the spec includes a cgroup path. +// Otherwise it returns nil and false. +func New(spec *specs.Spec) (*Cgroup, bool) { + if spec.Linux == nil || spec.Linux.CgroupsPath == "" { + return nil, false + } + return &Cgroup{Name: spec.Linux.CgroupsPath}, true +} + +// Install creates and configures cgroups according to 'res'. If cgroup path +// already exists, it means that the caller has already provided a +// pre-configured cgroups, and 'res' is ignored. +func (c *Cgroup) Install(res *specs.LinuxResources) error { + if _, err := os.Stat(c.makePath("memory")); err == nil { + // If cgroup has already been created; it has been setup by caller. Don't + // make any changes to configuration, just join when sandbox/gofer starts. + log.Debugf("Using pre-created cgroup %q", c.Name) + return nil + } + + // Mark that cgroup resources are owned by me. + log.Debugf("Creating cgroup %q", c.Name) + c.Own = true + clean := specutils.MakeCleanup(func() { c.Uninstall() }) + defer clean.Clean() + + for key, ctrl := range controllers { + path := c.makePath(key) + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + if res != nil { + if err := ctrl.set(res, path); err != nil { + return err + } + } + } + clean.Release() + return nil +} + +// Uninstall removes the settings done in Install(). If cgroup path already +// existed when Install() was called, Uninstall is a noop. +func (c *Cgroup) Uninstall() error { + if !c.Own { + // cgroup is managed by caller, don't touch it. + return nil + } + log.Debugf("Deleting cgroup %q", c.Name) + for key := range controllers { + if err := syscall.Rmdir(c.makePath(key)); err != nil && !os.IsNotExist(err) { + return err + } + } + return nil +} + +// Add adds given process to all controllers. +func (c *Cgroup) Add(pid int) error { + for key := range controllers { + if err := setValue(c.makePath(key), "cgroup.procs", strconv.Itoa(pid)); err != nil { + return err + } + } + return nil +} + +// NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. +func (c *Cgroup) NumCPU() (int, error) { + path := c.makePath("cpuset") + cpuset, err := getValue(path, "cpuset.cpus") + if err != nil { + return 0, err + } + return countCpuset(strings.TrimSpace(cpuset)) +} + +// MemoryLimit returns the memory limit. 
+func (c *Cgroup) MemoryLimit() (uint64, error) { + path := c.makePath("memory") + limStr, err := getValue(path, "memory.limit_in_bytes") + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64) +} + +func (c *Cgroup) makePath(controllerName string) string { + return filepath.Join(cgroupRoot, controllerName, c.Name) +} + +type controller interface { + set(*specs.LinuxResources, string) error +} + +type noop struct{} + +func (*noop) set(*specs.LinuxResources, string) error { + return nil +} + +type memory struct{} + +func (*memory) set(spec *specs.LinuxResources, path string) error { + if spec.Memory == nil { + return nil + } + if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.soft_limit_in_bytes", spec.Memory.Reservation); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.memsw.limit_in_bytes", spec.Memory.Swap); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.limit_in_bytes", spec.Memory.Kernel); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.tcp.limit_in_bytes", spec.Memory.KernelTCP); err != nil { + return err + } + if err := setOptionalValueUint(path, "memory.swappiness", spec.Memory.Swappiness); err != nil { + return err + } + + if spec.Memory.DisableOOMKiller != nil && *spec.Memory.DisableOOMKiller { + if err := setValue(path, "memory.oom_control", "1"); err != nil { + return err + } + } + return nil +} + +type cpu struct{} + +func (*cpu) set(spec *specs.LinuxResources, path string) error { + if spec.CPU == nil { + return nil + } + if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil { + return err + } + if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil { + return err + } + return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period) +} + +type cpuSet struct{} + +func (*cpuSet) set(spec *specs.LinuxResources, path string) error { + // cpuset.cpus and mems are required fields, but are not set on a new cgroup. + // If not set in the spec, get it from one of the ancestors cgroup. 
+ if spec.CPU == nil || spec.CPU.Cpus == "" { + if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil { + return err + } + } else { + if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { + return err + } + } + + if spec.CPU == nil || spec.CPU.Mems == "" { + _, err := fillFromAncestor(filepath.Join(path, "cpuset.mems")) + return err + } + mems := spec.CPU.Mems + return setValue(path, "cpuset.mems", mems) +} + +type blockIO struct{} + +func (*blockIO) set(spec *specs.LinuxResources, path string) error { + if spec.BlockIO == nil { + return nil + } + + if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil { + return err + } + if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil { + return err + } + + for _, dev := range spec.BlockIO.WeightDevice { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Weight) + if err := setValue(path, "blkio.weight_device", val); err != nil { + return err + } + val = fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.LeafWeight) + if err := setValue(path, "blkio.leaf_weight_device", val); err != nil { + return err + } + } + if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil { + return err + } + return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice) +} + +func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error { + for _, dev := range devs { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate) + if err := setValue(path, name, val); err != nil { + return err + } + } + return nil +} + +type networkClass struct{} + +func (*networkClass) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) +} + +type networkPrio struct{} + +func (*networkPrio) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + for _, prio := range spec.Network.Priorities { + val := fmt.Sprintf("%s %d", prio.Name, prio.Priority) + if err := setValue(path, "net_prio.ifpriomap", val); err != nil { + return err + } + } + return nil +} diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go new file mode 100644 index 000000000..cde915329 --- /dev/null +++ b/runsc/cgroup/cgroup_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cgroup + +import ( + "testing" +) + +func TestCountCpuset(t *testing.T) { + for _, tc := range []struct { + str string + want int + error bool + }{ + {str: "0", want: 1}, + {str: "0,1,2,8,9,10", want: 6}, + {str: "0-1", want: 2}, + {str: "0-7", want: 8}, + {str: "0-7,16,32-39,64,65", want: 19}, + {str: "a", error: true}, + {str: "5-a", error: true}, + {str: "a-5", error: true}, + {str: "-10", error: true}, + {str: "15-", error: true}, + {str: "-", error: true}, + {str: "--", error: true}, + } { + t.Run(tc.str, func(t *testing.T) { + got, err := countCpuset(tc.str) + if tc.error { + if err == nil { + t.Errorf("countCpuset(%q) should have failed", tc.str) + } + } else { + if err != nil { + t.Errorf("countCpuset(%q) failed: %v", tc.str, err) + } + if tc.want != got { + t.Errorf("countCpuset(%q) want: %d, got: %d", tc.str, tc.want, got) + } + } + }) + } +} diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index c6f78f63f..d26e92bcd 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -59,6 +59,13 @@ type Boot struct { // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool + + // cpuNum number of CPUs to create inside the sandbox. + cpuNum int + + // totalMem sets the initial amount of total memory to report back to the + // container. + totalMem uint64 } // Name implements subcommands.Command.Name. @@ -86,6 +93,8 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") + f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") + f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -143,7 +152,19 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(f.Arg(0), spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.stdioFDs.GetArray(), b.console) + bootArgs := boot.Args{ + ID: f.Arg(0), + Spec: spec, + Conf: conf, + ControllerFD: b.controllerFD, + DeviceFD: b.deviceFD, + GoferFDs: b.ioFDs.GetArray(), + StdioFDs: b.stdioFDs.GetArray(), + Console: b.console, + NumCPU: b.cpuNum, + TotalMem: b.totalMem, + } + l, err := boot.New(bootArgs) if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index f0cdee8d3..eaa62daf1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -262,6 +262,8 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo Status: Creating, Owner: os.Getenv("USER"), } + cu := specutils.MakeCleanup(func() { c.Destroy() }) + defer cu.Clean() // If the metadata annotations indicate that this container should be // started in an existing sandbox, we must do so. The metadata will @@ -276,12 +278,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Start a new sandbox for this container. Any errors after this point // must destroy the container. 
- s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) + c.Sandbox, err = sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) if err != nil { - c.Destroy() return nil, err } - c.Sandbox = s + if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + return nil, err + } } else { // This is sort of confusing. For a sandbox with a root // container and a child container in it, runsc sees: @@ -300,7 +303,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Find the sandbox associated with this ID. sb, err := Load(conf.RootDir, sbid) if err != nil { - c.Destroy() return nil, err } c.Sandbox = sb.Sandbox @@ -309,7 +311,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Save the metadata file. if err := c.save(); err != nil { - c.Destroy() return nil, err } @@ -317,11 +318,11 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // this file is created, so it must be the last thing we do. if pidFile != "" { if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { - c.Destroy() return nil, fmt.Errorf("error writing PID file: %v", err) } } + cu.Release() return c, nil } @@ -358,6 +359,9 @@ func (c *Container) Start(conf *boot.Config) error { if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { return err } + if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + return err + } } // "If any poststart hook fails, the runtime MUST log a warning, but diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 09965dcc0..eb9c4cd76 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/platform/kvm", "//pkg/urpc", "//runsc/boot", + "//runsc/cgroup", "//runsc/console", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 847417a15..26d725bdd 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -34,6 +34,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cgroup" "gvisor.googlesource.com/gvisor/runsc/console" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -58,12 +59,26 @@ type Sandbox struct { // Chroot is the path to the chroot directory that the sandbox process // is running in. Chroot string `json:"chroot"` + + // Ccroup has the cgroup configuration for the sandbox. + Cgroup *cgroup.Cgroup `json:"cgroup"` } // Create creates the sandbox process. The caller must call Destroy() on the // sandbox. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} + c := specutils.MakeCleanup(func() { s.destroy() }) + defer c.Clean() + + if cg, ok := cgroup.New(spec); ok { + s.Cgroup = cg + + // If there is cgroup config, install it before creating sandbox process. + if err := s.Cgroup.Install(spec.Linux.Resources); err != nil { + return nil, fmt.Errorf("error configuring cgroup: %v", err) + } + } // Create the sandbox process. 
if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, ioFiles); err != nil { @@ -75,6 +90,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } + if s.Cgroup != nil { + if err := s.Cgroup.Add(s.Pid); err != nil { + return nil, fmt.Errorf("error adding sandbox to cgroup: %v", err) + } + } + + c.Release() return s, nil } @@ -483,6 +505,24 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } } + if s.Cgroup != nil { + cpuNum, err := s.Cgroup.NumCPU() + if err != nil { + return fmt.Errorf("error getting cpu count from cgroups: %v", err) + } + cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) + + mem, err := s.Cgroup.MemoryLimit() + if err != nil { + return fmt.Errorf("error getting memory limit from cgroups: %v", err) + } + // When memory limit is unset, a "large" number is returned. In that case, + // just stick with the default. + if mem < 0x7ffffffffffff000 { + cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) + } + } + // Add container as the last argument. cmd.Args = append(cmd.Args, s.ID) @@ -590,8 +630,15 @@ func (s *Sandbox) destroy() error { } } + if s.Cgroup != nil { + if err := s.Cgroup.Uninstall(); err != nil { + return err + } + } if s.Chroot != "" { - return tearDownChroot(s.Chroot) + if err := tearDownChroot(s.Chroot); err != nil { + return err + } } return nil @@ -761,6 +808,14 @@ func (s *Sandbox) waitForStopped() error { return backoff.Retry(op, b) } +// AddGoferToCgroup adds the gofer process to the sandbox's cgroup. +func (s *Sandbox) AddGoferToCgroup(pid int) error { + if s.Cgroup != nil { + return s.Cgroup.Add(pid) + } + return nil +} + // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index f1a99ce48..e73b2293f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", srcs = [ - "cpu.go", "namespace.go", "specutils.go", ], diff --git a/runsc/specutils/cpu.go b/runsc/specutils/cpu.go deleted file mode 100644 index 9abe26b64..000000000 --- a/runsc/specutils/cpu.go +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package specutils - -import ( - "fmt" - "runtime" - "strconv" - "strings" - - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -// CalculateCPUNumber calculates the number of CPUs that should be exposed -// inside the sandbox. -func CalculateCPUNumber(spec *specs.Spec) (int, error) { - // If spec does not contain CPU field, then return the number of host CPUs. 
- if spec == nil || spec.Linux == nil || spec.Linux.Resources == nil || spec.Linux.Resources.CPU == nil { - return runtime.NumCPU(), nil - } - cpuSpec := spec.Linux.Resources.CPU - - // If cpuSpec.Cpus is specified, then parse and return that. They must be in - // the list format for cpusets, which is "a comma-separated list of CPU - // numbers and ranges of numbers, in ASCII decimal." --man 7 cpuset. - cpus := cpuSpec.Cpus - if cpus != "" { - cpuNum := 0 - for _, subs := range strings.Split(cpus, ",") { - result, err := parseCPUNumber(subs) - if err != nil { - return 0, err - } - cpuNum += result - } - return cpuNum, nil - } - - // If CPU.Quota and CPU.Period are specified, we can divide them to get an - // approximation of the number of CPUs needed. - if cpuSpec.Quota != nil && cpuSpec.Period != nil && *cpuSpec.Period != 0 { - cpuQuota := *cpuSpec.Quota - cpuPeriod := *cpuSpec.Period - return int(cpuQuota)/int(cpuPeriod) + 1, nil - } - - // Default to number of host cpus. - return runtime.NumCPU(), nil -} - -// parseCPUNumber converts a cpuset string into the number of cpus included in -// the string , e.g. "3-6" -> 4. -func parseCPUNumber(cpus string) (int, error) { - switch cpusSlice := strings.Split(cpus, "-"); len(cpusSlice) { - case 1: - // cpus is not a range. We must only check that it is a valid number. - if _, err := strconv.Atoi(cpus); err != nil { - return 0, fmt.Errorf("invalid individual cpu number %q", cpus) - } - return 1, nil - case 2: - // cpus is a range. We must check that start and end are valid numbers, - // and calculate their difference (inclusively). - first, err := strconv.Atoi(cpusSlice[0]) - if err != nil || first < 0 { - return 0, fmt.Errorf("invalid first cpu number %q in range %q", cpusSlice[0], cpus) - } - last, err := strconv.Atoi(cpusSlice[1]) - if err != nil || last < 0 { - return 0, fmt.Errorf("invalid last cpu number %q in range %q", cpusSlice[1], cpus) - } - cpuNum := last - first + 1 - if cpuNum <= 0 { - return 0, fmt.Errorf("cpu range %q does not include positive number of cpus", cpus) - } - } - return 0, fmt.Errorf("invalid cpu string %q", cpus) -} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index daf10b875..ac017ba2d 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -43,6 +43,13 @@ func LogSpec(spec *specs.Spec) { log.Debugf("Spec: %+v", spec) log.Debugf("Spec.Hooks: %+v", spec.Hooks) log.Debugf("Spec.Linux: %+v", spec.Linux) + if spec.Linux != nil && spec.Linux.Resources != nil { + res := spec.Linux.Resources + log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory) + log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU) + log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO) + log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network) + } log.Debugf("Spec.Process: %+v", spec.Process) log.Debugf("Spec.Root: %+v", spec.Root) } @@ -402,3 +409,33 @@ func ContainsStr(strs []string, str string) bool { } return false } + +// Cleanup allows defers to be aborted when cleanup needs to happen +// conditionally. Usage: +// c := MakeCleanup(func() { f.Close() }) +// defer c.Clean() // any failure before release is called will close the file. +// ... +// c.Release() // on success, aborts closing the file and return it. +// return f +type Cleanup struct { + clean func() + released bool +} + +// MakeCleanup creates a new Cleanup object. +func MakeCleanup(f func()) Cleanup { + return Cleanup{clean: f} +} + +// Clean calls the cleanup function. 
+func (c *Cleanup) Clean() { + if !c.released { + c.clean() + } +} + +// Release releases the cleanup from its duties, i.e. cleanup function is not +// called after this point. +func (c *Cleanup) Release() { + c.released = true +} diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 4407016ad..726ebf49e 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -15,9 +15,7 @@ go_test( "manual", "local", ], - deps = [ - "//runsc/test/testutil", - ], + deps = ["//runsc/test/testutil"], ) go_library( diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 5f24aeed5..5480c5bbe 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -26,6 +26,7 @@ import ( "net" "net/http" "os" + "strconv" "strings" "testing" "time" @@ -179,6 +180,89 @@ func TestConnectToSelf(t *testing.T) { } } +func TestMemLimit(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'" + out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() + + // Remove warning message that swap isn't present. + if strings.HasPrefix(out, "WARNING") { + lines := strings.Split(out, "\n") + if len(lines) != 3 { + t.Fatalf("invalid output: %s", out) + } + out = lines[1] + } + + got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64) + if err != nil { + t.Fatalf("failed to parse %q: %v", out, err) + } + if want := uint64(500 * 1024); got != want { + t.Errorf("MemTotal got: %d, want: %d", got, want) + } +} + +func TestNumCPU(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l" + out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() + + got, err := strconv.Atoi(strings.TrimSpace(out)) + if err != nil { + t.Fatalf("failed to parse %q: %v", out, err) + } + if want := 1; got != want { + t.Errorf("MemTotal got: %d, want: %d", got, want) + } +} + +// TestCgroup sets cgroup options and checks that container can start. +// TODO: Verify that these were set to cgroup on the host. 
+func TestCgroup(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + + var args []string + args = append(args, "--cpu-shares=1000") + args = append(args, "--cpu-period=2000") + args = append(args, "--cpu-quota=3000") + args = append(args, "--cpuset-cpus=0") + args = append(args, "--cpuset-mems=0") + args = append(args, "--kernel-memory=100MB") + args = append(args, "--memory=1GB") + args = append(args, "--memory-reservation=500MB") + args = append(args, "--memory-swap=2GB") + args = append(args, "--memory-swappiness=5") + args = append(args, "--blkio-weight=750") + + args = append(args, "hello-world") + if err := d.Run(args...); err != nil { + t.Fatal("docker create failed:", err) + } + defer d.CleanUp() + + if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { + t.Fatalf("docker didn't say hello: %v", err) + } +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index d70b4377a..2f15ab818 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -198,6 +198,15 @@ func (d *Docker) Run(args ...string) error { return err } +// RunFg calls 'docker run' with the arguments provided in the foreground. It +// blocks until the container exits and returns the output. +func (d *Docker) RunFg(args ...string) (string, error) { + a := []string{"run", "--runtime", d.Runtime, "--name", d.Name} + a = append(a, args...) + out, err := do(a...) + return string(out), err +} + // Logs calls 'docker logs'. func (d *Docker) Logs() (string, error) { return do("logs", d.Name) -- cgit v1.2.3 From ddb34b3690c07f6c8efe2b96f89166145c4a7d3c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 10 Oct 2018 14:09:24 -0700 Subject: Enforce message size limits and avoid host calls with too many iovecs Currently, in the face of FileMem fragmentation and a large sendmsg or recvmsg call, host sockets may pass > 1024 iovecs to the host, which will immediately cause the host to return EMSGSIZE. When we detect this case, use a single intermediate buffer to pass to the kernel, copying to/from the src/dst buffer. To avoid creating unbounded intermediate buffers, enforce message size checks and truncation w.r.t. the send buffer size. The same functionality is added to netstack unix sockets for feature parity. 
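[Editor's note: the sketch below illustrates the intermediate-buffer approach described in the commit message above; it is not the gVisor implementation — the real change is the buildIovec helper in the diff that follows. The only assumed fact is the kernel's UIO_MAXIOV limit of 1024 iovecs per sendmsg/recvmsg call; the names coalesce and maxIovs are invented for the example.]

package main

import "fmt"

// maxIovs mirrors the kernel's UIO_MAXIOV limit: passing more iovecs than
// this to sendmsg(2)/recvmsg(2) makes the host return EMSGSIZE immediately.
const maxIovs = 1024

// coalesce leaves bufs untouched when the host would accept them as separate
// iovecs, and otherwise copies them into a single intermediate buffer so the
// host call needs exactly one iovec.
func coalesce(bufs [][]byte) [][]byte {
	if len(bufs) <= maxIovs {
		return bufs
	}
	total := 0
	for _, b := range bufs {
		total += len(b)
	}
	out := make([]byte, 0, total)
	for _, b := range bufs {
		out = append(out, b...)
	}
	return [][]byte{out}
}

func main() {
	// 2000 one-byte fragments would exceed UIO_MAXIOV; after coalescing the
	// host sees a single 2000-byte buffer instead.
	bufs := make([][]byte, 2000)
	for i := range bufs {
		bufs[i] = []byte{byte(i)}
	}
	fmt.Println(len(coalesce(bufs)))
}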
PiperOrigin-RevId: 216590198 Change-Id: I719a32e71c7b1098d5097f35e6daf7dd5190eff7 --- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/socket.go | 145 +++++++++++++++++++++-------- pkg/sentry/fs/host/socket_iovec.go | 113 ++++++++++++++++++++++ pkg/sentry/fs/host/socket_unsafe.go | 64 ++++++++----- pkg/sentry/socket/unix/unix.go | 17 +++- pkg/syserr/netstack.go | 2 + pkg/syserror/syserror.go | 1 + pkg/tcpip/link/rawfile/errors.go | 2 + pkg/tcpip/tcpip.go | 2 + pkg/tcpip/transport/queue/queue.go | 69 +++++++++++--- pkg/tcpip/transport/tcp/endpoint_state.go | 2 + pkg/tcpip/transport/udp/endpoint.go | 6 ++ pkg/tcpip/transport/unix/connectionless.go | 6 +- pkg/tcpip/transport/unix/unix.go | 49 ++++++---- runsc/boot/filter/config.go | 10 +- 15 files changed, 386 insertions(+), 103 deletions(-) create mode 100644 pkg/sentry/fs/host/socket_iovec.go (limited to 'runsc') diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index c34f1c26b..6d5640f0a 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -15,6 +15,7 @@ go_library( "inode_state.go", "ioctl_unsafe.go", "socket.go", + "socket_iovec.go", "socket_state.go", "socket_unsafe.go", "tty.go", diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index e11772946..68ebf6402 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -33,6 +34,11 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) +// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. +// +// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). +const maxSendBufferSize = 8 << 20 + // endpoint encapsulates the state needed to represent a host Unix socket. // // TODO: Remove/merge with ConnectedEndpoint. @@ -41,15 +47,17 @@ import ( type endpoint struct { queue waiter.Queue `state:"zerovalue"` - // stype is the type of Unix socket. (Ex: unix.SockStream, - // unix.SockSeqpacket, unix.SockDgram) - stype unix.SockType `state:"nosave"` - // fd is the host fd backing this file. fd int `state:"nosave"` // If srfd >= 0, it is the host fd that fd was imported from. srfd int `state:"wait"` + + // stype is the type of Unix socket. + stype unix.SockType `state:"nosave"` + + // sndbuf is the size of the send buffer. 
+ sndbuf int `state:"nosave"` } func (e *endpoint) init() error { @@ -67,12 +75,21 @@ func (e *endpoint) init() error { if err != nil { return err } + e.stype = unix.SockType(stype) + + e.sndbuf, err = syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return err + } + if e.sndbuf > maxSendBufferSize { + log.Warningf("Socket send buffer too large: %d", e.sndbuf) + return syserror.EINVAL + } if err := syscall.SetNonblock(e.fd, true); err != nil { return err } - e.stype = unix.SockType(stype) return fdnotifier.AddFD(int32(e.fd), &e.queue) } @@ -189,13 +206,13 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = 0 return nil case *tcpip.SendBufferSizeOption: - v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) - *o = tcpip.SendBufferSizeOption(v) - return translateError(err) + *o = tcpip.SendBufferSizeOption(e.sndbuf) + return nil case *tcpip.ReceiveBufferSizeOption: - v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF) - *o = tcpip.ReceiveBufferSizeOption(v) - return translateError(err) + // N.B. Unix sockets don't use the receive buffer. We'll claim it is + // the same size as the send buffer. + *o = tcpip.ReceiveBufferSizeOption(e.sndbuf) + return nil case *tcpip.ReuseAddressOption: v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR) *o = tcpip.ReuseAddressOption(v) @@ -240,33 +257,47 @@ func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, if to != nil { return 0, tcpip.ErrInvalidEndpointState } - return sendMsg(e.fd, data, controlMessages) + + // Since stream sockets don't preserve message boundaries, we can write + // only as much of the message as fits in the send buffer. + truncate := e.stype == unix.SockStream + + return sendMsg(e.fd, data, controlMessages, e.sndbuf, truncate) } -func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages) (uintptr, *tcpip.Error) { +func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages, maxlen int, truncate bool) (uintptr, *tcpip.Error) { if !controlMessages.Empty() { return 0, tcpip.ErrInvalidEndpointState } - n, err := fdWriteVec(fd, data) + n, totalLen, err := fdWriteVec(fd, data, maxlen, truncate) + if n < totalLen && err == nil { + // The host only returns a short write if it would otherwise + // block (and only for stream sockets). + err = syserror.EAGAIN + } return n, translateError(err) } // RecvMsg implements unix.Endpoint.RecvMsg. func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { - return recvMsg(e.fd, data, numRights, peek, addr) + // N.B. Unix sockets don't have a receive buffer, the send buffer + // serves both purposes. + rl, ml, cm, err := recvMsg(e.fd, data, numRights, peek, addr, e.sndbuf) + if rl > 0 && err == tcpip.ErrWouldBlock { + // Message did not fill buffer; that's fine, no need to block. 
+ err = nil + } + return rl, ml, cm, err } -func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { +func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress, maxlen int) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { var cm unet.ControlMessage if numRights > 0 { cm.EnableFDs(int(numRights)) } - rl, ml, cl, err := fdReadVec(fd, data, []byte(cm), peek) - if err == syscall.EAGAIN { - return 0, 0, unix.ControlMessages{}, tcpip.ErrWouldBlock - } - if err != nil { - return 0, 0, unix.ControlMessages{}, translateError(err) + rl, ml, cl, rerr := fdReadVec(fd, data, []byte(cm), peek, maxlen) + if rl == 0 && rerr != nil { + return 0, 0, unix.ControlMessages{}, translateError(rerr) } // Trim the control data if we received less than the full amount. @@ -276,7 +307,7 @@ func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.Fu // Avoid extra allocations in the case where there isn't any control data. if len(cm) == 0 { - return rl, ml, unix.ControlMessages{}, nil + return rl, ml, unix.ControlMessages{}, translateError(rerr) } fds, err := cm.ExtractFDs() @@ -285,9 +316,9 @@ func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.Fu } if len(fds) == 0 { - return rl, ml, unix.ControlMessages{}, nil + return rl, ml, unix.ControlMessages{}, translateError(rerr) } - return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil + return rl, ml, control.New(nil, nil, newSCMRights(fds)), translateError(rerr) } // NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD @@ -307,7 +338,27 @@ func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*Conne return nil, tcpip.ErrInvalidEndpointState } - e := &ConnectedEndpoint{path: path, queue: queue, file: file} + stype, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return nil, translateError(err) + } + + sndbuf, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return nil, translateError(err) + } + if sndbuf > maxSendBufferSize { + log.Warningf("Socket send buffer too large: %d", sndbuf) + return nil, tcpip.ErrInvalidEndpointState + } + + e := &ConnectedEndpoint{ + path: path, + queue: queue, + file: file, + stype: unix.SockType(stype), + sndbuf: sndbuf, + } // AtomicRefCounters start off with a single reference. We need two. e.ref.IncRef() @@ -346,6 +397,17 @@ type ConnectedEndpoint struct { // writeClosed is true if the FD has write shutdown or if it has been // closed. writeClosed bool + + // stype is the type of Unix socket. + stype unix.SockType + + // sndbuf is the size of the send buffer. + // + // N.B. When this is smaller than the host size, we present it via + // GetSockOpt and message splitting/rejection in SendMsg, but do not + // prevent lots of small messages from filling the real send buffer + // size on the host. + sndbuf int } // Send implements unix.ConnectedEndpoint.Send. @@ -355,7 +417,12 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMess if c.writeClosed { return 0, false, tcpip.ErrClosedForSend } - n, err := sendMsg(c.file.FD(), data, controlMessages) + + // Since stream sockets don't preserve message boundaries, we can write + // only as much of the message as fits in the send buffer. 
+ truncate := c.stype == unix.SockStream + + n, err := sendMsg(c.file.FD(), data, controlMessages, c.sndbuf, truncate) // There is no need for the callee to call SendNotify because sendMsg uses // the host's sendmsg(2) and the host kernel's queue. return n, false, err @@ -411,7 +478,15 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p if c.readClosed { return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive } - rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil) + + // N.B. Unix sockets don't have a receive buffer, the send buffer + // serves both purposes. + rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil, c.sndbuf) + if rl > 0 && err == tcpip.ErrWouldBlock { + // Message did not fill buffer; that's fine, no need to block. + err = nil + } + // There is no need for the callee to call RecvNotify because recvMsg uses // the host's recvmsg(2) and the host kernel's queue. return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err @@ -460,20 +535,14 @@ func (c *ConnectedEndpoint) RecvQueuedSize() int64 { // SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize. func (c *ConnectedEndpoint) SendMaxQueueSize() int64 { - v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) - if err != nil { - return -1 - } - return int64(v) + return int64(c.sndbuf) } // RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize. func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { - v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF) - if err != nil { - return -1 - } - return int64(v) + // N.B. Unix sockets don't use the receive buffer. We'll claim it is + // the same size as the send buffer. + return int64(c.sndbuf) } // Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go new file mode 100644 index 000000000..1a9587b90 --- /dev/null +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// maxIovs is the maximum number of iovecs to pass to the host. +var maxIovs = linux.UIO_MAXIOV + +// copyToMulti copies as many bytes from src to dst as possible. +func copyToMulti(dst [][]byte, src []byte) { + for _, d := range dst { + done := copy(d, src) + src = src[done:] + if len(src) == 0 { + break + } + } +} + +// copyFromMulti copies as many bytes from src to dst as possible. +func copyFromMulti(dst []byte, src [][]byte) { + for _, s := range src { + done := copy(dst, s) + dst = dst[done:] + if len(dst) == 0 { + break + } + } +} + +// buildIovec builds an iovec slice from the given []byte slice. +// +// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error. 
+// +// If length < the total length of bufs, err indicates why, even when returning +// a truncated iovec. +// +// If intermediate != nil, iovecs references intermediate rather than bufs and +// the caller must copy to/from bufs as necessary. +func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovecs []syscall.Iovec, intermediate []byte, err error) { + var iovsRequired int + for _, b := range bufs { + length += uintptr(len(b)) + if len(b) > 0 { + iovsRequired++ + } + } + + stopLen := length + if length > uintptr(maxlen) { + if truncate { + stopLen = uintptr(maxlen) + err = syserror.EAGAIN + } else { + return 0, nil, nil, syserror.EMSGSIZE + } + } + + if iovsRequired > maxIovs { + // The kernel will reject our call if we pass this many iovs. + // Use a single intermediate buffer instead. + b := make([]byte, stopLen) + + return stopLen, []syscall.Iovec{{ + Base: &b[0], + Len: uint64(stopLen), + }}, b, err + } + + var total uintptr + iovecs = make([]syscall.Iovec, 0, iovsRequired) + for i := range bufs { + l := len(bufs[i]) + if l == 0 { + continue + } + + stop := l + if total+uintptr(stop) > stopLen { + stop = int(stopLen - total) + } + + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(stop), + }) + + total += uintptr(stop) + if total >= stopLen { + break + } + } + + return total, iovecs, nil, err +} diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index bf8da6867..5e4c5feed 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -19,29 +19,23 @@ import ( "unsafe" ) -// buildIovec builds an iovec slice from the given []byte slice. -func buildIovec(bufs [][]byte) (uintptr, []syscall.Iovec) { - var length uintptr - iovecs := make([]syscall.Iovec, 0, 10) - for i := range bufs { - if l := len(bufs[i]); l > 0 { - length += uintptr(l) - iovecs = append(iovecs, syscall.Iovec{ - Base: &bufs[i][0], - Len: uint64(l), - }) - } - } - return length, iovecs -} - -func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { +// fdReadVec receives from fd to bufs. +// +// If the total length of bufs is > maxlen, fdReadVec will do a partial read +// and err will indicate why the message was truncated. +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) if peek { flags |= syscall.MSG_PEEK } - length, iovecs := buildIovec(bufs) + // Always truncate the receive buffer. All socket types will truncate + // received messages. + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, 0, 0, err + } var msg syscall.Msghdr if len(control) != 0 { @@ -53,30 +47,52 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintpt msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } + n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. return 0, 0, 0, e } + // Copy data back to bufs. 
+ if intermediate != nil { + copyToMulti(bufs, intermediate) + } + if n > length { - return length, n, msg.Controllen, nil + return length, n, msg.Controllen, err } - return n, n, msg.Controllen, nil + return n, n, msg.Controllen, err } -func fdWriteVec(fd int, bufs [][]byte) (uintptr, error) { - _, iovecs := buildIovec(bufs) +// fdWriteVec sends from bufs to fd. +// +// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a +// partial write and err will indicate why the message was truncated. +func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uintptr, error) { + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, length, err + } + + // Copy data to intermediate buf. + if intermediate != nil { + copyFromMulti(intermediate, bufs) + } var msg syscall.Msghdr if len(iovecs) > 0 { msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) if e != 0 { - return 0, e + // N.B. prioritize the syscall error over the buildIovec error. + return 0, length, e } - return n, nil + return n, length, err } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 1c22e78b3..e30378e60 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -378,7 +378,8 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] w.To = ep } - if n, err := src.CopyInTo(t, &w); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + n, err := src.CopyInTo(t, &w) + if err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return int(n), syserr.FromError(err) } @@ -388,15 +389,23 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] s.EventRegister(&e, waiter.EventOut) defer s.EventUnregister(&e) + total := n for { - if n, err := src.CopyInTo(t, &w); err != syserror.ErrWouldBlock { - return int(n), syserr.FromError(err) + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + n, err = src.CopyInTo(t, &w) + total += n + if err != syserror.ErrWouldBlock { + break } if err := t.Block(ch); err != nil { - return 0, syserr.FromError(err) + break } } + + return int(total), syserr.FromError(err) } // Passcred implements unix.Credentialer.Passcred. 
diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index c40fb7dbf..b9786b48f 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -78,6 +78,8 @@ var netstackErrorTranslations = map[*tcpip.Error]*Error{ tcpip.ErrNoLinkAddress: ErrHostDown, tcpip.ErrBadAddress: ErrBadAddress, tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, + tcpip.ErrMessageTooLong: ErrMessageTooLong, + tcpip.ErrNoBufferSpace: ErrNoBufferSpace, } // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 6f8a7a319..5bc74e65e 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -44,6 +44,7 @@ var ( ELIBBAD = error(syscall.ELIBBAD) ELOOP = error(syscall.ELOOP) EMFILE = error(syscall.EMFILE) + EMSGSIZE = error(syscall.EMSGSIZE) ENAMETOOLONG = error(syscall.ENAMETOOLONG) ENOATTR = ENODATA ENODATA = error(syscall.ENODATA) diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index 7f213793e..de7593d9c 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -41,6 +41,8 @@ var translations = map[syscall.Errno]*tcpip.Error{ syscall.ENOTCONN: tcpip.ErrNotConnected, syscall.ECONNRESET: tcpip.ErrConnectionReset, syscall.ECONNABORTED: tcpip.ErrConnectionAborted, + syscall.EMSGSIZE: tcpip.ErrMessageTooLong, + syscall.ENOBUFS: tcpip.ErrNoBufferSpace, } // TranslateErrno translate an errno from the syscall package into a diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index f5b5ec86b..cef27948c 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -98,6 +98,8 @@ var ( ErrNoLinkAddress = &Error{msg: "no remote link address"} ErrBadAddress = &Error{msg: "bad address"} ErrNetworkUnreachable = &Error{msg: "network is unreachable"} + ErrMessageTooLong = &Error{msg: "message too long"} + ErrNoBufferSpace = &Error{msg: "no buffer space available"} ) // Errors related to Subnet diff --git a/pkg/tcpip/transport/queue/queue.go b/pkg/tcpip/transport/queue/queue.go index eb9ee8a3f..b3d2ea68b 100644 --- a/pkg/tcpip/transport/queue/queue.go +++ b/pkg/tcpip/transport/queue/queue.go @@ -24,12 +24,23 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// Entry implements Linker interface and has both Length and Release methods. +// Entry implements Linker interface and has additional required methods. type Entry interface { ilist.Linker + + // Length returns the number of bytes stored in the entry. Length() int64 + + // Release releases any resources held by the entry. Release() + + // Peek returns a copy of the entry. It must be Released separately. Peek() Entry + + // Truncate reduces the number of bytes stored in the entry to n bytes. + // + // Preconditions: n <= Length(). + Truncate(n int64) } // Queue is a buffer queue. @@ -52,7 +63,7 @@ func New(ReaderQueue *waiter.Queue, WriterQueue *waiter.Queue, limit int64) *Que } // Close closes q for reading and writing. It is immediately not writable and -// will become unreadble will no more data is pending. +// will become unreadable when no more data is pending. // // Both the read and write queues must be notified after closing: // q.ReaderQueue.Notify(waiter.EventIn) @@ -86,38 +97,74 @@ func (q *Queue) IsReadable() bool { return q.closed || q.dataList.Front() != nil } +// bufWritable returns true if there is space for writing. +// +// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is +// free. +// +// See net/unix/af_unix.c:unix_writeable. 
+func (q *Queue) bufWritable() bool { + return 4*q.used < q.limit +} + // IsWritable determines if q is currently writable. func (q *Queue) IsWritable() bool { q.mu.Lock() defer q.mu.Unlock() - return q.closed || q.used < q.limit + return q.closed || q.bufWritable() } // Enqueue adds an entry to the data queue if room is available. // +// If truncate is true, Enqueue may truncate the message beforing enqueuing it. +// Otherwise, the entire message must fit. If n < e.Length(), err indicates why. +// // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.EventIn) -func (q *Queue) Enqueue(e Entry) (notify bool, err *tcpip.Error) { +func (q *Queue) Enqueue(e Entry, truncate bool) (l int64, notify bool, err *tcpip.Error) { q.mu.Lock() if q.closed { q.mu.Unlock() - return false, tcpip.ErrClosedForSend + return 0, false, tcpip.ErrClosedForSend + } + + free := q.limit - q.used + + l = e.Length() + + if l > free && truncate { + if free == 0 { + // Message can't fit right now. + q.mu.Unlock() + return 0, false, tcpip.ErrWouldBlock + } + + e.Truncate(free) + l = e.Length() + err = tcpip.ErrWouldBlock + } + + if l > q.limit { + // Message is too big to ever fit. + q.mu.Unlock() + return 0, false, tcpip.ErrMessageTooLong } - if q.used >= q.limit { + if l > free { + // Message can't fit right now. q.mu.Unlock() - return false, tcpip.ErrWouldBlock + return 0, false, tcpip.ErrWouldBlock } notify = q.dataList.Front() == nil - q.used += e.Length() + q.used += l q.dataList.PushBack(e) q.mu.Unlock() - return notify, nil + return l, notify, err } // Dequeue removes the first entry in the data queue, if one exists. @@ -137,13 +184,13 @@ func (q *Queue) Dequeue() (e Entry, notify bool, err *tcpip.Error) { return nil, false, err } - notify = q.used >= q.limit + notify = !q.bufWritable() e = q.dataList.Front().(Entry) q.dataList.Remove(e) q.used -= e.Length() - notify = notify && q.used < q.limit + notify = notify && q.bufWritable() q.mu.Unlock() diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 6143390b3..bed7ec6a6 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -315,6 +315,8 @@ func loadError(s string) *tcpip.Error { tcpip.ErrNoLinkAddress, tcpip.ErrBadAddress, tcpip.ErrNetworkUnreachable, + tcpip.ErrMessageTooLong, + tcpip.ErrNoBufferSpace, } messageToError = make(map[string]*tcpip.Error) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 6ed805357..840e95302 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -15,6 +15,7 @@ package udp import ( + "math" "sync" "gvisor.googlesource.com/gvisor/pkg/sleep" @@ -264,6 +265,11 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c return 0, nil, tcpip.ErrInvalidOptionValue } + if p.Size() > math.MaxUint16 { + // Payload can't possibly fit in a packet. 
+ return 0, nil, tcpip.ErrMessageTooLong + } + to := opts.To e.mu.RLock() diff --git a/pkg/tcpip/transport/unix/connectionless.go b/pkg/tcpip/transport/unix/connectionless.go index ebd4802b0..ae93c61d7 100644 --- a/pkg/tcpip/transport/unix/connectionless.go +++ b/pkg/tcpip/transport/unix/connectionless.go @@ -105,14 +105,12 @@ func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to Bo e.Lock() n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() - if err != nil { - return 0, err - } + if notify { connected.SendNotify() } - return n, nil + return n, err } // Type implements Endpoint.Type. diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go index 0bb00df42..718606cd1 100644 --- a/pkg/tcpip/transport/unix/unix.go +++ b/pkg/tcpip/transport/unix/unix.go @@ -260,20 +260,28 @@ type message struct { Address tcpip.FullAddress } -// Length returns number of bytes stored in the Message. +// Length returns number of bytes stored in the message. func (m *message) Length() int64 { return int64(len(m.Data)) } -// Release releases any resources held by the Message. +// Release releases any resources held by the message. func (m *message) Release() { m.Control.Release() } +// Peek returns a copy of the message. func (m *message) Peek() queue.Entry { return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} } +// Truncate reduces the length of the message payload to n bytes. +// +// Preconditions: n <= m.Length(). +func (m *message) Truncate(n int64) { + m.Data.CapLength(int(n)) +} + // A Receiver can be used to receive Messages. type Receiver interface { // Recv receives a single message. This method does not block. @@ -623,23 +631,33 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) // Send implements ConnectedEndpoint.Send. func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { - var l int + var l int64 for _, d := range data { - l += len(d) - } - // Discard empty stream packets. Since stream sockets don't preserve - // message boundaries, sending zero bytes is a no-op. In Linux, the - // receiver actually uses a zero-length receive as an indication that the - // stream was closed. - if l == 0 && e.endpoint.Type() == SockStream { - controlMessages.Release() - return 0, false, nil + l += int64(len(d)) + } + + truncate := false + if e.endpoint.Type() == SockStream { + // Since stream sockets don't preserve message boundaries, we + // can write only as much of the message as fits in the queue. + truncate = true + + // Discard empty stream packets. Since stream sockets don't + // preserve message boundaries, sending zero bytes is a no-op. + // In Linux, the receiver actually uses a zero-length receive + // as an indication that the stream was closed. + if l == 0 { + controlMessages.Release() + return 0, false, nil + } } + v := make([]byte, 0, l) for _, d := range data { v = append(v, d...) 
} - notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}) + + l, notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}, truncate) return uintptr(l), notify, err } @@ -793,15 +811,12 @@ func (e *baseEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoin n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() - if err != nil { - return 0, err - } if notify { e.connected.SendNotify() } - return n, nil + return n, err } // SetSockOpt sets a socket option. Currently not supported. diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 06c04e3bb..92a73db9a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -121,11 +121,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(syscall.SOL_SOCKET), seccomp.AllowValue(syscall.SO_SNDBUF), }, - { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), - }, { seccomp.AllowAny{}, seccomp.AllowValue(syscall.SOL_SOCKET), @@ -304,6 +299,11 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.AllowValue(syscall.SOL_SOCKET), seccomp.AllowValue(syscall.SO_SNDBUF), }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + }, { seccomp.AllowAny{}, seccomp.AllowValue(syscall.SOL_SOCKET), -- cgit v1.2.3 From 8388a505e735045f31c6f7180711ef57148dc517 Mon Sep 17 00:00:00 2001 From: Jonathan Giannuzzi Date: Wed, 10 Oct 2018 14:17:27 -0700 Subject: Support for older Linux kernels without getrandom Change-Id: I1fb9f5b47a264a7617912f6f56f995f3c4c5e578 PiperOrigin-RevId: 216591484 --- pkg/rand/rand_linux.go | 31 +++++++++++++++++++++++++++---- runsc/boot/BUILD | 1 + runsc/boot/loader.go | 12 ++++++++++-- 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go index 37ac07620..a2be66b3b 100644 --- a/pkg/rand/rand_linux.go +++ b/pkg/rand/rand_linux.go @@ -17,23 +17,46 @@ package rand import ( + "crypto/rand" "io" + "sync" "golang.org/x/sys/unix" ) // reader implements an io.Reader that returns pseudorandom bytes. -type reader struct{} +type reader struct { + once sync.Once + useGetrandom bool +} // Read implements io.Reader.Read. -func (reader) Read(p []byte) (int, error) { - return unix.Getrandom(p, 0) +func (r *reader) Read(p []byte) (int, error) { + r.once.Do(func() { + _, err := unix.Getrandom(p, 0) + if err != unix.ENOSYS { + r.useGetrandom = true + } + }) + + if r.useGetrandom { + return unix.Getrandom(p, 0) + } + return rand.Read(p) } // Reader is the default reader. -var Reader io.Reader = reader{} +var Reader io.Reader = &reader{} // Read reads from the default reader. func Read(b []byte) (int, error) { return io.ReadFull(Reader, b) } + +// Init can be called to make sure /dev/urandom is pre-opened on kernels that +// do not support getrandom(2). 
+func Init() error { + p := make([]byte, 1) + _, err := Read(p) + return err +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index a38a3a94e..c1e035d3b 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -25,6 +25,7 @@ go_library( "//pkg/control/server", "//pkg/cpuid", "//pkg/log", + "//pkg/rand", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/control", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index dc3c6c3d0..859446344 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -17,7 +17,7 @@ package boot import ( "fmt" - "math/rand" + mrand "math/rand" "os" "os/signal" "runtime" @@ -30,6 +30,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" @@ -133,7 +134,7 @@ type execProcess struct { func init() { // Initialize the random number generator. - rand.Seed(gtime.Now().UnixNano()) + mrand.Seed(gtime.Now().UnixNano()) // Register the global syscall table. kernel.RegisterSyscallTable(slinux.AMD64) @@ -167,9 +168,16 @@ type Args struct { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(args Args) (*Loader, error) { + // We initialize the rand package now to make sure /dev/urandom is pre-opened + // on kernels that do not support getrandom(2). + if err := rand.Init(); err != nil { + return nil, fmt.Errorf("error setting up rand: %v", err) + } + if err := usage.Init(); err != nil { return nil, fmt.Errorf("error setting up memory usage: %v", err) } + // Create kernel and platform. p, err := createPlatform(args.Conf, args.DeviceFD) if err != nil { -- cgit v1.2.3 From 1939cd020f19a17707eb5e597e010fea8ab35de5 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 10 Oct 2018 14:33:59 -0700 Subject: runsc: Pass controlling TTY by FD in the *new* process, not current process. When setting Cmd.SysProcAttr.Ctty, the FD must be the FD of the controlling TTY in the new process, not the current process. The ioctl call is made after duping all FDs in Cmd.ExtraFiles, which may stomp on the old TTY FD. This fixes the "bad address" flakes in runsc/container:container_test, although some other flakes remain. PiperOrigin-RevId: 216594394 Change-Id: Idfd1677abb866aa82ad7e8be776f0c9087256862 --- runsc/sandbox/sandbox.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 26d725bdd..7f1afc34b 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -363,11 +363,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund return fmt.Errorf("error setting up console with socket %q: %v", consoleSocket, err) } defer tty.Close() - fd := int(tty.Fd()) - - // Set the TTY as a controlling TTY on the sandbox process. - cmd.SysProcAttr.Setctty = true - cmd.SysProcAttr.Ctty = fd // Ideally we would set the sandbox stdin to this process' // stdin, but for some reason Docker does not like that (it @@ -378,6 +373,13 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Stdout = tty cmd.Stderr = tty + // Set the TTY as a controlling TTY on the sandbox process. 
+ // Note that the Ctty field must be the FD of the TTY in the + // *new* process, not this process. Since we set the TTY to + // stdin, we can use FD 0 here. + cmd.SysProcAttr.Setctty = true + cmd.SysProcAttr.Ctty = 0 + // Pass the tty as all stdio fds to sandbox. for i := 0; i < 3; i++ { cmd.ExtraFiles = append(cmd.ExtraFiles, tty) -- cgit v1.2.3 From e21ba16d9cf7ba4f2d5f65651e06ab592032ef86 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 10 Oct 2018 16:49:40 -0700 Subject: Removes irrelevant TODO. PiperOrigin-RevId: 216616873 Change-Id: I4d974ab968058eadd01542081e18a987ef08f50a --- runsc/boot/loader.go | 3 --- 1 file changed, 3 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 859446344..60b278295 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -501,9 +501,6 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config caps, l.k.RootUserNamespace()) - // TODO New containers should be started in new PID namespaces - // when indicated by the spec. - procArgs, err := newProcess(cid, spec, creds, l.k) if err != nil { return fmt.Errorf("failed to create new process: %v", err) -- cgit v1.2.3 From f413e4b11794cd71cc3b2b64c8f6861f5394a3f1 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 11 Oct 2018 11:55:45 -0700 Subject: Add bare bones unsupported syscall logging This change introduces a new flags to create/run called --user-log. Logs to this files are visible to users and are meant to help debugging problems with their images and containers. For now only unsupported syscalls are sent to this log, and only minimum support was added. We can build more infrastructure around it as needed. PiperOrigin-RevId: 216735977 Change-Id: I54427ca194604991c407d49943ab3680470de2d0 --- runsc/boot/BUILD | 6 +++ runsc/boot/compat.go | 76 +++++++++++++++++++++++++++ runsc/boot/loader.go | 6 +++ runsc/cmd/boot.go | 5 ++ runsc/cmd/capability_test.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/create.go | 10 +++- runsc/cmd/run.go | 2 +- runsc/container/container.go | 8 +-- runsc/container/container_test.go | 93 ++++++++++++++++++++++----------- runsc/container/multi_container_test.go | 2 +- runsc/container/test_app.go | 36 +++++++++++++ runsc/sandbox/sandbox.go | 18 +++++-- 13 files changed, 223 insertions(+), 43 deletions(-) create mode 100644 runsc/boot/compat.go (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index c1e035d3b..f8f848ebf 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "boot", srcs = [ + "compat.go", "config.go", "controller.go", "debug.go", @@ -21,12 +22,15 @@ go_library( "//runsc:__subpackages__", ], deps = [ + "//pkg/abi", "//pkg/abi/linux", "//pkg/control/server", "//pkg/cpuid", + "//pkg/eventchannel", "//pkg/log", "//pkg/rand", "//pkg/sentry/arch", + "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context", "//pkg/sentry/control", "//pkg/sentry/fs", @@ -55,6 +59,7 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/state", "//pkg/sentry/strace", + "//pkg/sentry/syscalls:unimplemented_syscall_go_proto", "//pkg/sentry/syscalls/linux", "//pkg/sentry/time", "//pkg/sentry/usage", @@ -74,6 +79,7 @@ go_library( "//pkg/urpc", "//runsc/boot/filter", "//runsc/specutils", + "@com_github_golang_protobuf//proto:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go 
new file mode 100644 index 000000000..3250cdcdc --- /dev/null +++ b/runsc/boot/compat.go @@ -0,0 +1,76 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "os" + + "github.com/golang/protobuf/proto" + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/strace" + spb "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto" +) + +func initCompatLogs(fd int) error { + ce, err := newCompatEmitter(fd) + if err != nil { + return err + } + eventchannel.AddEmitter(ce) + return nil +} + +type compatEmitter struct { + sink *log.BasicLogger + nameMap strace.SyscallMap +} + +func newCompatEmitter(logFD int) (*compatEmitter, error) { + // Always logs to default logger. + nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64) + if !ok { + return nil, fmt.Errorf("amd64 Linux syscall table not found") + } + c := &compatEmitter{sink: log.Log(), nameMap: nameMap} + + if logFD > 0 { + f := os.NewFile(uintptr(logFD), "user log file") + target := log.MultiEmitter{c.sink, log.GoogleEmitter{&log.Writer{Next: f}}} + c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} + } + return c, nil +} + +// Emit implements eventchannel.Emitter. +func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { + // Only interested in UnimplementedSyscall, skip the rest. + if us, ok := msg.(*spb.UnimplementedSyscall); ok { + regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 + sysnr := regs.OrigRax + c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) + } + return false, nil +} + +// Close implements eventchannel.Emitter. +func (c *compatEmitter) Close() error { + c.sink = nil + return nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 60b278295..0a3f67774 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -163,6 +163,8 @@ type Args struct { // TotalMem is the initial amount of total memory to report back to the // container. TotalMem uint64 + // UserLogFD is the file descriptor to write user logs to. + UserLogFD int } // New initializes a new kernel loader configured by spec. @@ -313,6 +315,10 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("failed to create root process: %v", err) } + if err := initCompatLogs(args.UserLogFD); err != nil { + return nil, fmt.Errorf("init compat logs: %v", err) + } + l := &Loader{ k: k, ctrl: ctrl, diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index d26e92bcd..023b63dc0 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -66,6 +66,9 @@ type Boot struct { // totalMem sets the initial amount of total memory to report back to the // container. totalMem uint64 + + // userLogFD is the file descriptor to write user logs to. 
+ userLogFD int } // Name implements subcommands.Command.Name. @@ -95,6 +98,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") + f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -163,6 +167,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Console: b.console, NumCPU: b.cpuNum, TotalMem: b.totalMem, + UserLogFD: b.userLogFD, } l, err := boot.New(bootArgs) if err != nil { diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index be9ef2e7b..3329b308d 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -97,7 +97,7 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index d074b8617..023ab2455 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -133,7 +133,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("error destroying container: %v", err) } - cont, err = container.Create(id, spec, conf, bundleDir, "", "") + cont, err = container.Create(id, spec, conf, bundleDir, "", "", "") if err != nil { Fatalf("error restoring container: %v", err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 38ae03e7a..ecd76ee93 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -41,6 +41,13 @@ type Create struct { // pseudoterminal. This is ignored unless spec.Process.Terminal is // true. consoleSocket string + + // userLog is the path to send user-visible logs to. This log is different + // from debug logs. The former is meant to be consumed by the users and should + // contain only information that is relevant to the person running the + // container, e.g. unsuported syscalls, while the later is more verbose and + // consumed by developers. + userLog string } // Name implements subcommands.Command.Name. @@ -64,6 +71,7 @@ func (c *Create) SetFlags(f *flag.FlagSet) { f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") f.StringVar(&c.pidFile, "pid-file", "", "filename that the container pid will be written to") + f.StringVar(&c.userLog, "user-log", "", "filename to send user-visible logs to. Empty means no logging.") } // Execute implements subcommands.Command.Execute. @@ -90,7 +98,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} // Create the container. A new sandbox will be created for the // container unless the metadata specifies that it should be run in an // existing container. 
- if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile); err != nil { + if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, c.userLog); err != nil { Fatalf("error creating container: %v", err) } return subcommands.ExitSuccess diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 92aa6bc40..826e6e875 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -75,7 +75,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s specutils.LogSpec(spec) conf.SpecFile = filepath.Join(bundleDir, "config.json") - ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile) + ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog) if err != nil { Fatalf("error running container: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index eaa62daf1..10108db5a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -230,7 +230,7 @@ func List(rootDir string) ([]string, error) { // Create creates the container in a new Sandbox process, unless the metadata // indicates that an existing Sandbox should be used. The caller must call // Destroy() on the container. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (*Container, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string) (*Container, error) { log.Debugf("Create container %q in root dir: %s", id, conf.RootDir) if err := validateID(id); err != nil { return nil, err @@ -278,7 +278,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Start a new sandbox for this container. Any errors after this point // must destroy the container. - c.Sandbox, err = sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) + c.Sandbox, err = sandbox.Create(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles) if err != nil { return nil, err } @@ -396,9 +396,9 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str } // Run is a helper that calls Create + Start + Wait. -func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (syscall.WaitStatus, error) { +func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string) (syscall.WaitStatus, error) { log.Debugf("Run container %q in root dir: %s", id, conf.RootDir) - c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile) + c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, userLog) if err != nil { return 0, fmt.Errorf("error creating container: %v", err) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 84b59ffd8..7ea99d06b 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -210,18 +210,9 @@ func run(spec *specs.Spec, conf *boot.Config) error { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. 
- c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { - return fmt.Errorf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - return fmt.Errorf("error starting container: %v", err) - } - - ws, err := c.Wait() - if err != nil { - return fmt.Errorf("error waiting on container: %v", err) + return fmt.Errorf("running container: %v", err) } if !ws.Exited() || ws.ExitStatus() != 0 { return fmt.Errorf("container failed, waitStatus: %v", ws) @@ -299,7 +290,7 @@ func TestLifecycle(t *testing.T) { } // Create the container. id := testutil.UniqueContainerID() - c, err := Create(id, spec, conf, bundleDir, "", "") + c, err := Create(id, spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -420,7 +411,7 @@ func TestExePath(t *testing.T) { t.Fatalf("exec: %s, error setting up container: %v", test.path, err) } - ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") os.RemoveAll(rootDir) os.RemoveAll(bundleDir) @@ -453,7 +444,7 @@ func TestAppExitStatus(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - ws, err := Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir, "", "") + ws, err := Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error running container: %v", err) } @@ -472,7 +463,7 @@ func TestAppExitStatus(t *testing.T) { defer os.RemoveAll(rootDir2) defer os.RemoveAll(bundleDir2) - ws, err = Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir2, "", "") + ws, err = Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir2, "", "", "") if err != nil { t.Fatalf("error running container: %v", err) } @@ -497,7 +488,7 @@ func TestExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -603,7 +594,7 @@ func TestCheckpointRestore(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -649,7 +640,7 @@ func TestCheckpointRestore(t *testing.T) { defer outputFile2.Close() // Restore into a new container. - cont2, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont2, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -688,7 +679,7 @@ func TestCheckpointRestore(t *testing.T) { defer outputFile3.Close() // Restore into a new container. - cont3, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont3, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -767,7 +758,7 @@ func TestUnixDomainSockets(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. 
- cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -814,7 +805,7 @@ func TestUnixDomainSockets(t *testing.T) { defer outputFile2.Close() // Restore into a new container. - contRestore, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + contRestore, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -868,7 +859,7 @@ func TestPauseResume(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -973,7 +964,7 @@ func TestPauseResumeStatus(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1037,7 +1028,7 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1138,7 +1129,7 @@ func TestConsoleSocket(t *testing.T) { // Create the container and pass the socket name. id := testutil.UniqueContainerID() - c, err := Create(id, spec, conf, bundleDir, socketRelPath, "") + c, err := Create(id, spec, conf, bundleDir, socketRelPath, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1262,7 +1253,7 @@ func TestReadonlyRoot(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1306,7 +1297,7 @@ func TestReadonlyMount(t *testing.T) { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1349,7 +1340,7 @@ func TestAbbreviatedIDs(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - cont, err := Create(cid, spec, conf, bundleDir, "", "") + cont, err := Create(cid, spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1409,7 +1400,7 @@ func TestContainerVolumeContentsShared(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. 
- c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1531,7 +1522,7 @@ func TestGoferExits(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1591,7 +1582,7 @@ func TestJobControlSignalExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -1694,6 +1685,46 @@ func TestJobControlSignalExec(t *testing.T) { } } +func TestUserLog(t *testing.T) { + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // sched_rr_get_interval = 148 - not implemented in gvisor. + spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148") + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + dir, err := ioutil.TempDir(testutil.TmpDir(), "user_log_test") + if err != nil { + t.Fatalf("error creating tmp dir: %v", err) + } + userLog := filepath.Join(dir, "user.log") + + // Create, start and wait for the container. + ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", userLog) + if err != nil { + t.Fatalf("error running container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("container failed, waitStatus: %v", ws) + } + + out, err := ioutil.ReadFile(userLog) + if err != nil { + t.Fatalf("error opening user log file %q: %v", userLog, err) + } + if want := "Unsupported syscall: sched_rr_get_interval"; !strings.Contains(string(out), want) { + t.Errorf("user log file doesn't contain %q, out: %s", want, string(out)) + } +} + // executeSync synchronously executes a new process. 
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index d23d36c37..77f8da8b0 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -83,7 +83,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C } bundles = append(bundles, bundleDir) - cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + cont, err := Create(ids[i], spec, conf, bundleDir, "", "", "") if err != nil { cleanup() return nil, nil, fmt.Errorf("error creating container: %v", err) diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index f69cfdf83..9e4b5326d 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -24,6 +24,7 @@ import ( "os" "os/exec" "strconv" + sys "syscall" "time" "flag" @@ -38,6 +39,7 @@ func main() { subcommands.Register(new(taskTree), "") subcommands.Register(new(forkBomb), "") subcommands.Register(new(reaper), "") + subcommands.Register(new(syscall), "") flag.Parse() @@ -241,3 +243,37 @@ func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface defer stop() select {} } + +type syscall struct { + sysno uint64 +} + +// Name implements subcommands.Command. +func (*syscall) Name() string { + return "syscall" +} + +// Synopsis implements subcommands.Command. +func (*syscall) Synopsis() string { + return "syscall makes a syscall" +} + +// Usage implements subcommands.Command. +func (*syscall) Usage() string { + return "syscall " +} + +// SetFlags implements subcommands.Command. +func (s *syscall) SetFlags(f *flag.FlagSet) { + f.Uint64Var(&s.sysno, "syscall", 0, "syscall to call") +} + +// Execute implements subcommands.Command. +func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if _, _, errno := sys.Syscall(uintptr(s.sysno), 0, 0, 0); errno != 0 { + fmt.Printf("syscall(%d, 0, 0...) failed: %v\n", s.sysno, errno) + } else { + fmt.Printf("syscall(%d, 0, 0...) success\n", s.sysno) + } + return subcommands.ExitSuccess +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 7f1afc34b..37a3efd09 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -66,7 +66,7 @@ type Sandbox struct { // Create creates the sandbox process. The caller must call Destroy() on the // sandbox. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} c := specutils.MakeCleanup(func() { s.destroy() }) defer c.Clean() @@ -81,7 +81,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } // Create the sandbox process. - if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, ioFiles); err != nil { + if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles); err != nil { return nil, err } @@ -266,7 +266,7 @@ func (s *Sandbox) connError(err error) error { // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. 
-func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -525,6 +525,18 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } } + if userLog != "" { + f, err := os.OpenFile(userLog, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) + if err != nil { + return fmt.Errorf("opening compat log file: %v", err) + } + defer f.Close() + + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--user-log-fd", strconv.Itoa(nextFD)) + nextFD++ + } + // Add container as the last argument. cmd.Args = append(cmd.Args, s.ID) -- cgit v1.2.3 From d40d80106988e9302aaa354d4f58caa6c31429b4 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 11 Oct 2018 11:56:42 -0700 Subject: Sandbox cgroup tests Verify that cgroup is being properly set. PiperOrigin-RevId: 216736137 Change-Id: I0e27fd604eca67e7dd2e3548dc372ca9cc416309 --- runsc/cgroup/cgroup.go | 2 + runsc/test/integration/integration_test.go | 32 ------- runsc/test/root/BUILD | 5 +- runsc/test/root/cgroup_test.go | 140 +++++++++++++++++++++++++++++ runsc/test/testutil/docker.go | 9 ++ 5 files changed, 155 insertions(+), 33 deletions(-) create mode 100644 runsc/test/root/cgroup_test.go (limited to 'runsc') diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 6a0092be8..af0252bb3 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -123,6 +123,8 @@ func fillFromAncestor(path string) (string, error) { return val, nil } +// countCpuset returns the number of CPU in a string formatted like: +// "0-2,7,12-14 # bits 0, 1, 2, 7, 12, 13, and 14 set" - man 7 cpuset func countCpuset(cpuset string) (int, error) { var count int for _, p := range strings.Split(cpuset, ",") { diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 5480c5bbe..e93171b8a 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -231,38 +231,6 @@ func TestNumCPU(t *testing.T) { } } -// TestCgroup sets cgroup options and checks that container can start. -// TODO: Verify that these were set to cgroup on the host. 
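The countCpuset comment added above documents the cpuset list format from man 7 cpuset, but the hunk cuts off before the rest of the function body. As a rough sketch only (a hypothetical parser, not the elided gVisor implementation), counting the CPUs named by such a string can be done like this:

    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    // countCPUs counts the CPUs named by a cpuset list string such as
    // "0-2,7,12-14" (see man 7 cpuset). Illustrative sketch only.
    func countCPUs(cpuset string) (int, error) {
        var count int
        for _, p := range strings.Split(strings.TrimSpace(cpuset), ",") {
            interval := strings.Split(p, "-")
            switch len(interval) {
            case 1: // single CPU, e.g. "7"
                if _, err := strconv.Atoi(interval[0]); err != nil {
                    return 0, fmt.Errorf("invalid cpuset entry %q: %v", p, err)
                }
                count++
            case 2: // range, e.g. "0-2"
                start, err := strconv.Atoi(interval[0])
                if err != nil {
                    return 0, fmt.Errorf("invalid cpuset entry %q: %v", p, err)
                }
                end, err := strconv.Atoi(interval[1])
                if err != nil || end < start {
                    return 0, fmt.Errorf("invalid cpuset entry %q", p)
                }
                count += end - start + 1
            default:
                return 0, fmt.Errorf("invalid cpuset entry %q", p)
            }
        }
        return count, nil
    }

    func main() {
        n, err := countCPUs("0-2,7,12-14")
        fmt.Println(n, err) // prints: 7 <nil>
    }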
-func TestCgroup(t *testing.T) { - if err := testutil.Pull("alpine"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := testutil.MakeDocker("cgroup-test") - - var args []string - args = append(args, "--cpu-shares=1000") - args = append(args, "--cpu-period=2000") - args = append(args, "--cpu-quota=3000") - args = append(args, "--cpuset-cpus=0") - args = append(args, "--cpuset-mems=0") - args = append(args, "--kernel-memory=100MB") - args = append(args, "--memory=1GB") - args = append(args, "--memory-reservation=500MB") - args = append(args, "--memory-swap=2GB") - args = append(args, "--memory-swappiness=5") - args = append(args, "--blkio-weight=750") - - args = append(args, "hello-world") - if err := d.Run(args...); err != nil { - t.Fatal("docker create failed:", err) - } - defer d.CleanUp() - - if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { - t.Fatalf("docker didn't say hello: %v", err) - } -} - func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD index dbc0f1d26..c69249b52 100644 --- a/runsc/test/root/BUILD +++ b/runsc/test/root/BUILD @@ -11,7 +11,10 @@ go_library( go_test( name = "root_test", size = "small", - srcs = ["chroot_test.go"], + srcs = [ + "cgroup_test.go", + "chroot_test.go", + ], embed = [":root"], tags = [ # Requires docker and runsc to be configured before the test runs. diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go new file mode 100644 index 000000000..5cb4b794f --- /dev/null +++ b/runsc/test/root/cgroup_test.go @@ -0,0 +1,140 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package root + +import ( + "io/ioutil" + "os" + "path/filepath" + "strings" + "testing" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// TestCgroup sets cgroup options and checks that cgroup was properly configured. 
+func TestCgroup(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + + attrs := []struct { + arg string + ctrl string + file string + want string + skipIfNotFound bool + }{ + { + arg: "--cpu-shares=1000", + ctrl: "cpu", + file: "cpu.shares", + want: "1000", + }, + { + arg: "--cpu-period=2000", + ctrl: "cpu", + file: "cpu.cfs_period_us", + want: "2000", + }, + { + arg: "--cpu-quota=3000", + ctrl: "cpu", + file: "cpu.cfs_quota_us", + want: "3000", + }, + { + arg: "--cpuset-cpus=0", + ctrl: "cpuset", + file: "cpuset.cpus", + want: "0", + }, + { + arg: "--cpuset-mems=0", + ctrl: "cpuset", + file: "cpuset.mems", + want: "0", + }, + { + arg: "--kernel-memory=100MB", + ctrl: "memory", + file: "memory.kmem.limit_in_bytes", + want: "104857600", + }, + { + arg: "--memory=1GB", + ctrl: "memory", + file: "memory.limit_in_bytes", + want: "1073741824", + }, + { + arg: "--memory-reservation=500MB", + ctrl: "memory", + file: "memory.soft_limit_in_bytes", + want: "524288000", + }, + { + arg: "--memory-swap=2GB", + ctrl: "memory", + file: "memory.memsw.limit_in_bytes", + want: "2147483648", + skipIfNotFound: true, // swap may be disabled on the machine. + }, + { + arg: "--memory-swappiness=5", + ctrl: "memory", + file: "memory.swappiness", + want: "5", + }, + { + arg: "--blkio-weight=750", + ctrl: "blkio", + file: "blkio.weight", + want: "750", + }, + } + + args := make([]string, 0, len(attrs)) + for _, attr := range attrs { + args = append(args, attr.arg) + } + + args = append(args, "alpine", "sleep", "10000") + if err := d.Run(args...); err != nil { + t.Fatal("docker create failed:", err) + } + defer d.CleanUp() + + gid, err := d.ID() + if err != nil { + t.Fatalf("Docker.ID() failed: %v", err) + } + t.Logf("cgroup ID: %s", gid) + for _, attr := range attrs { + path := filepath.Join("/sys/fs/cgroup", attr.ctrl, "docker", gid, attr.file) + out, err := ioutil.ReadFile(path) + if err != nil { + if os.IsNotExist(err) && attr.skipIfNotFound { + t.Logf("skipped %s/%s", attr.ctrl, attr.file) + continue + } + t.Fatalf("failed to read %q: %v", path, err) + } + if got := strings.TrimSpace(string(out)); got != attr.want { + t.Errorf("arg: %q, cgroup attribute %s/%s, got: %q, want: %q", attr.arg, attr.ctrl, attr.file, got, attr.want) + } + } +} diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 2f15ab818..8a51d3eed 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -298,6 +298,15 @@ func (d *Docker) RootDirInHost() (string, error) { return strings.TrimSuffix(string(out), "\n"), nil } +// ID returns the container ID. +func (d *Docker) ID() (string, error) { + out, err := do("inspect", "-f={{.Id}}", d.Name) + if err != nil { + return "", fmt.Errorf("error retrieving ID: %v", err) + } + return strings.TrimSpace(string(out)), nil +} + // WaitForOutput calls 'docker logs' to retrieve containers output and searches // for the given pattern. func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { -- cgit v1.2.3 From e68d86e1bd47f7905e4452f7ce0e04e683561f85 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 11 Oct 2018 14:28:15 -0700 Subject: Make debug log file name configurable This is a breaking change if you're using --debug-log-dir. 
The fix is to replace it with --debug-log and add a '/' at the end: --debug-log-dir=/tmp/runsc ==> --debug-log=/tmp/runsc/ PiperOrigin-RevId: 216761212 Change-Id: I244270a0a522298c48115719fa08dad55e34ade1 --- README.md | 2 +- runsc/boot/config.go | 7 +++---- runsc/main.go | 19 ++++++++----------- runsc/sandbox/sandbox.go | 6 +++--- runsc/specutils/specutils.go | 25 +++++++++++++++++++------ runsc/test/install.sh | 2 +- 6 files changed, 35 insertions(+), 26 deletions(-) (limited to 'runsc') diff --git a/README.md b/README.md index d85948ce5..a4fca1f62 100644 --- a/README.md +++ b/README.md @@ -297,7 +297,7 @@ Docker configuration (`/etc/docker/daemon.json`): "runsc": { "path": "/usr/local/bin/runsc", "runtimeArgs": [ - "--debug-log-dir=/tmp/runsc", + "--debug-log=/tmp/runsc/", "--debug", "--strace" ] diff --git a/runsc/boot/config.go b/runsc/boot/config.go index cd977c8a5..41af084b9 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -160,9 +160,8 @@ type Config struct { // LogFormat is the log format, "text" or "json". LogFormat string - // DebugLogDir is the directory to log debug information to, if not - // empty. - DebugLogDir string + // DebugLog is the path to log debug information to, if not empty. + DebugLog string // FileAccess indicates how the filesystem is accessed. FileAccess FileAccessType @@ -217,7 +216,7 @@ func (c *Config) ToFlags() []string { "--debug=" + strconv.FormatBool(c.Debug), "--log=" + c.LogFilename, "--log-format=" + c.LogFormat, - "--debug-log-dir=" + c.DebugLogDir, + "--debug-log=" + c.DebugLog, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), "--network=" + c.Network.String(), diff --git a/runsc/main.go b/runsc/main.go index 16d30f7a0..27aec1cd9 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -45,10 +45,10 @@ var ( // system that are not covered by the runtime spec. // Debugging flags. - debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command") - logPackets = flag.Bool("log-packets", false, "enable network packet logging") - logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") - debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") + debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") + logPackets = flag.Bool("log-packets", false, "enable network packet logging") + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. 
If set, the 'debug-log-dir' flag is ignored.") // Debugging flags: strace related strace = flag.Bool("strace", false, "enable strace") @@ -131,7 +131,7 @@ func main() { Debug: *debug, LogFilename: *logFilename, LogFormat: *logFormat, - DebugLogDir: *debugLogDir, + DebugLog: *debugLog, FileAccess: fsAccess, Overlay: *overlay, Network: netType, @@ -195,13 +195,10 @@ func main() { } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} - } else if *debugLogDir != "" { - if err := os.MkdirAll(*debugLogDir, 0775); err != nil { - cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err) - } - f, err := specutils.DebugLogFile(*debugLogDir, subcommand) + } else if *debugLog != "" { + f, err := specutils.DebugLogFile(*debugLog, subcommand) if err != nil { - cmd.Fatalf("error opening debug log file in %q: %v", *debugLogDir, err) + cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err) } e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 37a3efd09..a0de4a175 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -291,10 +291,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD)) nextFD++ } - if conf.DebugLogDir != "" { - debugLogFile, err := specutils.DebugLogFile(conf.DebugLogDir, "boot") + if conf.DebugLog != "" { + debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot") if err != nil { - return fmt.Errorf("error opening debug log file in %q: %v", conf.DebugLogDir, err) + return fmt.Errorf("error opening debug log file in %q: %v", conf.DebugLog, err) } defer debugLogFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index ac017ba2d..6b3e52021 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -351,12 +351,25 @@ func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) er return backoff.Retry(op, b) } -// DebugLogFile opens a file in logDir based on the timestamp and subcommand -// for writing. -func DebugLogFile(logDir, subcommand string) (*os.File, error) { - // Format: /runsc.log.. - filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), subcommand) - return os.OpenFile(filepath.Join(logDir, filename), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) +// DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern' +// ends with '/', it's used as a directory with default file name. +// 'logPattern' can contain variables that are substitued: +// - %TIMESTAMP%: is replaced with a timestamp using the following format: +// +// - %COMMAND%: is replaced with 'command' +func DebugLogFile(logPattern, command string) (*os.File, error) { + if strings.HasSuffix(logPattern, "/") { + // Default format: /runsc.log.. + logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%" + } + logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1) + logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1) + + dir := filepath.Dir(logPattern) + if err := os.MkdirAll(dir, 0775); err != nil { + return nil, fmt.Errorf("error creating dir %q: %v", dir, err) + } + return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) } // Mount creates the mount point and calls Mount with the given flags. 
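To make the new --debug-log semantics concrete, the substitution that DebugLogFile performs can be pictured with a small standalone sketch; only the replacement logic mirrors the code above, and the example paths are made up:

    package main

    import (
        "fmt"
        "strings"
        "time"
    )

    // expandLogPattern mirrors the rules described for --debug-log: a pattern
    // ending in '/' gets the default file name appended, then %TIMESTAMP% and
    // %COMMAND% are substituted.
    func expandLogPattern(pattern, command string) string {
        if strings.HasSuffix(pattern, "/") {
            pattern += "runsc.log.%TIMESTAMP%.%COMMAND%"
        }
        pattern = strings.Replace(pattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
        return strings.Replace(pattern, "%COMMAND%", command, -1)
    }

    func main() {
        // e.g. /tmp/runsc/runsc.log.20181011-140502.123456.boot
        fmt.Println(expandLogPattern("/tmp/runsc/", "boot"))
        // e.g. /var/log/runsc.create.log
        fmt.Println(expandLogPattern("/var/log/runsc.%COMMAND%.log", "create"))
    }

This is also why the install.sh change below only needs to append a trailing '/' to keep the old per-command log files.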
diff --git a/runsc/test/install.sh b/runsc/test/install.sh index c110d96f9..c239588d4 100755 --- a/runsc/test/install.sh +++ b/runsc/test/install.sh @@ -75,7 +75,7 @@ if [[ ${uninstall} == 0 ]]; then mkdir -p "${logdir}" sudo -n chmod a+wx "${logdir}" - declare -r args="--debug-log-dir "${logdir}" --debug --strace --log-packets" + declare -r args="--debug-log '${logdir}/' --debug --strace --log-packets" sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" ${args} sudo -n "${dockercfg}" runtime-add "${runtime}"-kvm "${runsc}" --platform=kvm ${args} sudo -n "${dockercfg}" runtime-add "${runtime}"-hostnet "${runsc}" --network=host ${args} -- cgit v1.2.3 From ea5f6ed6ecab7f8b2648836117f62629b3c2cbb8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 11 Oct 2018 16:05:44 -0700 Subject: Make Wait() return the sandbox exit status if the sandbox has exited. It's possible for Start() and Wait() calls to race, if the sandboxed application is short-lived. If the application finishes before (or during) the Wait RPC, then Wait will fail. In practice this looks like "connection refused" or "EOF" errors when waiting for an RPC response. This race is especially bad in tests, where we often run "true" inside a sandbox. This CL does a best-effort fix, by returning the sandbox exit status as the container exit status. In most cases, these are the same. This fixes the remaining flakes in runsc/container:container_test. PiperOrigin-RevId: 216777793 Change-Id: I9dfc6e6ec885b106a736055bc7a75b2008dfff7a --- runsc/container/container_test.go | 47 +++++++++++++++++++++++++++++++++++++++ runsc/sandbox/sandbox.go | 24 +++++++++++++++++--- 2 files changed, 68 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 7ea99d06b..94572667e 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1725,6 +1725,53 @@ func TestUserLog(t *testing.T) { } } +func TestWaitOnExitedSandbox(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + // Run a shell that exits immediately with a non-zero code. + const wantExit = 17 + cmd := fmt.Sprintf("exit %d", wantExit) + spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and Start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait for the sandbox to stop running. + if err := testutil.Poll(func() error { + if c.Sandbox.IsRunning() { + return nil + } + return fmt.Errorf("sandbox still running") + }, 10*time.Second); err != nil { + t.Fatalf("error waiting for sandbox to exit: %v", err) + } + + // Now call Wait. + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + + if got := ws.ExitStatus(); got != wantExit { + t.Errorf("got exit status %d, want %d", got, wantExit) + } + } +} + // executeSync synchronously executes a new process. 
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index a0de4a175..39c855db9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -596,10 +596,28 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { } defer conn.Close() - if err := conn.Call(boot.ContainerWait, &cid, &ws); err != nil { - return ws, fmt.Errorf("error waiting on container %q: %v", cid, err) + // First try the Wait RPC to the sandbox. + if err := conn.Call(boot.ContainerWait, &cid, &ws); err == nil { + return ws, nil } - return ws, nil + log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) + + // The sandbox may have already exited, or exited while handling the + // Wait RPC. The best we can do is ask Linux what the sandbox exit + // status was, since in most cases that will be the same as the + // container exit status. + p, err := os.FindProcess(s.Pid) + if err != nil { + // "On Unix systems, FindProcess always succeeds and returns a + // Process for the given pid, regardless of whether the process + // exists." + return ws, fmt.Errorf("FindProcess(%d) failed: %v", s.Pid, err) + } + ps, err := p.Wait() + if err != nil { + return ws, fmt.Errorf("sandbox no longer running, tried to get exit status, but Wait failed: %v", err) + } + return ps.Sys().(syscall.WaitStatus), nil } // WaitPID waits for process 'pid' in the container's sandbox and returns its -- cgit v1.2.3 From 3bc5e6482b110a03651abcfb02c93eef8a7ee90f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 11 Oct 2018 16:22:44 -0700 Subject: Fix reference leak in tests. PiperOrigin-RevId: 216780438 Change-Id: Ide637fe36f8d2a61fea9e5b16d1b3401f2540416 --- runsc/boot/loader_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 10efa4427..7d35dcae2 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -398,8 +398,10 @@ func TestCreateMountNamespace(t *testing.T) { root := mm.Root() defer root.DecRef() for _, p := range tc.expectedPaths { - if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + if d, err := mm.FindInode(ctx, root, root, p, 0); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() } } }) -- cgit v1.2.3 From f074f0c2c77c4aec24700a49ebcbca1a7f2285e0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 11 Oct 2018 17:44:50 -0700 Subject: Make the gofer process enter namespaces This is done to further isolate the gofer from the host. PiperOrigin-RevId: 216790991 Change-Id: Ia265b77e4e50f815d08f743a05669f9d75ad7a6f --- runsc/container/container.go | 12 +++++++++++- runsc/sandbox/sandbox.go | 17 ++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 10108db5a..37e607bed 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -726,11 +726,21 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund cmd := exec.Command(binPath, args...) cmd.ExtraFiles = goferEnds + // Enter new namespaces to isolate from the rest of the system. Don't unshare + // cgroup because gofer is added to a cgroup in the caller's namespace. 
+ nss := []specs.LinuxNamespace{ + {Type: specs.IPCNamespace}, + {Type: specs.MountNamespace}, + {Type: specs.NetworkNamespace}, + {Type: specs.PIDNamespace}, + {Type: specs.UTSNamespace}, + } + // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the // users in the sandbox. + nss = append(nss, specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)...) specutils.SetUIDGIDMappings(cmd, spec) - nss := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 39c855db9..6c1b39be7 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -408,12 +408,14 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.SysProcAttr.Setsid = true // nss is the set of namespaces to join or create before starting the sandbox - // process. IPC and UTS namespaces from the host are not used as they + // process. Mount, IPC and UTS namespaces from the host are not used as they // are virtualized inside the sandbox. Be paranoid and run inside an empty - // namespace for these. - log.Infof("Sandbox will be started in new IPC and UTS namespaces") + // namespace for these. Don't unshare cgroup because sandbox is added to a + // cgroup in the caller's namespace. + log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces") nss := []specs.LinuxNamespace{ {Type: specs.IPCNamespace}, + {Type: specs.MountNamespace}, {Type: specs.UTSNamespace}, } @@ -426,9 +428,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) } - log.Infof("Sandbox will be started in new mount namespace") - nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) - // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the // namespace. @@ -440,9 +439,9 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) } - // User namespace depends on the following options: - // - Host network/filesystem: requires to run inside the user namespace - // specified in the spec or the current namespace if none is configured. + // User namespace depends on the network type. Host network requires to run + // inside the user namespace specified in the spec or the current namespace + // if none is configured. if conf.Network == boot.NetworkHost { if userns, ok := specutils.GetNS(specs.UserNamespace, spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) -- cgit v1.2.3 From a771775f3a4680b3a121deb6f583ed62f4da8bef Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Fri, 12 Oct 2018 12:58:42 -0700 Subject: Added spec command to create OCI spec config.json The spec command is analygous to the 'runc spec' command and allows for the convenient creation of a config.json file for users that don't have runc handy. 
Change-Id: Ifdfec37e023048ea461c32da1a9042a45b37d856 PiperOrigin-RevId: 216907826 --- runsc/cmd/BUILD | 1 + runsc/cmd/spec.go | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ runsc/main.go | 1 + 3 files changed, 184 insertions(+) create mode 100644 runsc/cmd/spec.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 7c90ff2c5..7040eb4ec 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -23,6 +23,7 @@ go_library( "restore.go", "resume.go", "run.go", + "spec.go", "start.go", "state.go", "wait.go", diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go new file mode 100644 index 000000000..6281fc49d --- /dev/null +++ b/runsc/cmd/spec.go @@ -0,0 +1,182 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "io/ioutil" + "os" + "path/filepath" + + "context" + "flag" + "github.com/google/subcommands" +) + +var specTemplate = []byte(`{ + "ociVersion": "1.0.0", + "process": { + "terminal": true, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "sh" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ] + }, + "root": { + "path": "rootfs", + "readonly": true + }, + "hostname": "runsc", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + } + ], + "linux": { + "namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ] + } +}`) + +// Spec implements subcommands.Command for the "spec" command. +type Spec struct { + bundle string +} + +// Name implements subcommands.Command.Name. +func (*Spec) Name() string { + return "spec" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Spec) Synopsis() string { + return "create a new OCI bundle specification file" +} + +// Usage implements subcommands.Command.Usage. +func (*Spec) Usage() string { + return `spec [options] - create a new OCI bundle specification file. + +The spec command creates a new specification file (config.json) for a new OCI bundle. + +The specification file is a starter file that runs the "sh" command in the container. You +should edit the file to suit your needs. 
You can find out more about the format of the +specification file by visiting the OCI runtime spec repository: +https://github.com/opencontainers/runtime-spec/ + +EXAMPLE: + $ mkdir -p bundle/rootfs + $ cd bundle + $ runsc spec + $ docker export $(docker create hello-world) | tar -xf - -C rootfs + $ sed -i 's;"sh";"/hello";' config.json + $ sudo runsc run hello + +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (s *Spec) SetFlags(f *flag.FlagSet) { + f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle") +} + +// Execute implements subcommands.Command.Execute. +func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + confPath := filepath.Join(s.bundle, "config.json") + if _, err := os.Stat(confPath); !os.IsNotExist(err) { + Fatalf("file %q already exists", confPath) + } + + if err := ioutil.WriteFile(confPath, specTemplate, 0664); err != nil { + Fatalf("error writing to %q: %v", confPath, err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/main.go b/runsc/main.go index 27aec1cd9..62b1f01b3 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -86,6 +86,7 @@ func main() { subcommands.Register(new(cmd.Restore), "") subcommands.Register(new(cmd.Resume), "") subcommands.Register(new(cmd.Run), "") + subcommands.Register(new(cmd.Spec), "") subcommands.Register(new(cmd.Start), "") subcommands.Register(new(cmd.State), "") subcommands.Register(new(cmd.Wait), "") -- cgit v1.2.3 From 3f0532595679c388362203bbce1d4b6c4d2e336b Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 15 Oct 2018 11:06:20 -0700 Subject: Never send boot process stdio to application stdio. We treat handle the boot process stdio separately from the application stdio (which gets passed via flags), but we were still sending both to same place. As a result, some logs that are written directly to os.Stderr by the boot process were ending up in the application logs. This CL starts sendind boot process stdio to the null device (since we don't have any better options). The boot process is already configured to send all logs (and panics) to the log file, so we won't miss anything important. PiperOrigin-RevId: 217173020 Change-Id: I5ab980da037f34620e7861a3736ba09c18d73794 --- runsc/sandbox/sandbox.go | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 6c1b39be7..be68e864f 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -351,6 +351,15 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } + // The current process' stdio must be passed to the application via the + // --stdio-fds flag. The stdio of the sandbox process itself must not + // be connected to the same FDs, otherwise we risk leaking sandbox + // errors to the application, so we set the sandbox stdio to nil, + // causing them to read/write from the null device. + cmd.Stdin = nil + cmd.Stdout = nil + cmd.Stderr = nil + // If the console control socket file is provided, then create a new // pty master/slave pair and set the TTY on the sandbox process. if consoleSocket != "" { @@ -364,21 +373,13 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } defer tty.Close() - // Ideally we would set the sandbox stdin to this process' - // stdin, but for some reason Docker does not like that (it - // never calls `runsc start`). 
Instead we set stdio to the - // console TTY, but note that this is distinct from the - // container stdio, which is passed via the flags below. - cmd.Stdin = tty - cmd.Stdout = tty - cmd.Stderr = tty - // Set the TTY as a controlling TTY on the sandbox process. // Note that the Ctty field must be the FD of the TTY in the - // *new* process, not this process. Since we set the TTY to + // *new* process, not this process. Since we are about to + // assign the TTY to nextFD, we can use that value here. // stdin, we can use FD 0 here. cmd.SysProcAttr.Setctty = true - cmd.SysProcAttr.Ctty = 0 + cmd.SysProcAttr.Ctty = nextFD // Pass the tty as all stdio fds to sandbox. for i := 0; i < 3; i++ { @@ -386,14 +387,15 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) nextFD++ } - } else { - // Connect the sandbox process to this process's stdios. Note - // that this is distinct from the container's stdio, which is - // passed by the flags below. - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr + if conf.Debug { + // If debugging, send the boot process stdio to the + // TTY, so that it is easier to find. + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + } + } else { // If not using a console, pass our current stdio as the // container stdio via flags. for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { @@ -401,6 +403,14 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) nextFD++ } + + if conf.Debug { + // If debugging, send the boot process stdio to the + // this process' stdio, so that is is easier to find. + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } } // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT -- cgit v1.2.3 From cea51641d4dacf24cc53a30144fc14ec053f5aa2 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 16 Oct 2018 20:33:20 -0700 Subject: Bump sandbox start and stop timeouts. PiperOrigin-RevId: 217433699 Change-Id: Icef08285728c23ee7dd650706aaf18da51c25dff --- runsc/container/BUILD | 2 -- runsc/container/container.go | 2 +- runsc/sandbox/sandbox.go | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index bf8b9a2ab..60f1d3033 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -40,8 +40,6 @@ go_test( ], embed = [":container"], tags = [ - # FIXME - "flaky", "requires-kvm", ], deps = [ diff --git a/runsc/container/container.go b/runsc/container/container.go index 37e607bed..774cb6e07 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -658,7 +658,7 @@ func (c *Container) stop() error { } func (c *Container) waitForStopped() error { - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index be68e864f..713b326a6 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -86,7 +86,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } // Wait for the control server to come up (or timeout). 
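One subtle point in the console handling of createSandboxProcess above is the SysProcAttr.Ctty value: it must be the FD number of the TTY in the child's FD table, which is why the hunk sets it to nextFD (the slot the TTY occupies via ExtraFiles). A minimal, self-contained sketch of the same idea, with a hypothetical helper name:

    package ttysketch

    import (
        "os"
        "os/exec"
        "syscall"
    )

    // startWithCtty starts bin with tty as its controlling terminal.
    // ExtraFiles[i] becomes FD 3+i in the child, so with a single extra file
    // the controlling TTY is FD 3 in the child, regardless of its FD number
    // in the parent process.
    func startWithCtty(tty *os.File, bin string, args ...string) (*exec.Cmd, error) {
        cmd := exec.Command(bin, args...)
        cmd.ExtraFiles = []*os.File{tty} // FD 3 in the child
        cmd.SysProcAttr = &syscall.SysProcAttr{
            Setsid:  true, // new session, required to acquire a controlling TTY
            Setctty: true,
            Ctty:    3, // FD of the TTY in the *child*, not in this process
        }
        return cmd, cmd.Start()
    }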
- if err := s.waitForCreated(10 * time.Second); err != nil { + if err := s.waitForCreated(20 * time.Second); err != nil { return nil, err } -- cgit v1.2.3 From 4fae756645cf11a9f8bae87dd845d82dce5e428e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 17 Oct 2018 09:01:42 -0700 Subject: Make removing cgroups retry up to 5 seconds. Sometimes if we try to remove the cgroup directory too soon after killing the sandbox we EBUSY. This CL adds a retry (up to 5 seconds) for removing. Deflakes ChrootTest. PiperOrigin-RevId: 217526909 Change-Id: I749bb172117e2298c9888ecad094072393b94810 --- runsc/cgroup/BUILD | 1 + runsc/cgroup/cgroup.go | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index 4a535d230..10a8e5feb 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -12,6 +12,7 @@ go_library( deps = [ "//pkg/log", "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", ], ) diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index af0252bb3..7a75a189a 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -17,6 +17,7 @@ package cgroup import ( + "context" "fmt" "io/ioutil" "os" @@ -24,7 +25,9 @@ import ( "strconv" "strings" "syscall" + "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -214,8 +217,19 @@ func (c *Cgroup) Uninstall() error { } log.Debugf("Deleting cgroup %q", c.Name) for key := range controllers { - if err := syscall.Rmdir(c.makePath(key)); err != nil && !os.IsNotExist(err) { - return err + path := c.makePath(key) + log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) + + // If we try to remove the cgroup too soon after killing the + // sandbox we might get EBUSY, so we retry for a few seconds + // until it succeeds. + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + if err := backoff.Retry(func() error { + return syscall.Rmdir(path) + }, b); err != nil { + return fmt.Errorf("error removing cgroup path %q: %v", path, err) } } return nil -- cgit v1.2.3 From bdcf8d143ef33d190fd92fcf7343bba06c3dba1f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 17 Oct 2018 09:08:45 -0700 Subject: Bump Pause/Resume integration test timeout in attempt to deflake Kokoro. This is one of the many tests that fails periodically, making Kokoro unstable. PiperOrigin-RevId: 217528257 Change-Id: I2508ecf4d74d71b91feff1183544d61d7bd16995 --- runsc/test/integration/integration_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index e93171b8a..b7d07309d 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -106,7 +106,7 @@ func TestPauseResume(t *testing.T) { } // Wait until it's up and running. 
- if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { t.Fatal("WaitForHTTP() timeout:", err) } -- cgit v1.2.3 From ba33a70e47492f9cc8e3550ed795c892553ac1d4 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 17 Oct 2018 09:30:11 -0700 Subject: Attempt to deflake TestPythonHello It has timed out running with kokoro a few times. I passes consistently on my machine (200+ runsc). Increase the timeout to see if it helps. Failure: image_test.go:212: WaitForHTTP() timeout: Get http://localhost:32785/: dial tcp [::1]:32785: connect: connection refused PiperOrigin-RevId: 217532428 Change-Id: Ibf860aecf537830bef832e436f2e804b3fc12f2d --- runsc/test/image/image_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 428f05c04..40c92e1c0 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -208,7 +208,7 @@ func TestPythonHello(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } -- cgit v1.2.3 From 9b3550f70bf1612e2c474b3826b0347b21503401 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 17 Oct 2018 10:50:24 -0700 Subject: runsc: Add --pid flag to runsc kill. --pid allows specific processes to be signalled rather than the container root process or all processes in the container. containerd needs to SIGKILL exec'd processes that timeout and check whether processes are still alive. PiperOrigin-RevId: 217547636 Change-Id: I2058ebb548b51c8eb748f5884fb88bad0b532e45 --- runsc/boot/loader.go | 16 ++++++++- runsc/cmd/kill.go | 17 ++++++++-- runsc/container/container.go | 32 ++++++++++++------ runsc/container/container_test.go | 58 ++++++++++++++++++++++++++++++++- runsc/container/multi_container_test.go | 12 +++---- runsc/container/test_app.go | 10 +++++- 6 files changed, 124 insertions(+), 21 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0a3f67774..fa169d090 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -756,8 +756,22 @@ func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess boo ep, ok := l.processes[eid] l.mu.Unlock() + // The caller may be signaling a process not started directly via exec. + // In this case, find the process in the container's PID namespace and + // signal it. if !ok { - return fmt.Errorf("failed to signal container %q PID %d: no such PID", cid, pid) + ep, ok := l.processes[execID{cid: cid}] + if !ok { + return fmt.Errorf("no container with ID: %q", cid) + } + tg := ep.tg.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) + if tg == nil { + return fmt.Errorf("failed to signal container %q PID %d: no such process", cid, pid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", pid, tg.Leader().ContainerID()) + } + return tg.SendSignal(&arch.SignalInfo{Signo: signo}) } if !sendToFGProcess { diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index dcb2988e3..7a98d10a2 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -31,6 +31,7 @@ import ( // Kill implements subcommands.Command for the "kill" command. type Kill struct { all bool + pid int } // Name implements subcommands.Command.Name. 
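The new --pid flag wires through to Container.SignalProcess, added below in container.go. A hedged sketch of how a caller (for example, shim code that needs to kill a timed-out exec'd process) might use that API directly, with placeholder paths and IDs:

    package main

    import (
        "log"
        "syscall"

        "gvisor.googlesource.com/gvisor/runsc/container"
    )

    func main() {
        // Placeholder root directory and container ID; real values come from
        // the runsc configuration (conf.RootDir) and the container being
        // managed.
        c, err := container.Load("/var/run/runsc", "my-container")
        if err != nil {
            log.Fatalf("loading container: %v", err)
        }
        // Kill a single exec'd process (PID 123 in the container's PID
        // namespace) rather than signaling the whole container.
        if err := c.SignalProcess(syscall.SIGKILL, 123); err != nil {
            log.Fatalf("signaling process 123: %v", err)
        }
    }

On the command line this presumably surfaces as "runsc kill --pid <n> <container-id> <signal>", mirroring the existing kill argument order.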
@@ -51,6 +52,7 @@ func (*Kill) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (k *Kill) SetFlags(f *flag.FlagSet) { f.BoolVar(&k.all, "all", false, "send the specified signal to all processes inside the container") + f.IntVar(&k.pid, "pid", 0, "send the specified signal to a specific process") } // Execute implements subcommands.Command.Execute. @@ -63,6 +65,10 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) id := f.Arg(0) conf := args[0].(*boot.Config) + if k.pid != 0 && k.all { + Fatalf("it is invalid to specify both --all and --pid") + } + c, err := container.Load(conf.RootDir, id) if err != nil { Fatalf("error loading container: %v", err) @@ -80,8 +86,15 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("%v", err) } - if err := c.Signal(sig, k.all); err != nil { - Fatalf("%v", err) + + if k.pid != 0 { + if err := c.SignalProcess(sig, int32(k.pid)); err != nil { + Fatalf("failed to signal pid %d: %v", k.pid, err) + } + } else { + if err := c.SignalContainer(sig, k.all); err != nil { + Fatalf("%v", err) + } } return subcommands.ExitSuccess } diff --git a/runsc/container/container.go b/runsc/container/container.go index 774cb6e07..0ec4d03c1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -174,7 +174,7 @@ func Load(rootDir, id string) (*Container, error) { } else if c.Status == Running { // Container state should reflect the actual state of the application, so // we don't consider gofer process here. - if err := c.Signal(syscall.Signal(0), false); err != nil { + if err := c.SignalContainer(syscall.Signal(0), false); err != nil { c.changeStatus(Stopped) } } @@ -445,7 +445,7 @@ func (c *Container) SandboxPid() int { func (c *Container) Wait() (syscall.WaitStatus, error) { log.Debugf("Wait on container %q", c.ID) if !c.isSandboxRunning() { - return 0, fmt.Errorf("container is not running") + return 0, fmt.Errorf("sandbox is not running") } return c.Sandbox.Wait(c.ID) } @@ -455,7 +455,7 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID) if !c.isSandboxRunning() { - return 0, fmt.Errorf("container is not running") + return 0, fmt.Errorf("sandbox is not running") } return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus) } @@ -465,16 +465,16 @@ func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on PID %d in container %q", pid, c.ID) if !c.isSandboxRunning() { - return 0, fmt.Errorf("container is not running") + return 0, fmt.Errorf("sandbox is not running") } return c.Sandbox.WaitPID(c.ID, pid, clearStatus) } -// Signal sends the signal to the container. If all is true and signal is -// SIGKILL, then waits for all processes to exit before returning. -// Signal returns an error if the container is already stopped. +// SignalContainer sends the signal to the container. If all is true and signal +// is SIGKILL, then waits for all processes to exit before returning. +// SignalContainer returns an error if the container is already stopped. // TODO: Distinguish different error types. 
-func (c *Container) Signal(sig syscall.Signal, all bool) error { +func (c *Container) SignalContainer(sig syscall.Signal, all bool) error { log.Debugf("Signal container %q: %v", c.ID, sig) // Signaling container in Stopped state is allowed. When all=false, // an error will be returned anyway; when all=true, this allows @@ -485,11 +485,23 @@ func (c *Container) Signal(sig syscall.Signal, all bool) error { return err } if !c.isSandboxRunning() { - return fmt.Errorf("container is not running") + return fmt.Errorf("sandbox is not running") } return c.Sandbox.SignalContainer(c.ID, sig, all) } +// SignalProcess sends sig to a specific process in the container. +func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error { + log.Debugf("Signal process %d in container %q: %v", pid, c.ID, sig) + if err := c.requireStatus("signal a process inside", Running); err != nil { + return err + } + if !c.isSandboxRunning() { + return fmt.Errorf("sandbox is not running") + } + return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false) +} + // ForwardSignals forwards all signals received by the current process to the // container process inside the sandbox. It returns a function that will stop // forwarding signals. @@ -663,7 +675,7 @@ func (c *Container) waitForStopped() error { b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { if c.isSandboxRunning() { - if err := c.Signal(syscall.Signal(0), false); err == nil { + if err := c.SignalContainer(syscall.Signal(0), false); err == nil { return fmt.Errorf("container is still running") } } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 94572667e..d9cd38c0a 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -354,7 +354,7 @@ func TestLifecycle(t *testing.T) { <-ch time.Sleep(100 * time.Millisecond) // Send the container a SIGTERM which will cause it to stop. - if err := c.Signal(syscall.SIGTERM, false); err != nil { + if err := c.SignalContainer(syscall.SIGTERM, false); err != nil { t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) } // Wait for it to die. @@ -559,6 +559,62 @@ func TestExec(t *testing.T) { } } +// TestKillPid verifies that we can signal individual exec'd processes. +func TestKillPid(t *testing.T) { + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + const nProcs = 4 + spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true") + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Verify that all processes are running. + if err := waitForProcessCount(cont, nProcs); err != nil { + t.Fatalf("timed out waiting for processes to start: %v", err) + } + + // Kill the child process with the largest PID. 
+ procs, err := cont.Processes() + if err != nil { + t.Fatalf("failed to get process list: %v", err) + } + var pid int32 + for _, p := range procs { + if pid < int32(p.PID) { + pid = int32(p.PID) + } + } + if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil { + t.Fatalf("failed to signal process %d: %v", pid, err) + } + + // Verify that one process is gone. + if err := waitForProcessCount(cont, nProcs-1); err != nil { + t.Fatal(err) + } + } +} + // TestCheckpointRestore creates a container that continuously writes successive integers // to a file. To test checkpoint and restore functionality, the container is // checkpointed and the last number printed to the file is recorded. Then, it is restored in two diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 77f8da8b0..1781a4602 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -335,7 +335,7 @@ func TestMultiContainerSignal(t *testing.T) { } // Kill process 2. - if err := containers[1].Signal(syscall.SIGKILL, false); err != nil { + if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil { t.Errorf("failed to kill process 2: %v", err) } @@ -368,12 +368,12 @@ func TestMultiContainerSignal(t *testing.T) { // Now that process 2 is gone, ensure we get an error trying to // signal it again. - if err := containers[1].Signal(syscall.SIGKILL, false); err == nil { + if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil { t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) } // Kill process 1. - if err := containers[0].Signal(syscall.SIGKILL, false); err != nil { + if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil { t.Errorf("failed to kill process 1: %v", err) } @@ -395,7 +395,7 @@ func TestMultiContainerSignal(t *testing.T) { } // The sentry should be gone, so signaling should yield an error. - if err := containers[0].Signal(syscall.SIGKILL, false); err == nil { + if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil { t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) } } @@ -577,7 +577,7 @@ func TestMultiContainerKillAll(t *testing.T) { if tc.killContainer { // First kill the init process to make the container be stopped with // processes still running inside. - containers[1].Signal(syscall.SIGKILL, false) + containers[1].SignalContainer(syscall.SIGKILL, false) op := func() error { c, err := Load(conf.RootDir, ids[1]) if err != nil { @@ -598,7 +598,7 @@ func TestMultiContainerKillAll(t *testing.T) { t.Fatalf("failed to load child container %q: %v", c.ID, err) } // Kill'Em All - if err := c.Signal(syscall.SIGKILL, true); err != nil { + if err := c.SignalContainer(syscall.SIGKILL, true); err != nil { t.Fatalf("failed to send SIGKILL to container %q: %v", c.ID, err) } diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index 9e4b5326d..cc3b087e1 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -125,6 +125,7 @@ func server(listener net.Listener, out *os.File) { type taskTree struct { depth int width int + pause bool } // Name implements subcommands.Command. 
@@ -146,6 +147,7 @@ func (*taskTree) Usage() string { func (c *taskTree) SetFlags(f *flag.FlagSet) { f.IntVar(&c.depth, "depth", 1, "number of levels to create") f.IntVar(&c.width, "width", 1, "number of tasks at each level") + f.BoolVar(&c.pause, "pause", false, "whether the tasks should pause perpetually") } // Execute implements subcommands.Command. @@ -164,7 +166,8 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa cmd := exec.Command( "/proc/self/exe", c.Name(), "--depth", strconv.Itoa(c.depth-1), - "--width", strconv.Itoa(c.width)) + "--width", strconv.Itoa(c.width), + "--pause", strconv.FormatBool(c.pause)) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr @@ -177,6 +180,11 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa for _, c := range cmds { c.Wait() } + + if c.pause { + select {} + } + return subcommands.ExitSuccess } -- cgit v1.2.3 From 8cbca46b6d99bcf0b2647ffa247b0963f872916b Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 17 Oct 2018 10:54:19 -0700 Subject: Remove incorrect TODO. PiperOrigin-RevId: 217548429 Change-Id: Ie640c881fdc4fc70af58c8ca834df1ac531e519a --- runsc/boot/events.go | 1 - 1 file changed, 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 832339cf4..595846b10 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -29,7 +29,6 @@ type Event struct { // Stats is the runc specific stats structure for stability when encoding and // decoding stats. -// TODO: Many fields aren't obtainable due to a lack of cgroups. type Stats struct { Memory Memory `json:"memory"` Pids Pids `json:"pids"` -- cgit v1.2.3 From 4e6f0892c96c374b1abcf5c39b75ba52d98c97f8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 17 Oct 2018 12:27:58 -0700 Subject: runsc: Support job control signals for the root container. Now containers run with "docker run -it" support control characters like ^C and ^Z. This required refactoring our signal handling a bit. Signals delivered to the "runsc boot" process are turned into loader.Signal calls with the appropriate delivery mode. Previously they were always sent directly to PID 1. 
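The delivery-mode idea described above can be pictured with a small standalone sketch. This is not code from this change; deliveryMode, forward, and the constant names below are invented purely for illustration. The point is only that a host-side signal handler picks a delivery target based on whether a console is attached, so job-control characters can reach the foreground process group instead of always being sent to PID 1.

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

// deliveryMode mirrors the idea of choosing how a forwarded signal is
// delivered inside a sandbox. These names are illustrative only.
type deliveryMode int

const (
	toInitProcess deliveryMode = iota
	toForegroundProcessGroup
)

// forward decides where a host signal should go: with a console attached,
// job-control signals go to the foreground process group so ^C and ^Z behave
// as expected; otherwise they go to the container's init process.
func forward(sig os.Signal, console bool) deliveryMode {
	if console {
		return toForegroundProcessGroup
	}
	return toInitProcess
}

func main() {
	// Wait for a single SIGINT/SIGTERM (e.g. Ctrl-C) and show the decision.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)
	sig := <-ch
	fmt.Printf("forwarding %v with mode %d\n", sig, forward(sig, true))
}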
PiperOrigin-RevId: 217566770 Change-Id: I5b7220d9a0f2b591a56335479454a200c6de8732 --- pkg/sentry/kernel/kernel.go | 27 +- pkg/sentry/sighandling/BUILD | 6 +- pkg/sentry/sighandling/sighandling.go | 25 +- runsc/boot/controller.go | 58 ++-- runsc/boot/fds.go | 45 ++- runsc/boot/loader.go | 221 +++++++------- runsc/container/BUILD | 1 + runsc/container/console_test.go | 452 +++++++++++++++++++++++++++++ runsc/container/container_test.go | 203 ------------- runsc/sandbox/sandbox.go | 24 +- runsc/test/integration/exec_test.go | 2 +- runsc/test/integration/integration_test.go | 48 +++ runsc/test/testutil/docker.go | 39 +++ 13 files changed, 776 insertions(+), 375 deletions(-) create mode 100644 runsc/container/console_test.go (limited to 'runsc') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index cc664deec..84afdb530 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -839,17 +839,40 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() + var lastErr error for t := range k.tasks.Root.tids { if t == t.tg.leader && t.ContainerID() == cid { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() infoCopy := *info if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { - return err + lastErr = err } } } - return nil + return lastErr +} + +// SendProcessGroupSignal sends a signal to all processes inside the process +// group. It is analagous to kernel/signal.c:kill_pgrp. +func (k *Kernel) SendProcessGroupSignal(pg *ProcessGroup, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + var lastErr error + for t := range k.tasks.Root.tids { + if t == t.tg.leader && t.tg.ProcessGroup() == pg { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + infoCopy := *info + if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + } + } + return lastErr } // FeatureSet returns the FeatureSet. diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index f480f0735..751176747 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -10,9 +10,5 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling", visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/arch", - "//pkg/sentry/kernel", - ], + deps = ["//pkg/abi/linux"], ) diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 0946ab075..29bcf55ab 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -23,18 +23,17 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ) // numSignals is the number of normal (non-realtime) signals on Linux. const numSignals = 32 -// forwardSignals listens for incoming signals and delivers them to k. +// handleSignals listens for incoming signals and calls the given handler +// function. // // It starts when the start channel is closed, stops when the stop channel // is closed, and closes done once it will no longer deliver signals to k. 
-func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop, done chan struct{}) { +func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, stop, done chan struct{}) { // Build a select case. sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}} for _, sigchan := range sigchans { @@ -98,18 +97,19 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop, do } } - k.SendExternalSignal(&arch.SignalInfo{Signo: int32(signal)}, "sentry") + // Pass the signal to the handler. + handler(signal) } } -// PrepareForwarding ensures that synchronous signals are forwarded to k and -// returns a callback that starts signal delivery, which itself returns a -// callback that stops signal forwarding. +// PrepareHandler ensures that synchronous signals are passed to the given +// handler function and returns a callback that starts signal delivery, which +// itself returns a callback that stops signal handling. // // Note that this function permanently takes over signal handling. After the // stop callback, signals revert to the default Go runtime behavior, which // cannot be overridden with external calls to signal.Notify. -func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func() { +func PrepareHandler(handler func(linux.Signal)) func() func() { start := make(chan struct{}) stop := make(chan struct{}) done := make(chan struct{}) @@ -125,15 +125,10 @@ func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func( for sig := 1; sig <= numSignals+1; sig++ { sigchan := make(chan os.Signal, 1) sigchans = append(sigchans, sigchan) - - if syscall.Signal(sig) == skipSignal { - continue - } - signal.Notify(sigchan, syscall.Signal(sig)) } // Start up our listener. - go forwardSignals(k, sigchans, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. + go handleSignals(sigchans, handler, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. return func() func() { close(start) diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index eaeb9e2d8..bee82f344 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -425,6 +425,26 @@ func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus) } +// SignalDeliveryMode enumerates different signal delivery modes. +type SignalDeliveryMode int + +const ( + // DeliverToProcess delivers the signal to the container process with + // the specified PID. If PID is 0, then the container init process is + // signaled. + DeliverToProcess SignalDeliveryMode = iota + + // DeliverToAllProcesses delivers the signal to all processes in the + // container. PID must be 0. + DeliverToAllProcesses + + // DeliverToForegroundProcessGroup delivers the signal to the + // foreground process group in the same TTY session as the specified + // process. If PID is 0, then the signal is delivered to the foreground + // process group for the TTY for the init process. + DeliverToForegroundProcessGroup +) + // SignalArgs are arguments to the Signal method. type SignalArgs struct { // CID is the container ID. @@ -433,36 +453,20 @@ type SignalArgs struct { // Signo is the signal to send to the process. Signo int32 - // All is set when signal should be sent to all processes in the container. - // When false, the signal is sent to the root container process only. 
- All bool -} - -// Signal sends a signal to the root process of the container. -func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { - log.Debugf("containerManager.Signal %q %d, all: %t", args.CID, args.Signo, args.All) - return cm.l.signalContainer(args.CID, args.Signo, args.All) -} - -// SignalProcessArgs are arguments to the Signal method. -type SignalProcessArgs struct { - // CID is the container ID. - CID string - // PID is the process ID in the given container that will be signaled. + // If 0, the root container will be signalled. PID int32 - // Signo is the signal to send to the process. - Signo int32 - - // SendToForegroundProcess indicates that the signal should be sent to - // the foreground process group in the session that PID belongs to. - // This is only valid if the process is attached to a host TTY. - SendToForegroundProcess bool + // Mode is the signal delivery mode. + Mode SignalDeliveryMode } -// SignalProcess sends a signal to a particular process in the container. -func (cm *containerManager) SignalProcess(args *SignalProcessArgs, _ *struct{}) error { - log.Debugf("containerManager.Signal: %+v", args) - return cm.l.signalProcess(args.CID, args.PID, args.Signo, args.SendToForegroundProcess) +// Signal sends a signal to one or more processes in a container. If args.PID +// is 0, then the container init process is used. Depending on the +// args.SignalDeliveryMode option, the signal may be sent directly to the +// indicated process, to all processes in the container, or to the foreground +// process group. +func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal %+v", args) + return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) } diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index a5a6ba8af..9416e3a5c 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -35,6 +35,7 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons fdm := k.NewFDMap() defer fdm.DecRef() + mounter := fs.FileOwnerFromContext(ctx) // Maps sandbox FD to host FD. fdMap := map[int]int{ @@ -42,16 +43,44 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons 1: stdioFDs[1], 2: stdioFDs[2], } - mounter := fs.FileOwnerFromContext(ctx) - for sfd, hfd := range fdMap { - file, err := host.ImportFile(ctx, hfd, mounter, console /* isTTY */) - if err != nil { - return nil, fmt.Errorf("failed to import fd %d: %v", hfd, err) + var ttyFile *fs.File + for appFD, hostFD := range fdMap { + var appFile *fs.File + + if console && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will + // use it for the other stdio FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three + // stdio FDs must point to the same fs.File in + // order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + // Import the file as a regular host file. 
+ var err error + appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() } - defer file.DecRef() - if err := fdm.NewFDAt(kdefs.FD(sfd), file, kernel.FDFlags{}, l); err != nil { - return nil, fmt.Errorf("failed to add imported fd %d to FDMap: %v", hfd, err) + + // Add the file to the FD map. + if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + return nil, err } } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index fa169d090..c79b95bde 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,7 +19,6 @@ import ( "fmt" mrand "math/rand" "os" - "os/signal" "runtime" "sync" "sync/atomic" @@ -110,7 +109,7 @@ type Loader struct { // mu guards processes. mu sync.Mutex - // processes maps containers root process and invocation of exec. Root + // processes maps containers init process and invocation of exec. Root // processes are keyed with container ID and pid=0, while exec invocations // have the corresponding pid set. // @@ -291,28 +290,9 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("error creating control server: %v", err) } - // We don't care about child signals; some platforms can generate a - // tremendous number of useless ones (I'm looking at you, ptrace). - if err := sighandling.IgnoreChildStop(); err != nil { - return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) - } - // Ensure that signals received are forwarded to the emulated kernel. - ps := syscall.Signal(args.Conf.PanicSignal) - startSignalForwarding := sighandling.PrepareForwarding(k, ps) - if args.Conf.PanicSignal != -1 { - // Panics if the sentry receives 'Config.PanicSignal'. - panicChan := make(chan os.Signal, 1) - signal.Notify(panicChan, ps) - go func() { // S/R-SAFE: causes sentry panic. - <-panicChan - panic("Signal-induced panic") - }() - log.Infof("Panic signal set to %v(%d)", ps, args.Conf.PanicSignal) - } - procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { - return nil, fmt.Errorf("failed to create root process: %v", err) + return nil, fmt.Errorf("failed to create init process for root container: %v", err) } if err := initCompatLogs(args.UserLogFD); err != nil { @@ -320,19 +300,47 @@ func New(args Args) (*Loader, error) { } l := &Loader{ - k: k, - ctrl: ctrl, - conf: args.Conf, - console: args.Console, - watchdog: watchdog, - spec: args.Spec, - goferFDs: args.GoferFDs, - stdioFDs: args.StdioFDs, - startSignalForwarding: startSignalForwarding, - rootProcArgs: procArgs, - sandboxID: args.ID, - processes: make(map[execID]*execProcess), + k: k, + ctrl: ctrl, + conf: args.Conf, + console: args.Console, + watchdog: watchdog, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: args.StdioFDs, + rootProcArgs: procArgs, + sandboxID: args.ID, + processes: make(map[execID]*execProcess), } + + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + } + + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) { + // Panic signal should cause a panic. 
+ if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if args.Console { + // Since we are running with a console, we should + // forward the signal to the foreground process group + // so that job control signals like ^C can be handled + // properly. + deliveryMode = DeliverToForegroundProcessGroup + } + if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err) + } + }) + ctrl.manager.l = l return l, nil } @@ -467,9 +475,15 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } - l.mu.Lock() eid := execID{cid: l.sandboxID} - l.processes[eid] = &execProcess{tg: l.k.GlobalInit()} + ep := execProcess{tg: l.k.GlobalInit()} + if l.console { + ttyFile := l.rootProcArgs.FDMap.GetFile(0) + defer ttyFile.DecRef() + ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) + } + l.mu.Lock() + l.processes[eid] = &ep l.mu.Unlock() // Start signal forwarding only after an init process is created. @@ -572,7 +586,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // filesystem. func (l *Loader) destroyContainer(cid string) error { // First kill and wait for all processes in the container. - if err := l.signalContainer(cid, int32(linux.SIGKILL), true /*all*/); err != nil { + if err := l.signal(cid, 0, int32(linux.SIGKILL), DeliverToAllProcesses); err != nil { return fmt.Errorf("failed to SIGKILL all container processes: %v", err) } @@ -634,7 +648,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return tgid, nil } -// waitContainer waits for the root process of a container to exit. +// waitContainer waits for the init process of a container to exit. func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. @@ -740,11 +754,12 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } -// signalProcess sends a signal to the process with the given PID. If -// sendToFGProcess is true, then the signal will be sent to the foreground -// process group in the same session that PID belongs to. -func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess bool) error { - if pid <= 0 { +// signal sends a signal to one or more processes in a container. If PID is 0, +// then the container init process is used. Depending on the SignalDeliveryMode +// option, the signal may be sent directly to the indicated process, to all +// processes in the container, or to the foreground process group. +func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { + if pid < 0 { return fmt.Errorf("failed to signal container %q PID %d: PID must be positive", cid, pid) } @@ -756,10 +771,16 @@ func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess boo ep, ok := l.processes[eid] l.mu.Unlock() - // The caller may be signaling a process not started directly via exec. - // In this case, find the process in the container's PID namespace and - // signal it. - if !ok { + switch mode { + case DeliverToProcess: + if ok { + // Send signal directly to the identified process. + return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) + } + + // The caller may be signaling a process not started directly via exec. 
+ // In this case, find the process in the container's PID namespace and + // signal it. ep, ok := l.processes[execID{cid: cid}] if !ok { return fmt.Errorf("no container with ID: %q", cid) @@ -772,74 +793,60 @@ func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess boo return fmt.Errorf("process %d is part of a different container: %q", pid, tg.Leader().ContainerID()) } return tg.SendSignal(&arch.SignalInfo{Signo: signo}) - } - - if !sendToFGProcess { - // Send signal directly to exec process. - return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) - } - // Lookup foreground process group from the TTY for the given process, - // and send the signal to it. - if ep.tty == nil { - return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, pid) - } - pg := ep.tty.ForegroundProcessGroup() - if pg == nil { - // No foreground process group has been set. Signal the - // original thread group. - log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) - return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) - } + case DeliverToForegroundProcessGroup: + if !ok { + return fmt.Errorf("failed to signal foreground process group for container %q PID %d: no such PID", cid, pid) + } - // Send the signal to all processes in the process group. - var lastErr error - for _, tg := range l.k.TaskSet().Root.ThreadGroups() { - if tg.ProcessGroup() != pg { - continue + // Lookup foreground process group from the TTY for the given process, + // and send the signal to it. + if ep.tty == nil { + return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, pid) } - if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { - lastErr = err + pg := ep.tty.ForegroundProcessGroup() + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) + return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) + } + // Send the signal to all processes in the process group. + var lastErr error + for _, tg := range l.k.TaskSet().Root.ThreadGroups() { + if tg.ProcessGroup() != pg { + continue + } + if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { + lastErr = err + } + } + return lastErr + case DeliverToAllProcesses: + if !ok { + return fmt.Errorf("failed to signal all processes in container %q PID %d: no such PID", cid, pid) } - } - return lastErr -} - -// signalContainer sends a signal to the root container process, or to all -// processes in the container if all is true. -func (l *Loader) signalContainer(cid string, signo int32, all bool) error { - si := arch.SignalInfo{Signo: signo} - - l.mu.Lock() - defer l.mu.Unlock() - - eid := execID{cid: cid} - ep, ok := l.processes[eid] - if !ok { - return fmt.Errorf("failed to signal container %q: no such container", cid) - } - - if !all { - return ep.tg.SendSignal(&si) - } - // Pause the kernel to prevent new processes from being created while - // the signal is delivered. This prevents process leaks when SIGKILL is - // sent to the entire container. - l.k.Pause() - if err := l.k.SendContainerSignal(cid, &si); err != nil { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. 
+ l.k.Pause() + if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil { + l.k.Unpause() + return err + } l.k.Unpause() - return err - } - l.k.Unpause() - // If killing all processes, wait for them to exit. - if all && linux.Signal(signo) == linux.SIGKILL { - for _, t := range l.k.TaskSet().Root.Tasks() { - if t.ContainerID() == cid { - t.ThreadGroup().WaitExited() + // If SIGKILLing all processes, wait for them to exit. + if linux.Signal(signo) == linux.SIGKILL { + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } } } + return nil + default: + panic(fmt.Sprintf("unknown signal signal delivery mode %v", mode)) } - return nil } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 60f1d3033..f4c6f1525 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -30,6 +30,7 @@ go_test( name = "container_test", size = "medium", srcs = [ + "console_test.go", "container_test.go", "fs_test.go", "multi_container_test.go", diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go new file mode 100644 index 000000000..82adcbb7d --- /dev/null +++ b/runsc/container/console_test.go @@ -0,0 +1,452 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "fmt" + "io" + "os" + "path/filepath" + "sync" + "syscall" + "testing" + "time" + + "github.com/kr/pty" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// createConsoleSocket creates a socket that will receive a console fd from the +// sandbox. If no error occurs, it returns the server socket and a cleanup +// function. +func createConsoleSocket(socketPath string) (*unet.ServerSocket, func() error, error) { + cwd, err := os.Getwd() + if err != nil { + return nil, nil, fmt.Errorf("error getting cwd: %v", err) + } + // We use a relative path to avoid overflowing the unix path length + // limit (108 chars). + socketRelPath, err := filepath.Rel(cwd, socketPath) + if err != nil { + return nil, nil, fmt.Errorf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + } + if len(socketRelPath) > len(socketPath) { + socketRelPath = socketPath + } + srv, err := unet.BindAndListen(socketRelPath, false) + if err != nil { + return nil, nil, fmt.Errorf("error binding and listening to socket %q: %v", socketPath, err) + } + + cleanup := func() error { + if err := srv.Close(); err != nil { + return fmt.Errorf("error closing socket %q: %v", socketRelPath, err) + } + if err := os.Remove(socketPath); err != nil { + return fmt.Errorf("error removing socket %q: %v", socketRelPath, err) + } + return nil + } + + return srv, cleanup, nil +} + +// receiveConsolePTY accepts a connection on the server socket and reads fds. 
+// It fails if more than one FD is received, or if the FD is not a PTY. It +// returns the PTY master file. +func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) { + sock, err := srv.Accept() + if err != nil { + return nil, fmt.Errorf("error accepting socket connection: %v", err) + } + + // Allow 3 fds to be received. We only expect 1. + r := sock.Reader(true /* blocking */) + r.EnableFDs(1) + + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + return nil, fmt.Errorf("error reading from socket connection: %v", err) + } + + // We should have gotten a control message. + fds, err := r.ExtractFDs() + if err != nil { + return nil, fmt.Errorf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + return nil, fmt.Errorf("got %d fds from socket, wanted 1", len(fds)) + } + + // Verify that the fd is a terminal. + if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + return nil, fmt.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) + } + + return os.NewFile(uintptr(fds[0]), "pty_master"), nil +} + +// Test that an pty FD is sent over the console socket if one is provided. +func TestConsoleSocket(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + spec := testutil.NewSpecWithArgs("true") + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + socketPath := filepath.Join(bundleDir, "socket") + srv, cleanup, err := createConsoleSocket(socketPath) + if err != nil { + t.Fatalf("error creating socket at %q: %v", socketPath, err) + } + defer cleanup() + + // Create the container and pass the socket name. + id := testutil.UniqueContainerID() + c, err := Create(id, spec, conf, bundleDir, socketPath, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + ptyMaster.Close() + } +} + +// Test that job control signals work on a console created with "exec -ti". +func TestJobControlSignalExec(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + conf := testutil.TestConfig() + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Create a pty master/slave. The slave will be passed to the exec + // process. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + t.Fatalf("error opening pty: %v", err) + } + defer ptyMaster.Close() + defer ptySlave.Close() + + // Exec bash and attach a terminal. + args := &control.ExecArgs{ + Filename: "/bin/bash", + // Don't let bash execute from profile or rc files, otherwise + // our PID counts get messed up. + Argv: []string{"/bin/bash", "--noprofile", "--norc"}, + // Pass the pty slave as FD 0, 1, and 2. 
+ FilePayload: urpc.FilePayload{ + Files: []*os.File{ptySlave, ptySlave, ptySlave}, + }, + StdioIsPty: true, + } + + pid, err := c.Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if pid != 2 { + t.Fatalf("exec got pid %d, wanted %d", pid, 2) + } + + // Make sure all the processes are running. + expectedPL := []*control.Process{ + // Root container process. + {PID: 1, Cmd: "sleep"}, + // Bash from exec process. + {PID: 2, Cmd: "bash"}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Execute sleep. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for it to start. Sleep's PPID is bash's PID. + expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Send a SIGTERM to the foreground process for the exec PID. Note that + // although we pass in the PID of "bash", it should actually terminate + // "sleep", since that is the foreground process. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + expectedPL = expectedPL[:1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Make sure the process indicates it was killed by a SIGKILL. + ws, err := c.WaitPID(pid, true) + if err != nil { + t.Errorf("waiting on container failed: %v", err) + } + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + +// Test that job control signals work on a console created with "run -ti". +func TestJobControlSignalRootContainer(t *testing.T) { + conf := testutil.TestConfig() + // Don't let bash execute from profile or rc files, otherwise our PID + // counts get messed up. + spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc") + spec.Process.Terminal = true + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + socketPath := filepath.Join(bundleDir, "socket") + srv, cleanup, err := createConsoleSocket(socketPath) + if err != nil { + t.Fatalf("error creating socket at %q: %v", socketPath, err) + } + defer cleanup() + + // Create the container and pass the socket name. + id := testutil.UniqueContainerID() + c, err := Create(id, spec, conf, bundleDir, socketPath, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + + // Get the PTY master. 
+ ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + defer ptyMaster.Close() + + // Bash output as well as sandbox output will be written to the PTY + // file. Writes after a certain point will block unless we drain the + // PTY, so we must continually copy from it. + // + // We log the output to stdout for debugabilitly, and also to a buffer, + // since we wait on particular output from bash below. We use a custom + // blockingBuffer which is thread-safe and also blocks on Read calls, + // which makes this a suitable Reader for WaitUntilRead. + ptyBuf := newBlockingBuffer() + tee := io.TeeReader(ptyMaster, ptyBuf) + go io.Copy(os.Stdout, tee) + + // Start the container. + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Start waiting for the container to exit in a goroutine. We do this + // very early, otherwise it might exit before we have a chance to call + // Wait. + var ( + ws syscall.WaitStatus + wg sync.WaitGroup + ) + wg.Add(1) + go func() { + var err error + ws, err = c.Wait() + if err != nil { + t.Errorf("error waiting on container: %v", err) + } + wg.Done() + }() + + // Wait for bash to start. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "bash"}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Fatal(err) + } + + // Execute sleep via the terminal. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for sleep to start. + expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Fatal(err) + } + + // Reset the pty buffer, so there is less output for us to scan later. + ptyBuf.Reset() + + // Send a SIGTERM to the foreground process. We pass PID=0, indicating + // that the root process should be killed. However, by setting + // fgProcess=true, the signal should actually be sent to sleep. + if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Wait for the sandbox to exit. It should exit with a SIGKILL status. + wg.Wait() + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + +// blockingBuffer is a thread-safe buffer that blocks when reading if the +// buffer is empty. It implements io.ReadWriter. +type blockingBuffer struct { + // A send to readCh indicates that a previously empty buffer now has + // data for reading. + readCh chan struct{} + + // mu protects buf. 
+ mu sync.Mutex + buf bytes.Buffer +} + +func newBlockingBuffer() *blockingBuffer { + return &blockingBuffer{ + readCh: make(chan struct{}, 1), + } +} + +// Write implements Writer.Write. +func (bb *blockingBuffer) Write(p []byte) (int, error) { + bb.mu.Lock() + defer bb.mu.Unlock() + l := bb.buf.Len() + n, err := bb.buf.Write(p) + if l == 0 && n > 0 { + // New data! + bb.readCh <- struct{}{} + } + return n, err +} + +// Read implements Reader.Read. It will block until data is available. +func (bb *blockingBuffer) Read(p []byte) (int, error) { + for { + bb.mu.Lock() + n, err := bb.buf.Read(p) + if n > 0 || err != io.EOF { + if bb.buf.Len() == 0 { + // Reset the readCh. + select { + case <-bb.readCh: + default: + } + } + bb.mu.Unlock() + return n, err + } + bb.mu.Unlock() + + // Wait for new data. + <-bb.readCh + } +} + +// Reset resets the buffer. +func (bb *blockingBuffer) Reset() { + bb.mu.Lock() + defer bb.mu.Unlock() + bb.buf.Reset() + // Reset the readCh. + select { + case <-bb.readCh: + default: + } +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index d9cd38c0a..e2bb7d8ec 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -17,7 +17,6 @@ package container import ( "bytes" "fmt" - "io" "io/ioutil" "os" "path" @@ -31,15 +30,11 @@ import ( "time" "github.com/cenkalti/backoff" - "github.com/kr/pty" specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1151,89 +1146,6 @@ func TestCapabilities(t *testing.T) { } } -// Test that an tty FD is sent over the console socket if one is provided. -func TestConsoleSocket(t *testing.T) { - for _, conf := range configs(all...) { - t.Logf("Running test with conf: %+v", conf) - spec := testutil.NewSpecWithArgs("true") - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create a named socket and start listening. We use a relative path - // to avoid overflowing the unix path length limit (108 chars). - socketPath := filepath.Join(bundleDir, "socket") - cwd, err := os.Getwd() - if err != nil { - t.Fatalf("error getting cwd: %v", err) - } - socketRelPath, err := filepath.Rel(cwd, socketPath) - if err != nil { - t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) - } - if len(socketRelPath) > len(socketPath) { - socketRelPath = socketPath - } - srv, err := unet.BindAndListen(socketRelPath, false) - if err != nil { - t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) - } - defer os.Remove(socketPath) - - // Create the container and pass the socket name. - id := testutil.UniqueContainerID() - c, err := Create(id, spec, conf, bundleDir, socketRelPath, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - c.Destroy() - - // Open the othe end of the socket. - sock, err := srv.Accept() - if err != nil { - t.Fatalf("error accepting socket connection: %v", err) - } - - // Allow 3 fds to be received. We only expect 1. 
- r := sock.Reader(true /* blocking */) - r.EnableFDs(1) - - // The socket is closed right after sending the FD, so EOF is - // an allowed error. - b := [][]byte{{}} - if _, err := r.ReadVec(b); err != nil && err != io.EOF { - t.Fatalf("error reading from socket connection: %v", err) - } - - // We should have gotten a control message. - fds, err := r.ExtractFDs() - if err != nil { - t.Fatalf("error extracting fds from socket connection: %v", err) - } - if len(fds) != 1 { - t.Fatalf("got %d fds from socket, wanted 1", len(fds)) - } - - // Verify that the fd is a terminal. - if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { - t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) - } - - // Shut it down. - if err := c.Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } - - // Close socket. - if err := srv.Close(); err != nil { - t.Fatalf("error destroying container: %v", err) - } - } -} - // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { @@ -1626,121 +1538,6 @@ func TestRootNotMount(t *testing.T) { } } -func TestJobControlSignalExec(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") - conf := testutil.TestConfig() - - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create and start the container. - c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - // Create a pty master/slave. The slave will be passed to the exec - // process. - ptyMaster, ptySlave, err := pty.Open() - if err != nil { - t.Fatalf("error opening pty: %v", err) - } - defer ptyMaster.Close() - defer ptySlave.Close() - - // Exec bash and attach a terminal. - args := &control.ExecArgs{ - Filename: "/bin/bash", - // Don't let bash execute from profile or rc files, otherwise - // our PID counts get messed up. - Argv: []string{"/bin/bash", "--noprofile", "--norc"}, - // Pass the pty slave as FD 0, 1, and 2. - FilePayload: urpc.FilePayload{ - Files: []*os.File{ptySlave, ptySlave, ptySlave}, - }, - StdioIsPty: true, - } - - pid, err := c.Execute(args) - if err != nil { - t.Fatalf("error executing: %v", err) - } - if pid != 2 { - t.Fatalf("exec got pid %d, wanted %d", pid, 2) - } - - // Make sure all the processes are running. - expectedPL := []*control.Process{ - // Root container process. - {PID: 1, Cmd: "sleep"}, - // Bash from exec process. - {PID: 2, Cmd: "bash"}, - } - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Execute sleep. - ptyMaster.Write([]byte("sleep 100\n")) - - // Wait for it to start. Sleep's PPID is bash's PID. - expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Send a SIGTERM to the foreground process for the exec PID. Note that - // although we pass in the PID of "bash", it should actually terminate - // "sleep", since that is the foreground process. - if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil { - t.Fatalf("error signaling container: %v", err) - } - - // Sleep process should be gone. 
- expectedPL = expectedPL[:len(expectedPL)-1] - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Sleep is dead, but it may take more time for bash to notice and - // change the foreground process back to itself. We know it is done - // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { - t.Fatalf("bash did not take over pty: %v", err) - } - - // Send a SIGKILL to the foreground process again. This time "bash" - // should be killed. We use SIGKILL instead of SIGTERM or SIGINT - // because bash ignores those. - if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil { - t.Fatalf("error signaling container: %v", err) - } - expectedPL = expectedPL[:1] - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Make sure the process indicates it was killed by a SIGKILL. - ws, err := c.WaitPID(pid, true) - if err != nil { - t.Errorf("waiting on container failed: %v", err) - } - if !ws.Signaled() { - t.Error("ws.Signaled() got false, want true") - } - if got, want := ws.Signal(), syscall.SIGKILL; got != want { - t.Errorf("ws.Signal() got %v, want %v", got, want) - } -} - func TestUserLog(t *testing.T) { app, err := testutil.FindFile("runsc/container/test_app") if err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 713b326a6..6dc8cf7f0 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -696,10 +696,15 @@ func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) erro } defer conn.Close() + mode := boot.DeliverToProcess + if all { + mode = boot.DeliverToAllProcesses + } + args := boot.SignalArgs{ CID: cid, Signo: int32(sig), - All: all, + Mode: mode, } if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { return fmt.Errorf("err signaling container %q: %v", cid, err) @@ -719,13 +724,18 @@ func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgPro } defer conn.Close() - args := boot.SignalProcessArgs{ - CID: cid, - Signo: int32(sig), - PID: pid, - SendToForegroundProcess: fgProcess, + mode := boot.DeliverToProcess + if fgProcess { + mode = boot.DeliverToForegroundProcessGroup + } + + args := boot.SignalArgs{ + CID: cid, + Signo: int32(sig), + PID: pid, + Mode: mode, } - if err := conn.Call(boot.ContainerSignalProcess, &args, nil); err != nil { + if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { return fmt.Errorf("err signaling container %q PID %d: %v", cid, pid, err) } return nil diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index d08140ad3..3cac674d0 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -66,7 +66,7 @@ func TestExecJobControl(t *testing.T) { if err := testutil.Pull("alpine"); err != nil { t.Fatalf("docker pull failed: %v", err) } - d := testutil.MakeDocker("exec-test") + d := testutil.MakeDocker("exec-job-control-test") // Start the container. 
if err := d.Run("alpine", "sleep", "1000"); err != nil { diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index b7d07309d..536bb17e0 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -28,6 +28,7 @@ import ( "os" "strconv" "strings" + "syscall" "testing" "time" @@ -231,6 +232,53 @@ func TestNumCPU(t *testing.T) { } } +// TestJobControl tests that job control characters are handled properly. +func TestJobControl(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("job-control-test") + + // Start the container with an attached PTY. + _, ptmx, err := d.RunWithPty("alpine", "sh") + if err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer ptmx.Close() + defer d.CleanUp() + + // Call "sleep 100" in the shell. + if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Give shell a few seconds to start executing the sleep. + time.Sleep(2 * time.Second) + + // Send a ^C to the pty, which should kill sleep, but not the shell. + // \x03 is ASCII "end of text", which is the same as ^C. + if _, err := ptmx.Write([]byte{'\x03'}); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // The shell should still be alive at this point. Sleep should have + // exited with code 2+128=130. We'll exit with 10 plus that number, so + // that we can be sure that the shell did not get signalled. + if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Wait for the container to exit. + got, err := d.Wait(5 * time.Second) + if err != nil { + t.Fatalf("error getting exit code: %v", err) + } + // Container should exit with code 10+130=140. + if want := syscall.WaitStatus(140); got != want { + t.Errorf("container exited with code %d want %d", got, want) + } +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 8a51d3eed..4e48817cf 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -25,6 +25,7 @@ import ( "regexp" "strconv" "strings" + "syscall" "time" "github.com/kr/pty" @@ -198,6 +199,13 @@ func (d *Docker) Run(args ...string) error { return err } +// RunWithPty is like Run but with an attached pty. +func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) { + a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-it"} + a = append(a, args...) + return doWithPty(a...) +} + // RunFg calls 'docker run' with the arguments provided in the foreground. It // blocks until the container exits and returns the output. func (d *Docker) RunFg(args ...string) (string, error) { @@ -307,6 +315,37 @@ func (d *Docker) ID() (string, error) { return strings.TrimSpace(string(out)), nil } +// Wait waits for container to exit, up to the given timeout. Returns error if +// wait fails or timeout is hit. Returns the application return code otherwise. +// Note that the application may have failed even if err == nil, always check +// the exit code. 
+func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) { + timeoutChan := time.After(timeout) + waitChan := make(chan (syscall.WaitStatus)) + errChan := make(chan (error)) + + go func() { + out, err := do("wait", d.Name) + if err != nil { + errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err) + } + exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err) + } + waitChan <- syscall.WaitStatus(uint32(exit)) + }() + + select { + case ws := <-waitChan: + return ws, nil + case err := <-errChan: + return syscall.WaitStatus(1), err + case <-timeoutChan: + return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name) + } +} + // WaitForOutput calls 'docker logs' to retrieve containers output and searches // for the given pattern. func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { -- cgit v1.2.3 From e4277cb6ff2d1921e9e7f6fac309647e544bbc04 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 17 Oct 2018 14:10:27 -0700 Subject: Relativize all socket paths in tests. Otherwise they may exceed the maximum. PiperOrigin-RevId: 217584658 Change-Id: I869e400d3409599c0d3b85c6590702c052f49550 --- runsc/container/console_test.go | 64 +++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 24 deletions(-) (limited to 'runsc') diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 82adcbb7d..8f019b54a 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -33,34 +33,44 @@ import ( "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) -// createConsoleSocket creates a socket that will receive a console fd from the -// sandbox. If no error occurs, it returns the server socket and a cleanup -// function. -func createConsoleSocket(socketPath string) (*unet.ServerSocket, func() error, error) { +// socketPath creates a path inside bundleDir and ensures that the returned +// path is under 108 charactors (the unix socket path length limit), +// relativizing the path if necessary. +func socketPath(bundleDir string) (string, error) { + path := filepath.Join(bundleDir, "socket") cwd, err := os.Getwd() if err != nil { - return nil, nil, fmt.Errorf("error getting cwd: %v", err) + return "", fmt.Errorf("error getting cwd: %v", err) } - // We use a relative path to avoid overflowing the unix path length - // limit (108 chars). - socketRelPath, err := filepath.Rel(cwd, socketPath) + relPath, err := filepath.Rel(cwd, path) if err != nil { - return nil, nil, fmt.Errorf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err) } - if len(socketRelPath) > len(socketPath) { - socketRelPath = socketPath + if len(path) > len(relPath) { + path = relPath } - srv, err := unet.BindAndListen(socketRelPath, false) + const maxPathLen = 108 + if len(path) > maxPathLen { + return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path) + } + return path, nil +} + +// createConsoleSocket creates a socket at the given path that will receive a +// console fd from the sandbox. If no error occurs, it returns the server +// socket and a cleanup function. 
+func createConsoleSocket(path string) (*unet.ServerSocket, func() error, error) { + srv, err := unet.BindAndListen(path, false) if err != nil { - return nil, nil, fmt.Errorf("error binding and listening to socket %q: %v", socketPath, err) + return nil, nil, fmt.Errorf("error binding and listening to socket %q: %v", path, err) } cleanup := func() error { if err := srv.Close(); err != nil { - return fmt.Errorf("error closing socket %q: %v", socketRelPath, err) + return fmt.Errorf("error closing socket %q: %v", path, err) } - if err := os.Remove(socketPath); err != nil { - return fmt.Errorf("error removing socket %q: %v", socketRelPath, err) + if err := os.Remove(path); err != nil { + return fmt.Errorf("error removing socket %q: %v", path, err) } return nil } @@ -117,16 +127,19 @@ func TestConsoleSocket(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - socketPath := filepath.Join(bundleDir, "socket") - srv, cleanup, err := createConsoleSocket(socketPath) + sock, err := socketPath(bundleDir) if err != nil { - t.Fatalf("error creating socket at %q: %v", socketPath, err) + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup, err := createConsoleSocket(sock) + if err != nil { + t.Fatalf("error creating socket at %q: %v", sock, err) } defer cleanup() // Create the container and pass the socket name. id := testutil.UniqueContainerID() - c, err := Create(id, spec, conf, bundleDir, socketPath, "", "") + c, err := Create(id, spec, conf, bundleDir, sock, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } @@ -272,16 +285,19 @@ func TestJobControlSignalRootContainer(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - socketPath := filepath.Join(bundleDir, "socket") - srv, cleanup, err := createConsoleSocket(socketPath) + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup, err := createConsoleSocket(sock) if err != nil { - t.Fatalf("error creating socket at %q: %v", socketPath, err) + t.Fatalf("error creating socket at %q: %v", sock, err) } defer cleanup() // Create the container and pass the socket name. id := testutil.UniqueContainerID() - c, err := Create(id, spec, conf, bundleDir, socketPath, "", "") + c, err := Create(id, spec, conf, bundleDir, sock, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } -- cgit v1.2.3 From e0bb94201f1edb6ce649192fe4a62e1781940b50 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 17 Oct 2018 16:17:35 -0700 Subject: Close the gofer socket gracefully in boot:boot_test. We were closing the FD directly. If the test then created a new socket pair with the same FD, in-flight RPCs would get directed to the new socket and break the test. Instead, we should use unet.Socket.Close(), which allows any in-flight RPCs to finish. PiperOrigin-RevId: 217608491 Change-Id: I8c5a76638899ba30f33ca976e6fac967fa0aadbf --- runsc/boot/loader_test.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 7d35dcae2..41ff3681b 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -84,8 +84,13 @@ func startGofer(root string) (int, func(), error) { log.Infof("Gofer is stopping. FD: %d, err: %v\n", goferEnd, err) } }() - // Closing the gofer FD will stop the gofer and exit goroutine above. 
- return sandboxEnd, func() { syscall.Close(goferEnd) }, nil + // Closing the gofer socket will stop the gofer and exit goroutine above. + cleanup := func() { + if err := socket.Close(); err != nil { + log.Warningf("Error closing gofer socket: %v", err) + } + } + return sandboxEnd, cleanup, nil } func createLoader() (*Loader, func(), error) { -- cgit v1.2.3 From 2a697791d1a473c76973f135f3af9240a32ad668 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 18 Oct 2018 11:59:35 -0700 Subject: Skip TestPythonHello on ptrace platform while we debug the issue. PiperOrigin-RevId: 217743078 Change-Id: I47fabd88139b968b6183bcc0340065fdbbc1d40d --- runsc/test/image/image_test.go | 6 ++++++ runsc/test/testutil/docker.go | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 40c92e1c0..d89d80a86 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -192,6 +192,12 @@ func TestMysql(t *testing.T) { } func TestPythonHello(t *testing.T) { + // TODO: This test occasionally hangs on the ptrace + // platform. Re-enable it once we have this issue fixed. + if testutil.IsPtracePlatform() { + t.Skipf("Skipping PythonHello test on ptrace platform") + } + if err := testutil.Pull("google/python-hello"); err != nil { t.Fatalf("docker pull failed: %v", err) } diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 4e48817cf..7d6a72e5f 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -49,6 +49,15 @@ func IsPauseResumeSupported() bool { return !strings.Contains(getRuntime(), "hostnet") } +// IsPtracePlatform returns true if the runtime is using ptrace platform. +// +// TODO: Tests should not depend on the platform, but +// TestPythonHello sometimes hangs on ptrace. Once that is debugged, this +// method should go away. +func IsPtracePlatform() bool { + return !strings.Contains(getRuntime(), "kvm") +} + // EnsureSupportedDockerVersion checks if correct docker is installed. func EnsureSupportedDockerVersion() { cmd := exec.Command("docker", "version") -- cgit v1.2.3 From f3ffa4db525ea1a1d36307ea9593ed7b5e014ca7 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 18 Oct 2018 12:41:07 -0700 Subject: Resolve mount paths while setting up root fs mount It's hard to resolve symlinks inside the sandbox because rootfs and mounts may be read-only, forcing us to create mount points inside lower layer of an overlay, **before** the volumes are mounted. Since the destination must already be resolved outside the sandbox when creating mounts, take this opportunity to rewrite the spec with paths resolved. "runsc boot" will use the "resolved" spec to load mounts. In addition, symlink traversals were disabled while mounting containers inside the sandbox. It haven't been able to write a good test for it. So I'm relying on manual tests for now. 
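The resolution has to happen on the host side, where the rootfs layout is still visible, and the result is recorded back into the spec so that "runsc boot" never needs to chase symlinks itself. A rough sketch of the rewrite step, with filepath.EvalSymlinks standing in for runsc's rootfs-scoped resolver (a real resolver must keep absolute symlink targets confined to the rootfs, which EvalSymlinks does not do):

package main

import (
	"fmt"
	"path/filepath"
)

// cleanDestination resolves a mount destination against the container rootfs
// on the host and rewrites it as an absolute, symlink-free path as seen from
// inside the container.
func cleanDestination(rootfs, dest string) (string, error) {
	// EvalSymlinks is only a stand-in: it requires the path to exist and it
	// follows absolute symlink targets on the host instead of confining them
	// to rootfs, which the real resolver must do.
	resolved, err := filepath.EvalSymlinks(filepath.Join(rootfs, dest))
	if err != nil {
		return "", fmt.Errorf("resolving %q: %v", dest, err)
	}
	rel, err := filepath.Rel(rootfs, resolved)
	if err != nil {
		return "", fmt.Errorf("%q is not under %q: %v", resolved, rootfs, err)
	}
	return filepath.Join("/", rel), nil
}

func main() {
	// With a rootfs where /var/run is a symlink to /run, the destination
	// /var/run/secrets is rewritten to /run/secrets.
	fmt.Println(cleanDestination("/tmp/rootfs", "/var/run/secrets"))
}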
PiperOrigin-RevId: 217749904 Change-Id: I7ac434d5befd230db1488446cda03300cc0751a9 --- runsc/boot/config.go | 3 - runsc/boot/fs.go | 143 +++++++++++++------------------------- runsc/cmd/create.go | 3 - runsc/cmd/run.go | 2 - runsc/container/container.go | 26 +++++-- runsc/container/container_test.go | 2 +- runsc/container/fs.go | 30 +++++--- runsc/sandbox/sandbox.go | 7 +- runsc/specutils/specutils.go | 23 ++++++ runsc/test/testutil/testutil.go | 1 - 10 files changed, 119 insertions(+), 121 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 41af084b9..51d20d06d 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -192,9 +192,6 @@ type Config struct { // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool - // SpecFile is the file containing the OCI spec. - SpecFile string - // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 42e011beb..ea825e571 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -103,9 +103,14 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { - return nil, fmt.Errorf("failed to configure mounts: %v", err) + root := mns.Root() + defer root.DecRef() + for _, m := range mounts { + if err := mountSubmount(rootCtx, conf, mns, root, fds, m, mounts); err != nil { + return nil, fmt.Errorf("mount submount: %v", err) + } } + if !fds.empty() { return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) } @@ -184,17 +189,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount { return mounts } -// setMounts iterates over mounts and mounts them in the specified -// mount namespace. -func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { - for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil { - return err - } - } - return nil -} - // createRootMount creates the root filesystem. func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. @@ -207,9 +201,9 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f fd := fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) - hostFS := mustFindFilesystem("9p") + p9FS := mustFindFilesystem("9p") opts := p9MountOptions(fd, conf.FileAccess) - rootInode, err = hostFS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) + rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } @@ -294,7 +288,11 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, opts, useOverlay, err } -func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error { +// mountSubmount mounts volumes inside the container's root. Because mounts may +// be readonly, a lower ramfs overlay is added to create the mount point dir. +// Another overlay is added with tmpfs on top if Config.Overlay is true. +// 'm.Destination' must be an absolute path with '..' 
and symlinks resolved. +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -340,60 +338,16 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd } } - // Create destination in case it doesn't exist. This is required, in addition - // to 'addSubmountOverlay', in case there are symlinks to create directories - // in the right location, e.g. - // mount: /var/run/secrets, may be created in '/run/secrets' if - // '/var/run' => '/var'. - if err := mkdirAll(ctx, mns, dest); err != nil { - return err - } - - root := mns.Root() - defer root.DecRef() - dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals) + dirent, err := mns.FindInode(ctx, root, root, m.Destination, 0 /* maxTraversals */) if err != nil { - return fmt.Errorf("failed to find mount destination %q: %v", dest, err) + return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", dest, err) + return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) } - log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type) - return nil -} - -func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { - log.Infof("mkdirAll called with path %s", path) - root := mns.Root() - defer root.DecRef() - - // Starting at the root, walk the path. - parent := root - ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) - for _, pathElem := range ps { - if pathElem == "" { - // This will be case for the first and last element, if the path - // begins or ends with '/'. Note that we always treat the path as - // absolute, regardless of what the first character contains. - continue - } - d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit) - if err == syserror.ENOENT { - // If we encounter a path that does not exist, then - // create it. - if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("failed to create directory %q: %v", pathElem, err) - } - if d, err = parent.Walk(ctx, root, pathElem); err != nil { - return fmt.Errorf("walk to %q failed: %v", pathElem, err) - } - } else if err != nil { - return fmt.Errorf("failed to find inode %q: %v", pathElem, err) - } - parent = d - } + log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) return nil } @@ -437,14 +391,6 @@ func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, erro return out, nil } -func destinations(mounts []specs.Mount, extra ...string) []string { - var ds []string - for _, m := range mounts { - ds = append(ds, m.Destination) - } - return append(ds, extra...) -} - // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { @@ -544,7 +490,8 @@ func mustFindFilesystem(name string) fs.Filesystem { func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { // There is no real filesystem backing this ramfs tree, so we pass in // "nil" here. 
- mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), submounts) + msrc := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts) if err != nil { return nil, fmt.Errorf("error creating mount tree: %v", err) } @@ -608,12 +555,16 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf // namespace. mns := k.RootMountNamespace() if mns == nil { + // Setup the root container. + // Create the virtual filesystem. mns, err := createMountNamespace(ctx, rootCtx, spec, conf, goferFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } k.SetRootMountNamespace(mns) + + // We're done with root container. return nil } @@ -627,42 +578,48 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf return fmt.Errorf("error creating filesystem for container: %v", err) } - // Make directories for submounts within the container. - rootDir := mns.Root() - defer rootDir.DecRef() - containerRoot := filepath.Join(ChildContainersDir, cid) - mkdirAll(ctx, mns, containerRoot) + globalRoot := mns.Root() + defer globalRoot.DecRef() - // Mount the container's root filesystem to the newly created - // mount point. - containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + // Create mount point for the container's rootfs. + contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, 0 /* TraversalLimit */) if err != nil { - return fmt.Errorf("failed to find mount destination: %q: %v", containerRoot, err) + return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err) } - if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err) + if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("create directory %q: %v", cid, err) + } + containerRoot, err := contDir.Walk(ctx, globalRoot, cid) + if err != nil { + return fmt.Errorf("walk to %q failed: %v", cid, err) + } + defer containerRoot.DecRef() + + // Mount the container's root filesystem to the newly created mount point. + if err := mns.Mount(ctx, containerRoot, rootInode); err != nil { + return fmt.Errorf("mount container root: %v", err) } - containerRootDirent.DecRef() // We have to re-walk to the dirent to find the mounted // directory. The old dirent is invalid at this point. - containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + containerRoot, err = contDir.Walk(ctx, globalRoot, cid) if err != nil { - return fmt.Errorf("failed to find mount destination2: %q: %v", containerRoot, err) + return fmt.Errorf("find container mount point %q: %v", cid, err) } - log.Infof("Mounted child's root fs to %q", containerRoot) + + log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid)) // Mount all submounts. mounts := compileMounts(spec) for _, m := range mounts { - dest := filepath.Join(containerRoot, m.Destination) - if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil { + if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), containerRoot, fds, m, mounts); err != nil { + containerRoot.DecRef() return fmt.Errorf("error mounting filesystem for container: %v", err) } } // Set the procArgs root directory. 
- procArgs.Root = containerRootDirent + procArgs.Root = containerRoot return nil } @@ -686,7 +643,7 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error mnsRoot := mns.Root() defer mnsRoot.DecRef() containerRoot := path.Join(ChildContainersDir, cid) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) + containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, 0 /* maxTraversals */) if err == syserror.ENOENT { // Container must have been destroyed already. That's fine. return nil @@ -720,7 +677,7 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // Get a reference to the parent directory and remove the root // container directory. - containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) + containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, 0 /* maxTraversals */) if err != nil { return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index ecd76ee93..275a96f57 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -15,8 +15,6 @@ package cmd import ( - "path/filepath" - "context" "flag" "github.com/google/subcommands" @@ -93,7 +91,6 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} Fatalf("error reading spec: %v", err) } specutils.LogSpec(spec) - conf.SpecFile = filepath.Join(bundleDir, "config.json") // Create the container. A new sandbox will be created for the // container unless the metadata specifies that it should be run in an diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 826e6e875..9a87cf240 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -15,7 +15,6 @@ package cmd import ( - "path/filepath" "syscall" "context" @@ -73,7 +72,6 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s Fatalf("error reading spec: %v", err) } specutils.LogSpec(spec) - conf.SpecFile = filepath.Join(bundleDir, "config.json") ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog) if err != nil { diff --git a/runsc/container/container.go b/runsc/container/container.go index 0ec4d03c1..f76bad1aa 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -271,6 +271,19 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // init container in the sandbox. if specutils.ShouldCreateSandbox(spec) { log.Debugf("Creating new sandbox for container %q", id) + + // Setup rootfs and mounts. It returns a new mount list with destination + // paths resolved. Since the spec for the root container is read from disk, + // Write the new spec to a new file that will be used by the sandbox. + cleanMounts, err := setupFS(spec, conf, bundleDir) + if err != nil { + return nil, fmt.Errorf("setup mounts: %v", err) + } + spec.Mounts = cleanMounts + if err := specutils.WriteCleanSpec(bundleDir, spec); err != nil { + return nil, fmt.Errorf("writing clean spec: %v", err) + } + ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) if err != nil { return nil, err @@ -351,6 +364,15 @@ func (c *Container) Start(conf *boot.Config) error { return err } } else { + // Setup rootfs and mounts. It returns a new mount list with destination + // paths resolved. Replace the original spec with new mount list and start + // container. 
+ cleanMounts, err := setupFS(c.Spec, conf, c.BundleDir) + if err != nil { + return fmt.Errorf("setup mounts: %v", err) + } + c.Spec.Mounts = cleanMounts + // Create the gofer process. ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) if err != nil { @@ -691,10 +713,6 @@ func (c *Container) waitForStopped() error { } func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { - if err := setupFS(spec, conf, bundleDir); err != nil { - return nil, fmt.Errorf("failed to setup mounts: %v", err) - } - // Start with the general config flags. args := conf.ToFlags() args = append(args, "gofer", "--bundle", bundleDir) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index e2bb7d8ec..662591b3b 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -458,7 +458,7 @@ func TestAppExitStatus(t *testing.T) { defer os.RemoveAll(rootDir2) defer os.RemoveAll(bundleDir2) - ws, err = Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir2, "", "", "") + ws, err = Run(testutil.UniqueContainerID(), errSpec, conf, bundleDir2, "", "", "") if err != nil { t.Fatalf("error running container: %v", err) } diff --git a/runsc/container/fs.go b/runsc/container/fs.go index 59edd9488..2ed42fd93 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -73,9 +73,13 @@ var optionsMap = map[string]mapping{ // This allows the gofer serving the containers to be chroot under this // directory to create an extra layer to security in case the gofer gets // compromised. -func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { +// Returns list of mounts equivalent to 'spec.Mounts' with all destination paths +// cleaned and with symlinks resolved. +func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mount, error) { + rv := make([]specs.Mount, 0, len(spec.Mounts)) for _, m := range spec.Mounts { if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + rv = append(rv, m) continue } @@ -83,39 +87,47 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { // container. dst, err := resolveSymlinks(spec.Root.Path, m.Destination) if err != nil { - return fmt.Errorf("failed to resolve symlinks: %v", err) + return nil, fmt.Errorf("failed to resolve symlinks: %v", err) } flags := optionsToFlags(m.Options) flags |= syscall.MS_BIND log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { - return fmt.Errorf("failed to mount %v: %v", m, err) + return nil, fmt.Errorf("failed to mount %v: %v", m, err) } // Make the mount a slave, so that for recursive bind mount, umount won't // propagate to the source. flags = syscall.MS_SLAVE | syscall.MS_REC if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { - return fmt.Errorf("failed to rslave mount dst: %q, flags: %#x, err: %v", dst, flags, err) + return nil, fmt.Errorf("failed to rslave mount dst: %q, flags: %#x, err: %v", dst, flags, err) } + + cpy := m + relDst, err := filepath.Rel(spec.Root.Path, dst) + if err != nil { + panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, spec.Root.Path, err)) + } + cpy.Destination = filepath.Join("/", relDst) + rv = append(rv, cpy) } // If root is read only, check if it needs to be remounted as readonly. 
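setupFS above prepares each bind mount on the host with plain mount(2) calls before the sandbox starts, then marks it as a recursive slave so that unmounts inside the tree cannot propagate back to the source. A condensed sketch of that sequence (requires root; the paths in main are illustrative):

package main

import (
	"fmt"
	"syscall"
)

// bindMount bind-mounts src onto dst and then marks dst as a recursive slave
// mount so that a later unmount under dst does not propagate to the source.
func bindMount(src, dst string) error {
	if err := syscall.Mount(src, dst, "bind", syscall.MS_BIND, ""); err != nil {
		return fmt.Errorf("bind mounting %q on %q: %v", src, dst, err)
	}
	if err := syscall.Mount("", dst, "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("making %q a recursive slave: %v", dst, err)
	}
	return nil
}

func main() {
	// Illustrative paths: both must exist and the caller must be root.
	if err := bindMount("/tmp/src", "/tmp/rootfs/mnt"); err != nil {
		fmt.Println(err)
	}
}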
if spec.Root.Readonly { isMountPoint, readonly, err := mountInfo(spec.Root.Path) if err != nil { - return err + return nil, err } if readonly { - return nil + return rv, nil } if !isMountPoint { // Readonly root is not a mount point nor read-only. Can't do much other // than just logging a warning. The gofer will prevent files to be open // in write mode. log.Warningf("Mount where root is located is not read-only and cannot be changed: %q", spec.Root.Path) - return nil + return rv, nil } // If root is a mount point but not read-only, we can change mount options @@ -124,10 +136,10 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) src := spec.Root.Path if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { - return fmt.Errorf("failed to remount root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) + return nil, fmt.Errorf("failed to remount root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) } } - return nil + return rv, nil } // mountInfo returns whether the path is a mount point and whether the mount diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 6dc8cf7f0..923a52f7f 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -321,12 +321,9 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ // Open the spec file to donate to the sandbox. - if conf.SpecFile == "" { - return fmt.Errorf("conf.SpecFile must be set") - } - specFile, err := os.Open(conf.SpecFile) + specFile, err := specutils.OpenCleanSpec(bundleDir) if err != nil { - return fmt.Errorf("error opening spec file %q: %v", conf.SpecFile, err) + return fmt.Errorf("opening spec file: %v", err) } defer specFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, specFile) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 6b3e52021..b29802fde 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -170,6 +170,29 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) return &spec, nil } +// OpenCleanSpec opens spec file that has destination mount paths resolved to +// their absolute location. +func OpenCleanSpec(bundleDir string) (*os.File, error) { + f, err := os.Open(filepath.Join(bundleDir, "config.clean.json")) + if err != nil { + return nil, err + } + if _, err := f.Seek(0, os.SEEK_SET); err != nil { + f.Close() + return nil, fmt.Errorf("error seeking to beginning of file %q: %v", f.Name(), err) + } + return f, nil +} + +// WriteCleanSpec writes a spec file that has destination mount paths resolved. +func WriteCleanSpec(bundleDir string, spec *specs.Spec) error { + bytes, err := json.Marshal(spec) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(bundleDir, "config.clean.json"), bytes, 0755) +} + // Capabilities takes in spec and returns a TaskCapabilities corresponding to // the spec. 
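The clean spec is simply the in-memory spec serialized back to JSON next to the original config.json, so the sandbox reads exactly the mount list that the host already resolved. A hedged sketch of that round trip, using a trimmed-down stand-in for specs.Spec (the real helpers above use the runtime-spec types):

package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
)

// miniSpec is a trimmed-down stand-in for specs.Spec with only the fields
// that matter for this sketch.
type miniSpec struct {
	Mounts []miniMount `json:"mounts"`
}

type miniMount struct {
	Destination string `json:"destination"`
	Source      string `json:"source"`
	Type        string `json:"type"`
}

func writeCleanSpec(bundleDir string, s *miniSpec) error {
	b, err := json.Marshal(s)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(filepath.Join(bundleDir, "config.clean.json"), b, 0755)
}

func readCleanSpec(bundleDir string) (*miniSpec, error) {
	f, err := os.Open(filepath.Join(bundleDir, "config.clean.json"))
	if err != nil {
		return nil, err
	}
	defer f.Close()
	var s miniSpec
	if err := json.NewDecoder(f).Decode(&s); err != nil {
		return nil, err
	}
	return &s, nil
}

func main() {
	dir, err := ioutil.TempDir("", "bundle")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	in := &miniSpec{Mounts: []miniMount{{Destination: "/run/secrets", Source: "/tmp/secrets", Type: "bind"}}}
	if err := writeCleanSpec(dir, in); err != nil {
		panic(err)
	}
	out, err := readCleanSpec(dir)
	fmt.Println(out.Mounts[0].Destination, err)
}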
func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index b4664995c..4d7ac3bc9 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -179,7 +179,6 @@ func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) ( } conf.RootDir = rootDir - conf.SpecFile = filepath.Join(bundleDir, "config.json") return bundleDir, nil } -- cgit v1.2.3 From 8fce67af24945f82378b4c2731cca1788936d074 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 19 Oct 2018 16:34:09 -0700 Subject: Use correct company name in copyright header PiperOrigin-RevId: 217951017 Change-Id: Ie08bf6987f98467d07457bcf35b5f1ff6e43c035 --- kokoro/run_build.sh | 2 +- kokoro/run_tests.sh | 2 +- pkg/abi/abi.go | 2 +- pkg/abi/abi_linux.go | 2 +- pkg/abi/flag.go | 2 +- pkg/abi/linux/aio.go | 2 +- pkg/abi/linux/ashmem.go | 2 +- pkg/abi/linux/binder.go | 2 +- pkg/abi/linux/bpf.go | 2 +- pkg/abi/linux/capability.go | 2 +- pkg/abi/linux/dev.go | 2 +- pkg/abi/linux/elf.go | 2 +- pkg/abi/linux/errors.go | 2 +- pkg/abi/linux/eventfd.go | 2 +- pkg/abi/linux/exec.go | 2 +- pkg/abi/linux/fcntl.go | 2 +- pkg/abi/linux/file.go | 2 +- pkg/abi/linux/fs.go | 2 +- pkg/abi/linux/futex.go | 2 +- pkg/abi/linux/inotify.go | 2 +- pkg/abi/linux/ioctl.go | 2 +- pkg/abi/linux/ip.go | 2 +- pkg/abi/linux/ipc.go | 2 +- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/linux.go | 2 +- pkg/abi/linux/mm.go | 2 +- pkg/abi/linux/netdevice.go | 2 +- pkg/abi/linux/netlink.go | 2 +- pkg/abi/linux/netlink_route.go | 2 +- pkg/abi/linux/poll.go | 2 +- pkg/abi/linux/prctl.go | 2 +- pkg/abi/linux/ptrace.go | 2 +- pkg/abi/linux/rusage.go | 2 +- pkg/abi/linux/sched.go | 2 +- pkg/abi/linux/seccomp.go | 2 +- pkg/abi/linux/sem.go | 2 +- pkg/abi/linux/shm.go | 2 +- pkg/abi/linux/signal.go | 2 +- pkg/abi/linux/socket.go | 2 +- pkg/abi/linux/time.go | 2 +- pkg/abi/linux/timer.go | 2 +- pkg/abi/linux/tty.go | 2 +- pkg/abi/linux/uio.go | 2 +- pkg/abi/linux/utsname.go | 2 +- pkg/amutex/amutex.go | 2 +- pkg/amutex/amutex_test.go | 2 +- pkg/atomicbitops/atomic_bitops.go | 2 +- pkg/atomicbitops/atomic_bitops_amd64.s | 2 +- pkg/atomicbitops/atomic_bitops_common.go | 2 +- pkg/atomicbitops/atomic_bitops_test.go | 2 +- pkg/binary/binary.go | 2 +- pkg/binary/binary_test.go | 2 +- pkg/bits/bits.go | 2 +- pkg/bits/bits_template.go | 2 +- pkg/bits/uint64_arch_amd64.go | 2 +- pkg/bits/uint64_arch_amd64_asm.s | 2 +- pkg/bits/uint64_arch_generic.go | 2 +- pkg/bits/uint64_test.go | 2 +- pkg/bpf/bpf.go | 2 +- pkg/bpf/decoder.go | 2 +- pkg/bpf/decoder_test.go | 2 +- pkg/bpf/input_bytes.go | 2 +- pkg/bpf/interpreter.go | 2 +- pkg/bpf/interpreter_test.go | 2 +- pkg/bpf/program_builder.go | 2 +- pkg/bpf/program_builder_test.go | 2 +- pkg/compressio/compressio.go | 2 +- pkg/compressio/compressio_test.go | 2 +- pkg/control/client/client.go | 2 +- pkg/control/server/server.go | 2 +- pkg/cpuid/cpu_amd64.s | 2 +- pkg/cpuid/cpuid.go | 2 +- pkg/cpuid/cpuid_parse_test.go | 2 +- pkg/cpuid/cpuid_test.go | 2 +- pkg/dhcp/client.go | 2 +- pkg/dhcp/dhcp.go | 2 +- pkg/dhcp/dhcp_string.go | 2 +- pkg/dhcp/dhcp_test.go | 2 +- pkg/dhcp/server.go | 2 +- pkg/eventchannel/event.go | 2 +- pkg/eventchannel/event.proto | 2 +- pkg/fd/fd.go | 2 +- pkg/fd/fd_test.go | 2 +- pkg/gate/gate.go | 2 +- pkg/gate/gate_test.go | 2 +- pkg/ilist/list.go | 2 +- pkg/ilist/list_test.go | 2 +- pkg/linewriter/linewriter.go | 2 +- pkg/linewriter/linewriter_test.go | 2 +- pkg/log/glog.go | 2 +- 
pkg/log/glog_unsafe.go | 2 +- pkg/log/json.go | 2 +- pkg/log/json_test.go | 2 +- pkg/log/log.go | 2 +- pkg/log/log_test.go | 2 +- pkg/metric/metric.go | 2 +- pkg/metric/metric.proto | 2 +- pkg/metric/metric_test.go | 2 +- pkg/p9/buffer.go | 2 +- pkg/p9/client.go | 2 +- pkg/p9/client_file.go | 2 +- pkg/p9/client_test.go | 2 +- pkg/p9/file.go | 2 +- pkg/p9/handlers.go | 2 +- pkg/p9/local_server/local_server.go | 2 +- pkg/p9/messages.go | 2 +- pkg/p9/messages_test.go | 2 +- pkg/p9/p9.go | 2 +- pkg/p9/p9_test.go | 2 +- pkg/p9/p9test/client_test.go | 2 +- pkg/p9/p9test/mocks.go | 2 +- pkg/p9/pool.go | 2 +- pkg/p9/pool_test.go | 2 +- pkg/p9/server.go | 2 +- pkg/p9/transport.go | 2 +- pkg/p9/transport_test.go | 2 +- pkg/p9/version.go | 2 +- pkg/p9/version_test.go | 2 +- pkg/rand/rand.go | 2 +- pkg/rand/rand_linux.go | 2 +- pkg/refs/refcounter.go | 2 +- pkg/refs/refcounter_state.go | 2 +- pkg/refs/refcounter_test.go | 2 +- pkg/seccomp/seccomp.go | 2 +- pkg/seccomp/seccomp_rules.go | 2 +- pkg/seccomp/seccomp_test.go | 2 +- pkg/seccomp/seccomp_test_victim.go | 2 +- pkg/seccomp/seccomp_unsafe.go | 2 +- pkg/secio/full_reader.go | 2 +- pkg/secio/secio.go | 2 +- pkg/secio/secio_test.go | 2 +- pkg/segment/range.go | 2 +- pkg/segment/set.go | 2 +- pkg/segment/set_state.go | 2 +- pkg/segment/test/segment_test.go | 2 +- pkg/segment/test/set_functions.go | 2 +- pkg/sentry/arch/aligned.go | 2 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_amd64.go | 2 +- pkg/sentry/arch/arch_amd64.s | 2 +- pkg/sentry/arch/arch_state_x86.go | 2 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/auxv.go | 2 +- pkg/sentry/arch/registers.proto | 2 +- pkg/sentry/arch/signal_act.go | 2 +- pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/arch/signal_info.go | 2 +- pkg/sentry/arch/signal_stack.go | 2 +- pkg/sentry/arch/stack.go | 2 +- pkg/sentry/arch/syscalls_amd64.go | 2 +- pkg/sentry/context/context.go | 2 +- pkg/sentry/context/contexttest/contexttest.go | 2 +- pkg/sentry/control/control.go | 2 +- pkg/sentry/control/proc.go | 2 +- pkg/sentry/control/proc_test.go | 2 +- pkg/sentry/control/state.go | 2 +- pkg/sentry/device/device.go | 2 +- pkg/sentry/device/device_test.go | 2 +- pkg/sentry/fs/anon/anon.go | 2 +- pkg/sentry/fs/anon/device.go | 2 +- pkg/sentry/fs/ashmem/area.go | 2 +- pkg/sentry/fs/ashmem/device.go | 2 +- pkg/sentry/fs/ashmem/pin_board.go | 2 +- pkg/sentry/fs/ashmem/pin_board_test.go | 2 +- pkg/sentry/fs/attr.go | 2 +- pkg/sentry/fs/binder/binder.go | 2 +- pkg/sentry/fs/context.go | 2 +- pkg/sentry/fs/copy_up.go | 2 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dentry.go | 2 +- pkg/sentry/fs/dev/dev.go | 2 +- pkg/sentry/fs/dev/device.go | 2 +- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/dev/full.go | 2 +- pkg/sentry/fs/dev/null.go | 2 +- pkg/sentry/fs/dev/random.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_cache.go | 2 +- pkg/sentry/fs/dirent_cache_test.go | 2 +- pkg/sentry/fs/dirent_refs_test.go | 2 +- pkg/sentry/fs/dirent_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 2 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 2 +- pkg/sentry/fs/file_state.go | 2 +- pkg/sentry/fs/file_test.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/filetest/filetest.go | 2 +- pkg/sentry/fs/flags.go | 2 +- 
pkg/sentry/fs/fs.go | 2 +- pkg/sentry/fs/fsutil/dirty_set.go | 2 +- pkg/sentry/fs/fsutil/dirty_set_test.go | 2 +- pkg/sentry/fs/fsutil/file.go | 2 +- pkg/sentry/fs/fsutil/file_range_set.go | 2 +- pkg/sentry/fs/fsutil/frame_ref_set.go | 2 +- pkg/sentry/fs/fsutil/fsutil.go | 2 +- pkg/sentry/fs/fsutil/handle.go | 2 +- pkg/sentry/fs/fsutil/handle_test.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_state.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 2 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- pkg/sentry/fs/fsutil/inode_cached_test.go | 2 +- pkg/sentry/fs/gofer/attr.go | 2 +- pkg/sentry/fs/gofer/cache_policy.go | 2 +- pkg/sentry/fs/gofer/context_file.go | 2 +- pkg/sentry/fs/gofer/device.go | 2 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/gofer_test.go | 2 +- pkg/sentry/fs/gofer/handles.go | 2 +- pkg/sentry/fs/gofer/inode.go | 2 +- pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/gofer/socket.go | 2 +- pkg/sentry/fs/gofer/util.go | 2 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/descriptor.go | 2 +- pkg/sentry/fs/host/descriptor_state.go | 2 +- pkg/sentry/fs/host/descriptor_test.go | 2 +- pkg/sentry/fs/host/device.go | 2 +- pkg/sentry/fs/host/file.go | 2 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 2 +- pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/host/inode_state.go | 2 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/ioctl_unsafe.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/socket_iovec.go | 2 +- pkg/sentry/fs/host/socket_state.go | 2 +- pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/fs/host/socket_unsafe.go | 2 +- pkg/sentry/fs/host/tty.go | 2 +- pkg/sentry/fs/host/util.go | 2 +- pkg/sentry/fs/host/util_unsafe.go | 2 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_inotify.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 2 +- pkg/sentry/fs/inode_overlay_test.go | 2 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/fs/inotify_event.go | 2 +- pkg/sentry/fs/inotify_watch.go | 2 +- pkg/sentry/fs/lock/lock.go | 2 +- pkg/sentry/fs/lock/lock_range_test.go | 2 +- pkg/sentry/fs/lock/lock_set_functions.go | 2 +- pkg/sentry/fs/lock/lock_test.go | 2 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 2 +- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/mount_state.go | 2 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/offset.go | 2 +- pkg/sentry/fs/overlay.go | 2 +- pkg/sentry/fs/path.go | 2 +- pkg/sentry/fs/path_test.go | 2 +- pkg/sentry/fs/proc/cpuinfo.go | 2 +- pkg/sentry/fs/proc/device/device.go | 2 +- pkg/sentry/fs/proc/exec_args.go | 2 +- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/file.go | 2 +- pkg/sentry/fs/proc/filesystems.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 2 +- pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 2 +- pkg/sentry/fs/proc/net_test.go | 2 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/rpcinet_proc.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 2 +- pkg/sentry/fs/proc/stat.go | 2 +- 
pkg/sentry/fs/proc/sys.go | 2 +- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/proc/sys_net_test.go | 2 +- pkg/sentry/fs/proc/task.go | 2 +- pkg/sentry/fs/proc/uid_gid_map.go | 2 +- pkg/sentry/fs/proc/uptime.go | 2 +- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/ramfs/file.go | 2 +- pkg/sentry/fs/ramfs/ramfs.go | 2 +- pkg/sentry/fs/ramfs/socket.go | 2 +- pkg/sentry/fs/ramfs/symlink.go | 2 +- pkg/sentry/fs/ramfs/test/test.go | 2 +- pkg/sentry/fs/ramfs/tree.go | 2 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/restore.go | 2 +- pkg/sentry/fs/save.go | 2 +- pkg/sentry/fs/seek.go | 2 +- pkg/sentry/fs/sync.go | 2 +- pkg/sentry/fs/sys/device.go | 2 +- pkg/sentry/fs/sys/devices.go | 2 +- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/sys/sys.go | 2 +- pkg/sentry/fs/timerfd/timerfd.go | 2 +- pkg/sentry/fs/tmpfs/device.go | 2 +- pkg/sentry/fs/tmpfs/file_regular.go | 2 +- pkg/sentry/fs/tmpfs/file_test.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/fs/tty/dir.go | 2 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/inode.go | 2 +- pkg/sentry/fs/tty/line_discipline.go | 2 +- pkg/sentry/fs/tty/master.go | 2 +- pkg/sentry/fs/tty/queue.go | 2 +- pkg/sentry/fs/tty/slave.go | 2 +- pkg/sentry/fs/tty/terminal.go | 2 +- pkg/sentry/fs/tty/tty_test.go | 2 +- pkg/sentry/hostcpu/getcpu_amd64.s | 2 +- pkg/sentry/hostcpu/hostcpu.go | 2 +- pkg/sentry/hostcpu/hostcpu_test.go | 2 +- pkg/sentry/inet/context.go | 2 +- pkg/sentry/inet/inet.go | 2 +- pkg/sentry/inet/test_stack.go | 2 +- pkg/sentry/kernel/abstract_socket_namespace.go | 2 +- pkg/sentry/kernel/auth/auth.go | 2 +- pkg/sentry/kernel/auth/capability_set.go | 2 +- pkg/sentry/kernel/auth/context.go | 2 +- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/auth/id.go | 2 +- pkg/sentry/kernel/auth/id_map.go | 2 +- pkg/sentry/kernel/auth/id_map_functions.go | 2 +- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/context.go | 2 +- pkg/sentry/kernel/epoll/epoll.go | 2 +- pkg/sentry/kernel/epoll/epoll_state.go | 2 +- pkg/sentry/kernel/epoll/epoll_test.go | 2 +- pkg/sentry/kernel/eventfd/eventfd.go | 2 +- pkg/sentry/kernel/eventfd/eventfd_test.go | 2 +- pkg/sentry/kernel/fasync/fasync.go | 2 +- pkg/sentry/kernel/fd_map.go | 2 +- pkg/sentry/kernel/fd_map_test.go | 2 +- pkg/sentry/kernel/fs_context.go | 2 +- pkg/sentry/kernel/futex/futex.go | 2 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/ipc_namespace.go | 2 +- pkg/sentry/kernel/kdefs/kdefs.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/kernel_state.go | 2 +- pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/memevent/memory_events.proto | 2 +- pkg/sentry/kernel/pending_signals.go | 2 +- pkg/sentry/kernel/pending_signals_state.go | 2 +- pkg/sentry/kernel/pipe/buffers.go | 2 +- pkg/sentry/kernel/pipe/device.go | 2 +- pkg/sentry/kernel/pipe/node.go | 2 +- pkg/sentry/kernel/pipe/node_test.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_test.go | 2 +- pkg/sentry/kernel/pipe/reader.go | 2 +- pkg/sentry/kernel/pipe/reader_writer.go | 2 +- pkg/sentry/kernel/pipe/writer.go | 2 +- pkg/sentry/kernel/posixtimer.go | 2 +- pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/sched/cpuset.go | 2 +- pkg/sentry/kernel/sched/cpuset_test.go | 2 +- pkg/sentry/kernel/sched/sched.go | 2 +- pkg/sentry/kernel/seccomp.go | 2 +- 
pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/semaphore/semaphore_test.go | 2 +- pkg/sentry/kernel/sessions.go | 2 +- pkg/sentry/kernel/shm/device.go | 2 +- pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/signal.go | 2 +- pkg/sentry/kernel/signal_handlers.go | 2 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/syscalls_state.go | 2 +- pkg/sentry/kernel/syslog.go | 2 +- pkg/sentry/kernel/table_test.go | 2 +- pkg/sentry/kernel/task.go | 2 +- pkg/sentry/kernel/task_acct.go | 2 +- pkg/sentry/kernel/task_block.go | 2 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/kernel/task_futex.go | 2 +- pkg/sentry/kernel/task_identity.go | 2 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_net.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_sched.go | 2 +- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_start.go | 2 +- pkg/sentry/kernel/task_stop.go | 2 +- pkg/sentry/kernel/task_syscall.go | 2 +- pkg/sentry/kernel/task_test.go | 2 +- pkg/sentry/kernel/task_usermem.go | 2 +- pkg/sentry/kernel/thread_group.go | 2 +- pkg/sentry/kernel/threads.go | 2 +- pkg/sentry/kernel/time/context.go | 2 +- pkg/sentry/kernel/time/time.go | 2 +- pkg/sentry/kernel/timekeeper.go | 2 +- pkg/sentry/kernel/timekeeper_state.go | 2 +- pkg/sentry/kernel/timekeeper_test.go | 2 +- pkg/sentry/kernel/uts_namespace.go | 2 +- pkg/sentry/kernel/vdso.go | 2 +- pkg/sentry/kernel/version.go | 2 +- pkg/sentry/limits/context.go | 2 +- pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/limits_test.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/loader/elf.go | 2 +- pkg/sentry/loader/interpreter.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/loader/vdso.go | 2 +- pkg/sentry/loader/vdso_state.go | 2 +- pkg/sentry/memmap/mapping_set.go | 2 +- pkg/sentry/memmap/mapping_set_test.go | 2 +- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/memutil/memutil.go | 2 +- pkg/sentry/memutil/memutil_unsafe.go | 2 +- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/aio_context.go | 2 +- pkg/sentry/mm/aio_context_state.go | 2 +- pkg/sentry/mm/debug.go | 2 +- pkg/sentry/mm/io.go | 2 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/metadata.go | 2 +- pkg/sentry/mm/mm.go | 2 +- pkg/sentry/mm/mm_test.go | 2 +- pkg/sentry/mm/pma.go | 2 +- pkg/sentry/mm/proc_pid_maps.go | 2 +- pkg/sentry/mm/save_restore.go | 2 +- pkg/sentry/mm/shm.go | 2 +- pkg/sentry/mm/special_mappable.go | 2 +- pkg/sentry/mm/syscalls.go | 2 +- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/platform/context.go | 2 +- pkg/sentry/platform/filemem/filemem.go | 2 +- pkg/sentry/platform/filemem/filemem_state.go | 2 +- pkg/sentry/platform/filemem/filemem_test.go | 2 +- pkg/sentry/platform/filemem/filemem_unsafe.go | 2 +- pkg/sentry/platform/interrupt/interrupt.go | 2 +- pkg/sentry/platform/interrupt/interrupt_test.go | 2 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/allocator.go | 2 +- pkg/sentry/platform/kvm/bluepill.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.s | 2 +- pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/bluepill_fault.go | 2 +- pkg/sentry/platform/kvm/bluepill_unsafe.go | 2 +- pkg/sentry/platform/kvm/context.go | 2 +- pkg/sentry/platform/kvm/host_map.go | 2 +- pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64.go | 2 +- 
pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/kvm_const.go | 2 +- pkg/sentry/platform/kvm/kvm_test.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/kvm/machine_amd64.go | 2 +- pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/machine_unsafe.go | 2 +- pkg/sentry/platform/kvm/physical_map.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.s | 2 +- pkg/sentry/platform/kvm/virtual_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map_test.go | 2 +- pkg/sentry/platform/mmap_min_addr.go | 2 +- pkg/sentry/platform/platform.go | 2 +- pkg/sentry/platform/procid/procid.go | 2 +- pkg/sentry/platform/procid/procid_amd64.s | 2 +- pkg/sentry/platform/procid/procid_net_test.go | 2 +- pkg/sentry/platform/procid/procid_test.go | 2 +- pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 +- pkg/sentry/platform/ptrace/stub_amd64.s | 2 +- pkg/sentry/platform/ptrace/stub_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ptrace/subprocess_amd64.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess_unsafe.go | 2 +- pkg/sentry/platform/ring0/defs.go | 2 +- pkg/sentry/platform/ring0/defs_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.s | 2 +- pkg/sentry/platform/ring0/gen_offsets/main.go | 2 +- pkg/sentry/platform/ring0/kernel.go | 2 +- pkg/sentry/platform/ring0/kernel_amd64.go | 2 +- pkg/sentry/platform/ring0/kernel_unsafe.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.s | 2 +- pkg/sentry/platform/ring0/offsets_amd64.go | 2 +- pkg/sentry/platform/ring0/pagetables/allocator.go | 2 +- pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/walker_amd64.go | 2 +- pkg/sentry/platform/ring0/ring0.go | 2 +- pkg/sentry/platform/ring0/x86.go | 2 +- pkg/sentry/platform/safecopy/atomic_amd64.s | 2 +- pkg/sentry/platform/safecopy/memclr_amd64.s | 2 +- pkg/sentry/platform/safecopy/memcpy_amd64.s | 2 +- pkg/sentry/platform/safecopy/safecopy.go | 2 +- pkg/sentry/platform/safecopy/safecopy_test.go | 2 +- pkg/sentry/platform/safecopy/safecopy_unsafe.go | 2 +- pkg/sentry/platform/safecopy/sighandler_amd64.s | 2 +- pkg/sentry/safemem/block_unsafe.go | 2 +- pkg/sentry/safemem/io.go | 2 +- pkg/sentry/safemem/io_test.go | 2 +- pkg/sentry/safemem/safemem.go | 2 +- pkg/sentry/safemem/seq_test.go | 2 +- pkg/sentry/safemem/seq_unsafe.go | 2 +- pkg/sentry/sighandling/sighandling.go | 2 +- pkg/sentry/sighandling/sighandling_unsafe.go | 2 +- pkg/sentry/socket/control/control.go | 2 +- pkg/sentry/socket/epsocket/device.go | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 2 +- pkg/sentry/socket/epsocket/provider.go | 2 +- pkg/sentry/socket/epsocket/save_restore.go | 2 +- pkg/sentry/socket/epsocket/stack.go | 2 +- pkg/sentry/socket/hostinet/device.go | 2 +- 
pkg/sentry/socket/hostinet/hostinet.go | 2 +- pkg/sentry/socket/hostinet/save_restore.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 2 +- pkg/sentry/socket/hostinet/socket_unsafe.go | 2 +- pkg/sentry/socket/hostinet/stack.go | 2 +- pkg/sentry/socket/netlink/message.go | 2 +- pkg/sentry/socket/netlink/port/port.go | 2 +- pkg/sentry/socket/netlink/port/port_test.go | 2 +- pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 2 +- pkg/sentry/socket/netlink/socket.go | 2 +- pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/device.go | 2 +- pkg/sentry/socket/rpcinet/notifier/notifier.go | 2 +- pkg/sentry/socket/rpcinet/rpcinet.go | 2 +- pkg/sentry/socket/rpcinet/socket.go | 2 +- pkg/sentry/socket/rpcinet/stack.go | 2 +- pkg/sentry/socket/rpcinet/stack_unsafe.go | 2 +- pkg/sentry/socket/socket.go | 2 +- pkg/sentry/socket/unix/device.go | 2 +- pkg/sentry/socket/unix/io.go | 2 +- pkg/sentry/socket/unix/transport/connectioned.go | 2 +- pkg/sentry/socket/unix/transport/connectioned_state.go | 2 +- pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/queue.go | 2 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/socket/unix/unix.go | 2 +- pkg/sentry/state/state.go | 2 +- pkg/sentry/state/state_metadata.go | 2 +- pkg/sentry/state/state_unsafe.go | 2 +- pkg/sentry/strace/clone.go | 2 +- pkg/sentry/strace/futex.go | 2 +- pkg/sentry/strace/linux64.go | 2 +- pkg/sentry/strace/open.go | 2 +- pkg/sentry/strace/ptrace.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/strace/strace.proto | 2 +- pkg/sentry/strace/syscalls.go | 2 +- pkg/sentry/syscalls/epoll.go | 2 +- pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/syscalls/linux/flags.go | 2 +- pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sigset.go | 2 +- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_capability.go | 2 +- pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 2 +- pkg/sentry/syscalls/linux/sys_futex.go | 2 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_identity.go | 2 +- pkg/sentry/syscalls/linux/sys_inotify.go | 2 +- pkg/sentry/syscalls/linux/sys_lseek.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 2 +- pkg/sentry/syscalls/linux/sys_prctl.go | 2 +- pkg/sentry/syscalls/linux/sys_random.go | 2 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_rlimit.go | 2 +- pkg/sentry/syscalls/linux/sys_rusage.go | 2 +- pkg/sentry/syscalls/linux/sys_sched.go | 2 +- pkg/sentry/syscalls/linux/sys_seccomp.go | 2 +- pkg/sentry/syscalls/linux/sys_sem.go | 2 +- pkg/sentry/syscalls/linux/sys_shm.go | 2 +- pkg/sentry/syscalls/linux/sys_signal.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 2 +- pkg/sentry/syscalls/linux/sys_sync.go | 2 +- pkg/sentry/syscalls/linux/sys_sysinfo.go | 2 +- pkg/sentry/syscalls/linux/sys_syslog.go | 2 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_time.go | 2 +- pkg/sentry/syscalls/linux/sys_timer.go | 2 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 2 +- pkg/sentry/syscalls/linux/sys_tls.go | 2 +- pkg/sentry/syscalls/linux/sys_utsname.go | 2 +- 
pkg/sentry/syscalls/linux/sys_write.go | 2 +- pkg/sentry/syscalls/linux/timespec.go | 2 +- pkg/sentry/syscalls/polling.go | 2 +- pkg/sentry/syscalls/syscalls.go | 2 +- pkg/sentry/syscalls/unimplemented_syscall.proto | 2 +- pkg/sentry/time/calibrated_clock.go | 2 +- pkg/sentry/time/calibrated_clock_test.go | 2 +- pkg/sentry/time/clock_id.go | 2 +- pkg/sentry/time/clocks.go | 2 +- pkg/sentry/time/muldiv_amd64.s | 2 +- pkg/sentry/time/parameters.go | 2 +- pkg/sentry/time/parameters_test.go | 2 +- pkg/sentry/time/sampler.go | 2 +- pkg/sentry/time/sampler_test.go | 2 +- pkg/sentry/time/sampler_unsafe.go | 2 +- pkg/sentry/time/tsc_amd64.s | 2 +- pkg/sentry/uniqueid/context.go | 2 +- pkg/sentry/usage/cpu.go | 2 +- pkg/sentry/usage/io.go | 2 +- pkg/sentry/usage/memory.go | 2 +- pkg/sentry/usage/memory_unsafe.go | 2 +- pkg/sentry/usage/usage.go | 2 +- pkg/sentry/usermem/access_type.go | 2 +- pkg/sentry/usermem/addr.go | 2 +- pkg/sentry/usermem/addr_range_seq_test.go | 2 +- pkg/sentry/usermem/addr_range_seq_unsafe.go | 2 +- pkg/sentry/usermem/bytes_io.go | 2 +- pkg/sentry/usermem/bytes_io_unsafe.go | 2 +- pkg/sentry/usermem/usermem.go | 2 +- pkg/sentry/usermem/usermem_test.go | 2 +- pkg/sentry/usermem/usermem_x86.go | 2 +- pkg/sentry/watchdog/watchdog.go | 2 +- pkg/sleep/commit_amd64.s | 2 +- pkg/sleep/commit_asm.go | 2 +- pkg/sleep/commit_noasm.go | 2 +- pkg/sleep/empty.s | 2 +- pkg/sleep/sleep_test.go | 2 +- pkg/sleep/sleep_unsafe.go | 2 +- pkg/state/decode.go | 2 +- pkg/state/encode.go | 2 +- pkg/state/encode_unsafe.go | 2 +- pkg/state/map.go | 2 +- pkg/state/object.proto | 2 +- pkg/state/printer.go | 2 +- pkg/state/state.go | 2 +- pkg/state/state_test.go | 2 +- pkg/state/statefile/statefile.go | 2 +- pkg/state/statefile/statefile_test.go | 2 +- pkg/state/stats.go | 2 +- pkg/sync/atomicptr_unsafe.go | 2 +- pkg/sync/atomicptrtest/atomicptr_test.go | 2 +- pkg/sync/memmove_unsafe.go | 2 +- pkg/sync/norace_unsafe.go | 2 +- pkg/sync/race_unsafe.go | 2 +- pkg/sync/seqatomic_unsafe.go | 2 +- pkg/sync/seqatomictest/seqatomic_test.go | 2 +- pkg/sync/seqcount.go | 2 +- pkg/sync/seqcount_test.go | 2 +- pkg/sync/sync.go | 2 +- pkg/syserr/host_linux.go | 2 +- pkg/syserr/netstack.go | 2 +- pkg/syserr/syserr.go | 2 +- pkg/syserror/syserror.go | 2 +- pkg/syserror/syserror_test.go | 2 +- pkg/tcpip/adapters/gonet/gonet.go | 2 +- pkg/tcpip/adapters/gonet/gonet_test.go | 2 +- pkg/tcpip/buffer/prependable.go | 2 +- pkg/tcpip/buffer/view.go | 2 +- pkg/tcpip/buffer/view_test.go | 2 +- pkg/tcpip/checker/checker.go | 2 +- pkg/tcpip/header/arp.go | 2 +- pkg/tcpip/header/checksum.go | 2 +- pkg/tcpip/header/eth.go | 2 +- pkg/tcpip/header/gue.go | 2 +- pkg/tcpip/header/icmpv4.go | 2 +- pkg/tcpip/header/icmpv6.go | 2 +- pkg/tcpip/header/interfaces.go | 2 +- pkg/tcpip/header/ipv4.go | 2 +- pkg/tcpip/header/ipv6.go | 2 +- pkg/tcpip/header/ipv6_fragment.go | 2 +- pkg/tcpip/header/ipversion_test.go | 2 +- pkg/tcpip/header/tcp.go | 2 +- pkg/tcpip/header/tcp_test.go | 2 +- pkg/tcpip/header/udp.go | 2 +- pkg/tcpip/link/channel/channel.go | 2 +- pkg/tcpip/link/fdbased/endpoint.go | 2 +- pkg/tcpip/link/fdbased/endpoint_test.go | 2 +- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_amd64.s | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go | 2 +- pkg/tcpip/link/rawfile/errors.go | 2 +- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe_test.go | 2 +- 
pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/rx.go | 2 +- pkg/tcpip/link/sharedmem/pipe/tx.go | 2 +- pkg/tcpip/link/sharedmem/queue/queue_test.go | 2 +- pkg/tcpip/link/sharedmem/queue/rx.go | 2 +- pkg/tcpip/link/sharedmem/queue/tx.go | 2 +- pkg/tcpip/link/sharedmem/rx.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_test.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/tx.go | 2 +- pkg/tcpip/link/sniffer/pcap.go | 2 +- pkg/tcpip/link/sniffer/sniffer.go | 2 +- pkg/tcpip/link/tun/tun_unsafe.go | 2 +- pkg/tcpip/link/waitable/waitable.go | 2 +- pkg/tcpip/link/waitable/waitable_test.go | 2 +- pkg/tcpip/network/arp/arp.go | 2 +- pkg/tcpip/network/arp/arp_test.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap_test.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation_test.go | 2 +- pkg/tcpip/network/fragmentation/reassembler.go | 2 +- pkg/tcpip/network/fragmentation/reassembler_test.go | 2 +- pkg/tcpip/network/hash/hash.go | 2 +- pkg/tcpip/network/ip_test.go | 2 +- pkg/tcpip/network/ipv4/icmp.go | 2 +- pkg/tcpip/network/ipv4/ipv4.go | 2 +- pkg/tcpip/network/ipv4/ipv4_test.go | 2 +- pkg/tcpip/network/ipv6/icmp.go | 2 +- pkg/tcpip/network/ipv6/icmp_test.go | 2 +- pkg/tcpip/network/ipv6/ipv6.go | 2 +- pkg/tcpip/ports/ports.go | 2 +- pkg/tcpip/ports/ports_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/tcpip/seqnum/seqnum.go | 2 +- pkg/tcpip/stack/linkaddrcache.go | 2 +- pkg/tcpip/stack/linkaddrcache_test.go | 2 +- pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/registration.go | 2 +- pkg/tcpip/stack/route.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/stack/stack_test.go | 2 +- pkg/tcpip/stack/transport_demuxer.go | 2 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/tcpip_test.go | 2 +- pkg/tcpip/time.s | 2 +- pkg/tcpip/time_unsafe.go | 2 +- pkg/tcpip/transport/ping/endpoint.go | 2 +- pkg/tcpip/transport/ping/endpoint_state.go | 2 +- pkg/tcpip/transport/ping/protocol.go | 2 +- pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/cubic.go | 2 +- pkg/tcpip/transport/tcp/dual_stack_test.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/rcv.go | 2 +- pkg/tcpip/transport/tcp/reno.go | 2 +- pkg/tcpip/transport/tcp/sack.go | 2 +- pkg/tcpip/transport/tcp/segment.go | 2 +- pkg/tcpip/transport/tcp/segment_heap.go | 2 +- pkg/tcpip/transport/tcp/segment_queue.go | 2 +- pkg/tcpip/transport/tcp/segment_state.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- pkg/tcpip/transport/tcp/snd_state.go | 2 +- pkg/tcpip/transport/tcp/tcp_sack_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 2 +- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/tcp/timer.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 2 +- pkg/tcpip/transport/udp/endpoint_state.go | 2 +- pkg/tcpip/transport/udp/protocol.go | 2 +- pkg/tcpip/transport/udp/udp_test.go | 2 +- 
pkg/tmutex/tmutex.go | 2 +- pkg/tmutex/tmutex_test.go | 2 +- pkg/unet/unet.go | 2 +- pkg/unet/unet_test.go | 2 +- pkg/unet/unet_unsafe.go | 2 +- pkg/urpc/urpc.go | 2 +- pkg/urpc/urpc_test.go | 2 +- pkg/waiter/fdnotifier/fdnotifier.go | 2 +- pkg/waiter/fdnotifier/poll_unsafe.go | 2 +- pkg/waiter/waiter.go | 2 +- pkg/waiter/waiter_test.go | 2 +- runsc/boot/compat.go | 2 +- runsc/boot/config.go | 2 +- runsc/boot/controller.go | 2 +- runsc/boot/debug.go | 2 +- runsc/boot/events.go | 2 +- runsc/boot/fds.go | 2 +- runsc/boot/filter/config.go | 2 +- runsc/boot/filter/extra_filters.go | 2 +- runsc/boot/filter/extra_filters_msan.go | 2 +- runsc/boot/filter/extra_filters_race.go | 2 +- runsc/boot/filter/filter.go | 2 +- runsc/boot/fs.go | 2 +- runsc/boot/limits.go | 2 +- runsc/boot/loader.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/boot/network.go | 2 +- runsc/boot/strace.go | 2 +- runsc/cgroup/cgroup.go | 2 +- runsc/cgroup/cgroup_test.go | 2 +- runsc/cmd/boot.go | 2 +- runsc/cmd/capability.go | 2 +- runsc/cmd/capability_test.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/cmd.go | 2 +- runsc/cmd/create.go | 2 +- runsc/cmd/debug.go | 2 +- runsc/cmd/delete.go | 2 +- runsc/cmd/delete_test.go | 2 +- runsc/cmd/events.go | 2 +- runsc/cmd/exec.go | 2 +- runsc/cmd/exec_test.go | 2 +- runsc/cmd/gofer.go | 2 +- runsc/cmd/kill.go | 2 +- runsc/cmd/list.go | 2 +- runsc/cmd/path.go | 2 +- runsc/cmd/pause.go | 2 +- runsc/cmd/ps.go | 2 +- runsc/cmd/restore.go | 2 +- runsc/cmd/resume.go | 2 +- runsc/cmd/run.go | 2 +- runsc/cmd/spec.go | 2 +- runsc/cmd/start.go | 2 +- runsc/cmd/state.go | 2 +- runsc/cmd/wait.go | 2 +- runsc/console/console.go | 2 +- runsc/container/console_test.go | 2 +- runsc/container/container.go | 2 +- runsc/container/container_test.go | 2 +- runsc/container/fs.go | 2 +- runsc/container/fs_test.go | 2 +- runsc/container/hook.go | 2 +- runsc/container/multi_container_test.go | 2 +- runsc/container/status.go | 2 +- runsc/container/test_app.go | 2 +- runsc/fsgofer/filter/config.go | 2 +- runsc/fsgofer/filter/extra_filters.go | 2 +- runsc/fsgofer/filter/extra_filters_msan.go | 2 +- runsc/fsgofer/filter/extra_filters_race.go | 2 +- runsc/fsgofer/filter/filter.go | 2 +- runsc/fsgofer/fsgofer.go | 2 +- runsc/fsgofer/fsgofer_test.go | 2 +- runsc/fsgofer/fsgofer_unsafe.go | 2 +- runsc/main.go | 2 +- runsc/sandbox/chroot.go | 2 +- runsc/sandbox/network.go | 2 +- runsc/sandbox/sandbox.go | 2 +- runsc/specutils/namespace.go | 2 +- runsc/specutils/specutils.go | 2 +- runsc/specutils/specutils_test.go | 2 +- runsc/test/image/image.go | 2 +- runsc/test/image/image_test.go | 2 +- runsc/test/image/mysql.sql | 2 +- runsc/test/image/ruby.rb | 2 +- runsc/test/image/ruby.sh | 2 +- runsc/test/install.sh | 2 +- runsc/test/integration/exec_test.go | 2 +- runsc/test/integration/integration.go | 2 +- runsc/test/integration/integration_test.go | 2 +- runsc/test/root/cgroup_test.go | 2 +- runsc/test/root/chroot_test.go | 2 +- runsc/test/root/root.go | 2 +- runsc/test/testutil/docker.go | 2 +- runsc/test/testutil/testutil.go | 2 +- runsc/test/testutil/testutil_race.go | 2 +- runsc/tools/dockercfg/dockercfg.go | 2 +- tools/go_generics/generics.go | 2 +- tools/go_generics/generics_tests/all_stmts/input.go | 2 +- tools/go_generics/generics_tests/all_stmts/output/output.go | 2 +- tools/go_generics/generics_tests/all_types/input.go | 2 +- tools/go_generics/generics_tests/all_types/lib/lib.go | 2 +- tools/go_generics/generics_tests/all_types/output/output.go | 2 +- tools/go_generics/generics_tests/consts/input.go | 2 +- 
tools/go_generics/generics_tests/consts/output/output.go | 2 +- tools/go_generics/generics_tests/imports/input.go | 2 +- tools/go_generics/generics_tests/imports/output/output.go | 2 +- tools/go_generics/generics_tests/remove_typedef/input.go | 2 +- tools/go_generics/generics_tests/remove_typedef/output/output.go | 2 +- tools/go_generics/generics_tests/simple/input.go | 2 +- tools/go_generics/generics_tests/simple/output/output.go | 2 +- tools/go_generics/globals/globals_visitor.go | 2 +- tools/go_generics/globals/scope.go | 2 +- tools/go_generics/go_generics_unittest.sh | 2 +- tools/go_generics/imports.go | 2 +- tools/go_generics/merge.go | 2 +- tools/go_generics/remove.go | 2 +- tools/go_generics/rules_tests/template.go | 2 +- tools/go_generics/rules_tests/template_test.go | 2 +- tools/go_stateify/main.go | 2 +- tools/workspace_status.sh | 2 +- vdso/barrier.h | 2 +- vdso/check_vdso.py | 2 +- vdso/compiler.h | 2 +- vdso/cycle_clock.h | 2 +- vdso/seqlock.h | 2 +- vdso/syscalls.h | 2 +- vdso/vdso.cc | 2 +- vdso/vdso_time.cc | 2 +- vdso/vdso_time.h | 2 +- 923 files changed, 923 insertions(+), 923 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh index f2b719f52..89e24b037 100755 --- a/kokoro/run_build.sh +++ b/kokoro/run_build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 3f8841cee..0a0d73d29 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi.go b/pkg/abi/abi.go index a53c2747b..7770f0405 100644 --- a/pkg/abi/abi.go +++ b/pkg/abi/abi.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go index dd5d67b51..9d9f361a4 100644 --- a/pkg/abi/abi_linux.go +++ b/pkg/abi/abi_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/flag.go b/pkg/abi/flag.go index 0391ccf37..0698e410f 100644 --- a/pkg/abi/flag.go +++ b/pkg/abi/flag.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 9c39ca2ef..1b7ca714a 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ashmem.go b/pkg/abi/linux/ashmem.go index 7fbfd2e68..ced1e44d4 100644 --- a/pkg/abi/linux/ashmem.go +++ b/pkg/abi/linux/ashmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/binder.go b/pkg/abi/linux/binder.go index b228898f9..522dc6f53 100644 --- a/pkg/abi/linux/binder.go +++ b/pkg/abi/linux/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index 80e5b1af1..d9cd09948 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go index b470ce0a5..7d96f013e 100644 --- a/pkg/abi/linux/capability.go +++ b/pkg/abi/linux/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index ea5b16b7b..5b1199aac 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go index 76c13b677..928067c04 100644 --- a/pkg/abi/linux/elf.go +++ b/pkg/abi/linux/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/errors.go b/pkg/abi/linux/errors.go index b5ddb2b2f..01e4095b8 100644 --- a/pkg/abi/linux/errors.go +++ b/pkg/abi/linux/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/eventfd.go b/pkg/abi/linux/eventfd.go index bc0fb44d2..5614f5cf1 100644 --- a/pkg/abi/linux/eventfd.go +++ b/pkg/abi/linux/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/exec.go b/pkg/abi/linux/exec.go index 4d81eca54..a07c29243 100644 --- a/pkg/abi/linux/exec.go +++ b/pkg/abi/linux/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index 2a5ad6ed7..c8558933a 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 9bf229a57..72e5c6f83 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 32a0812b4..7817bfb52 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index f63f5200c..5dff01fba 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/inotify.go b/pkg/abi/linux/inotify.go index 072a2d146..79c5d3593 100644 --- a/pkg/abi/linux/inotify.go +++ b/pkg/abi/linux/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index afd9ee82b..9afc3d1ef 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go index 6b68999ab..fcec16965 100644 --- a/pkg/abi/linux/ip.go +++ b/pkg/abi/linux/ip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 81e9904dd..10681768b 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e1f0932ec..b2e51b9bd 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go index de2af80dc..d365f693d 100644 --- a/pkg/abi/linux/linux.go +++ b/pkg/abi/linux/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index b48e1d18a..3fcdf8235 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netdevice.go b/pkg/abi/linux/netdevice.go index 88654a1b3..e3b6b1e40 100644 --- a/pkg/abi/linux/netdevice.go +++ b/pkg/abi/linux/netdevice.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index e823ffa7e..10ceb5bf2 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index a5d778748..4200b6506 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/poll.go b/pkg/abi/linux/poll.go index f373cfca1..9f0b15d1c 100644 --- a/pkg/abi/linux/poll.go +++ b/pkg/abi/linux/poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go index 074ec03f0..e152c4c27 100644 --- a/pkg/abi/linux/prctl.go +++ b/pkg/abi/linux/prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ptrace.go b/pkg/abi/linux/ptrace.go index ba48d4d6d..7db4f5464 100644 --- a/pkg/abi/linux/ptrace.go +++ b/pkg/abi/linux/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/rusage.go b/pkg/abi/linux/rusage.go index a4a89abda..7fea4b589 100644 --- a/pkg/abi/linux/rusage.go +++ b/pkg/abi/linux/rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go index 05fda1604..ef96a3801 100644 --- a/pkg/abi/linux/sched.go +++ b/pkg/abi/linux/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index a8de9d3d0..9963ceeba 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index 3495f5cd0..d1a0bdb32 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go index f50b3c2e2..82a80e609 100644 --- a/pkg/abi/linux/shm.go +++ b/pkg/abi/linux/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index b2c7230c4..bf9bce6ed 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 19b5fa212..af0761a3b 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index 4569f4208..bbd21e726 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/timer.go b/pkg/abi/linux/timer.go index 6c4675c35..a6f420bdb 100644 --- a/pkg/abi/linux/timer.go +++ b/pkg/abi/linux/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index f63dc52aa..e6f7c5b2a 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/uio.go b/pkg/abi/linux/uio.go index 93c972774..7e00d9959 100644 --- a/pkg/abi/linux/uio.go +++ b/pkg/abi/linux/uio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/utsname.go b/pkg/abi/linux/utsname.go index 7d33d20de..f80ed7d4a 100644 --- a/pkg/abi/linux/utsname.go +++ b/pkg/abi/linux/utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 1cb73359a..26b674435 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 876e47b19..104e0dab1 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops.go b/pkg/atomicbitops/atomic_bitops.go index 6635ea0d2..9a57f9599 100644 --- a/pkg/atomicbitops/atomic_bitops.go +++ b/pkg/atomicbitops/atomic_bitops.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_amd64.s b/pkg/atomicbitops/atomic_bitops_amd64.s index 542452bec..b37e3aad3 100644 --- a/pkg/atomicbitops/atomic_bitops_amd64.s +++ b/pkg/atomicbitops/atomic_bitops_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_common.go b/pkg/atomicbitops/atomic_bitops_common.go index 542ff4e83..b03242baa 100644 --- a/pkg/atomicbitops/atomic_bitops_common.go +++ b/pkg/atomicbitops/atomic_bitops_common.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go index ec0c07ee2..ee6207cb3 100644 --- a/pkg/atomicbitops/atomic_bitops_test.go +++ b/pkg/atomicbitops/atomic_bitops_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary.go b/pkg/binary/binary.go index 3b18a86ee..02f7e9fb8 100644 --- a/pkg/binary/binary.go +++ b/pkg/binary/binary.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary_test.go b/pkg/binary/binary_test.go index 921a0369a..d8d481f32 100644 --- a/pkg/binary/binary_test.go +++ b/pkg/binary/binary_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/bits.go b/pkg/bits/bits.go index 50ca4bff7..eb3c80f49 100644 --- a/pkg/bits/bits.go +++ b/pkg/bits/bits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go index 0a01f29c2..8c578cca2 100644 --- a/pkg/bits/bits_template.go +++ b/pkg/bits/bits_template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64.go b/pkg/bits/uint64_arch_amd64.go index 068597f68..1fef89394 100644 --- a/pkg/bits/uint64_arch_amd64.go +++ b/pkg/bits/uint64_arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64_asm.s b/pkg/bits/uint64_arch_amd64_asm.s index 33885641a..8c7322f0f 100644 --- a/pkg/bits/uint64_arch_amd64_asm.s +++ b/pkg/bits/uint64_arch_amd64_asm.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_generic.go b/pkg/bits/uint64_arch_generic.go index 862033a4b..cfb47400b 100644 --- a/pkg/bits/uint64_arch_generic.go +++ b/pkg/bits/uint64_arch_generic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go index 906017e1a..d6dbaf602 100644 --- a/pkg/bits/uint64_test.go +++ b/pkg/bits/uint64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/bpf.go b/pkg/bpf/bpf.go index 757744090..98d44d911 100644 --- a/pkg/bpf/bpf.go +++ b/pkg/bpf/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder.go b/pkg/bpf/decoder.go index ef41e9edc..ae6b8839a 100644 --- a/pkg/bpf/decoder.go +++ b/pkg/bpf/decoder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder_test.go b/pkg/bpf/decoder_test.go index 18709b944..f093e1e41 100644 --- a/pkg/bpf/decoder_test.go +++ b/pkg/bpf/decoder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/input_bytes.go b/pkg/bpf/input_bytes.go index 74af038eb..745c0749b 100644 --- a/pkg/bpf/input_bytes.go +++ b/pkg/bpf/input_bytes.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/bpf/interpreter.go b/pkg/bpf/interpreter.go index 111ada9d1..86c7add4d 100644 --- a/pkg/bpf/interpreter.go +++ b/pkg/bpf/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/interpreter_test.go b/pkg/bpf/interpreter_test.go index 9e5e33228..c46a43991 100644 --- a/pkg/bpf/interpreter_test.go +++ b/pkg/bpf/interpreter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder.go b/pkg/bpf/program_builder.go index bad56d7ac..b4ce228e1 100644 --- a/pkg/bpf/program_builder.go +++ b/pkg/bpf/program_builder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder_test.go b/pkg/bpf/program_builder_test.go index 7e4f06584..0e0b79d88 100644 --- a/pkg/bpf/program_builder_test.go +++ b/pkg/bpf/program_builder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index 667f17c5c..205536812 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/compressio/compressio_test.go b/pkg/compressio/compressio_test.go index 7cb5f8dc4..1bbabee79 100644 --- a/pkg/compressio/compressio_test.go +++ b/pkg/compressio/compressio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/control/client/client.go b/pkg/control/client/client.go index f7c2e8776..0d0c9f148 100644 --- a/pkg/control/client/client.go +++ b/pkg/control/client/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go index d00061ce3..c46b5d70b 100644 --- a/pkg/control/server/server.go +++ b/pkg/control/server/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpu_amd64.s b/pkg/cpuid/cpu_amd64.s index 48a13c6fd..905c1d12e 100644 --- a/pkg/cpuid/cpu_amd64.s +++ b/pkg/cpuid/cpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index e91e34dc7..5b083a5fb 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid_parse_test.go b/pkg/cpuid/cpuid_parse_test.go index c4f52818c..81b06f48c 100644 --- a/pkg/cpuid/cpuid_parse_test.go +++ b/pkg/cpuid/cpuid_parse_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go index 02f732f85..0decd8f08 100644 --- a/pkg/cpuid/cpuid_test.go +++ b/pkg/cpuid/cpuid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 92c634a14..3330c4998 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp.go b/pkg/dhcp/dhcp.go index ceaba34c3..ad11e178a 100644 --- a/pkg/dhcp/dhcp.go +++ b/pkg/dhcp/dhcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp_string.go b/pkg/dhcp/dhcp_string.go index 7cabed29e..8533895bd 100644 --- a/pkg/dhcp/dhcp_string.go +++ b/pkg/dhcp/dhcp_string.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index d60e3752b..a21dce6bc 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index 26700bdbc..3e06ab4c7 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index bfd28256e..41a7b5ed3 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/eventchannel/event.proto b/pkg/eventchannel/event.proto index 455f03658..c1679c7e7 100644 --- a/pkg/eventchannel/event.proto +++ b/pkg/eventchannel/event.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index 32d24c41b..f6656ffa1 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fd/fd_test.go b/pkg/fd/fd_test.go index 94b3eb7cc..42bb3ef6c 100644 --- a/pkg/fd/fd_test.go +++ b/pkg/fd/fd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate.go b/pkg/gate/gate.go index 93808c9dd..48122bf5a 100644 --- a/pkg/gate/gate.go +++ b/pkg/gate/gate.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go index 06587339b..95620fa8e 100644 --- a/pkg/gate/gate_test.go +++ b/pkg/gate/gate_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 4ae02eee9..51c9b6df3 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/ilist/list_test.go b/pkg/ilist/list_test.go index 2c56280f6..4bda570b6 100644 --- a/pkg/ilist/list_test.go +++ b/pkg/ilist/list_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go index 98f974410..5fbd4e779 100644 --- a/pkg/linewriter/linewriter.go +++ b/pkg/linewriter/linewriter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter_test.go b/pkg/linewriter/linewriter_test.go index ce97cca05..9140ee6af 100644 --- a/pkg/linewriter/linewriter_test.go +++ b/pkg/linewriter/linewriter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/glog.go b/pkg/log/glog.go index 58b4052e6..fbb58501b 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/log/glog_unsafe.go b/pkg/log/glog_unsafe.go index c320190b8..bb06aa7d3 100644 --- a/pkg/log/glog_unsafe.go +++ b/pkg/log/glog_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json.go b/pkg/log/json.go index 3887f1cd5..96bd13d87 100644 --- a/pkg/log/json.go +++ b/pkg/log/json.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json_test.go b/pkg/log/json_test.go index 3b167dab0..b8c7a795e 100644 --- a/pkg/log/json_test.go +++ b/pkg/log/json_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/log.go b/pkg/log/log.go index c496e86e4..b8d456aae 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go index d93e989dc..a59d457dd 100644 --- a/pkg/log/log_test.go +++ b/pkg/log/log_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 763cd6bc2..02af75974 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto index 6108cb7c0..917fda1ac 100644 --- a/pkg/metric/metric.proto +++ b/pkg/metric/metric.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go index 7d156e4a5..40034a589 100644 --- a/pkg/metric/metric_test.go +++ b/pkg/metric/metric_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/buffer.go b/pkg/p9/buffer.go index fc65d2c5f..9575ddf12 100644 --- a/pkg/p9/buffer.go +++ b/pkg/p9/buffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 5fa231bc5..3ebfab82a 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index a46efd27f..066639fda 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_test.go b/pkg/p9/client_test.go index 06302a76a..f7145452d 100644 --- a/pkg/p9/client_test.go +++ b/pkg/p9/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 9723fa24d..d2e89e373 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index ea41f97c7..959dff31d 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index cef3701a7..1e6aaa762 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index b3d76801b..972c37344 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index f353755f1..dfb41bb76 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index c6899c3ce..3b0993ecd 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9_test.go b/pkg/p9/p9_test.go index a50ac80a4..02498346c 100644 --- a/pkg/p9/p9_test.go +++ b/pkg/p9/p9_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 34ddccd8b..db562b9ba 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9test/mocks.go b/pkg/p9/p9test/mocks.go index 9d039ac63..9a8c14975 100644 --- a/pkg/p9/p9test/mocks.go +++ b/pkg/p9/p9test/mocks.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go index 9a508b898..34ed898e8 100644 --- a/pkg/p9/pool.go +++ b/pkg/p9/pool.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/pool_test.go b/pkg/p9/pool_test.go index 96be2c8bd..71052d8c4 100644 --- a/pkg/p9/pool_test.go +++ b/pkg/p9/pool_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 28a273ac6..5c7cb18c8 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index b5df29961..97396806c 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go index d6d4b6365..3352a5205 100644 --- a/pkg/p9/transport_test.go +++ b/pkg/p9/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/version.go b/pkg/p9/version.go index 8783eaa7e..ceb6fabbf 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/version_test.go b/pkg/p9/version_test.go index 634ac3ca5..c053614c9 100644 --- a/pkg/p9/version_test.go +++ b/pkg/p9/version_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/rand/rand.go b/pkg/rand/rand.go index e81f0f5db..593a14380 100644 --- a/pkg/rand/rand.go +++ b/pkg/rand/rand.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go index a2be66b3b..7ebe8f3b0 100644 --- a/pkg/rand/rand_linux.go +++ b/pkg/rand/rand_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 638a93bab..8f08c74c7 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter_state.go b/pkg/refs/refcounter_state.go index 093eae785..136f06fbf 100644 --- a/pkg/refs/refcounter_state.go +++ b/pkg/refs/refcounter_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index cc11bcd71..abaa87453 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index a746dc9b3..1dfbf749e 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index 6b707f195..a9278c64b 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 0188ad4f3..226f30b7b 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index 4f2ae4dac..007038273 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index ae18534bf..dd009221a 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/full_reader.go b/pkg/secio/full_reader.go index b2dbb8615..90b1772a7 100644 --- a/pkg/secio/full_reader.go +++ b/pkg/secio/full_reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio.go b/pkg/secio/secio.go index fc625efb8..e5f74a497 100644 --- a/pkg/secio/secio.go +++ b/pkg/secio/secio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio_test.go b/pkg/secio/secio_test.go index 64b4cc17d..8304c4f74 100644 --- a/pkg/secio/secio_test.go +++ b/pkg/secio/secio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/range.go b/pkg/segment/range.go index 34c067265..057bcd7ff 100644 --- a/pkg/segment/range.go +++ b/pkg/segment/range.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/set.go b/pkg/segment/set.go index cffec2a2c..a9a3b8875 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/set_state.go b/pkg/segment/set_state.go index a763d1915..b86e1b75f 100644 --- a/pkg/segment/set_state.go +++ b/pkg/segment/set_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go index 7ea24b177..0825105db 100644 --- a/pkg/segment/test/segment_test.go +++ b/pkg/segment/test/segment_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index 37c196ea1..05ba5fbb9 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/aligned.go b/pkg/sentry/arch/aligned.go index 193232e27..c88c034f6 100644 --- a/pkg/sentry/arch/aligned.go +++ b/pkg/sentry/arch/aligned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 21cb84502..575b7ba66 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 5ba6c19ea..bb80a7bed 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s index 10d621b6d..fa9857df7 100644 --- a/pkg/sentry/arch/arch_amd64.s +++ b/pkg/sentry/arch/arch_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index e9c23a06b..604bd08a6 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index b35eec53c..59bf89d99 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 81cfb4a01..5df65a691 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto index 437ff44ca..f4c2f7043 100644 --- a/pkg/sentry/arch/registers.proto +++ b/pkg/sentry/arch/registers.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go index 36437b965..ad098c746 100644 --- a/pkg/sentry/arch/signal_act.go +++ b/pkg/sentry/arch/signal_act.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 9ca4c8ed1..f7f054b0b 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_info.go b/pkg/sentry/arch/signal_info.go index ec004ae75..fa0ecbec5 100644 --- a/pkg/sentry/arch/signal_info.go +++ b/pkg/sentry/arch/signal_info.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index ba43dd1d4..c02ae3b7c 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 6c1b9be82..716a3574d 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 41d8ba0d1..47c31d4b9 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index 598c5b4ff..12bdcef85 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index b3c6a566b..d2f084ed7 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/control.go b/pkg/sentry/control/control.go index a6ee6e649..32d30b6ea 100644 --- a/pkg/sentry/control/control.go +++ b/pkg/sentry/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 0ba730c1e..b6ac2f312 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index 22c826236..5d52cd829 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index cee4db636..0a480c84a 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index 21fee8f8a..27e4eb258 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/device/device_test.go b/pkg/sentry/device/device_test.go index dfec45046..5d8805c2f 100644 --- a/pkg/sentry/device/device_test.go +++ b/pkg/sentry/device/device_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index ddc2c0985..743cf511f 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/device.go b/pkg/sentry/fs/anon/device.go index 1c666729c..2d1249299 100644 --- a/pkg/sentry/fs/anon/device.go +++ b/pkg/sentry/fs/anon/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index bfd7f2762..5372875ac 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index d0986fa11..962da141b 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go index ecba395a0..7c997f533 100644 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go index f4ea5de6d..736e628dc 100644 --- a/pkg/sentry/fs/ashmem/pin_board_test.go +++ b/pkg/sentry/fs/ashmem/pin_board_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 091f4ac63..59e060e3c 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 502a262dd..42b9e8b26 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index da46ad77f..1775d3486 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 8c949b176..d65dc74bf 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index c3c9d963d..64f030f72 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index b347468ff..ef6d1a870 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 3f4f2a40a..05a5005ad 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/device.go b/pkg/sentry/fs/dev/device.go index 9d935e008..3cecdf6e2 100644 --- a/pkg/sentry/fs/dev/device.go +++ b/pkg/sentry/fs/dev/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 2ae49be4e..d96f4f423 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 492b8eb3a..eeda646ab 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 2977c8670..68090f353 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 47b76218f..33e4913e4 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 27fea0019..2c01485a8 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index c680e4828..502b0a09b 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go index 82b7f6bd5..5d0e9d91c 100644 --- a/pkg/sentry/fs/dirent_cache_test.go +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index f9dcba316..325404e27 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index 04ab197b9..5cf151dab 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 2e34604e6..bfafff5ec 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index 945cfaf08..92ab6ff0e 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 83f6c1986..69516e048 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index 99c40d8ed..4395666ad 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 6cd314f5b..d3f15be6b 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 36794d378..d6752ed1b 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index d223bb5c7..28e8e233d 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 41e646ee8..9b958b64b 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 830458ff9..11e4f7203 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go index f848d1b79..1c3bae3e8 100644 --- a/pkg/sentry/fs/file_state.go +++ b/pkg/sentry/fs/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_test.go b/pkg/sentry/fs/file_test.go index 18aee7101..f3ed9a70b 100644 --- a/pkg/sentry/fs/file_test.go +++ b/pkg/sentry/fs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 5a1e7a270..ba8be85e4 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 1831aa82f..65ca196d9 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index 1aa271560..bf2a20b33 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 6ec9ff446..b5c72990e 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 8e31e48fd..5add16ac4 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go index f7693cb19..f5c9d9215 100644 --- a/pkg/sentry/fs/fsutil/dirty_set_test.go +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index d5881613b..46db2e51c 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index da6949ccb..dd7ab4b4a 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index 14dece315..b6e783614 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go index 6fe4ef13d..3d7f3732d 100644 --- a/pkg/sentry/fs/fsutil/fsutil.go +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go index e7efd3c0f..8920b72ee 100644 --- a/pkg/sentry/fs/fsutil/handle.go +++ b/pkg/sentry/fs/fsutil/handle.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go index d94c3eb0d..43e1a3bdf 100644 --- a/pkg/sentry/fs/fsutil/handle_test.go +++ b/pkg/sentry/fs/fsutil/handle_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 9c1e2f76f..9599665f0 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go index 57705decd..bbd15b30b 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_state.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go index 790f3a5a6..86df76822 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 3acc32752..d4db1c2de 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 6777c8bf7..b0af44ddd 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 996c91849..e388ec3d7 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 5e24767f9..98700d014 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 98f43c578..3d380f0e8 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index d4b6f6eb7..a0265c2aa 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/device.go b/pkg/sentry/fs/gofer/device.go index fac7306d4..52c5acf48 100644 --- a/pkg/sentry/fs/gofer/device.go +++ b/pkg/sentry/fs/gofer/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index c4a210656..6d961813d 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index 715af8f16..dd4f817bf 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 3ae93f059..ed30cb1f1 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index c8d7bd773..3190d1e18 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index a3e52aad6..f32e99ce0 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 7fc8f77b0..5811b8b12 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index ad11034f9..ad4d3df58 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 0bf7881da..a324dc990 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 4e2293398..7552216f3 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 8e6424492..f657135fc 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index d072da624..76ce58810 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go index d9ed8c81e..1a759370d 100644 --- a/pkg/sentry/fs/gofer/util.go +++ b/pkg/sentry/fs/gofer/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index d2e34a69d..0753640a2 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 148291ba6..7c9d2b299 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go index 7fb274451..530c0109f 100644 --- a/pkg/sentry/fs/host/descriptor_state.go +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go index f393a8b54..6bc1bd2ae 100644 --- a/pkg/sentry/fs/host/descriptor_test.go +++ b/pkg/sentry/fs/host/descriptor_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go index f2a0b6b15..b5adedf44 100644 --- a/pkg/sentry/fs/host/device.go +++ b/pkg/sentry/fs/host/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 22a5d9f12..975084c86 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index e46ae433c..fec890964 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index b08125ca8..e69559aac 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index e32497203..08754bd6b 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index 8bc99d94b..b7c1a9581 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 0ff87c418..9f1561bd5 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index bc965a1c2..175dca613 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 0eb267c00..af53bf533 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index 1a9587b90..d4ce4a8c1 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go index 7fa500bfb..2932c1f16 100644 --- a/pkg/sentry/fs/host/socket_state.go +++ b/pkg/sentry/fs/host/socket_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 483e99dd6..e9a88b124 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index 5e4c5feed..f35e2492d 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index ad1323610..cf3639c46 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index 74c703eb7..40c450660 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index 2ecb54319..d00da89d6 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index c5f5c9c0d..9ca8c399f 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 409c81a97..95769ccf8 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index 683140afe..e213df924 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 3ee3de10e..77973ce79 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index cf698a4da..78923fb5b 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 23e5635a4..bba20da14 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 2aabdded8..f251df0d1 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index e9b5e0f56..9e3e9d816 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 3e1959e83..b83544c9f 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 439e645db..5ff800d2d 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_range_test.go b/pkg/sentry/fs/lock/lock_range_test.go index 06a37c701..b0ab882b9 100644 --- a/pkg/sentry/fs/lock/lock_range_test.go +++ b/pkg/sentry/fs/lock/lock_range_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index e16f485be..395592a4b 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index c60f5f7a2..67fa4b1dd 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 846b6e8bb..6bfcda6bb 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 8345876fc..24e28ddb2 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index dbc608c7e..fb91635bc 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_state.go b/pkg/sentry/fs/mount_state.go index f5ed1dd8d..6344d5160 100644 --- a/pkg/sentry/fs/mount_state.go +++ b/pkg/sentry/fs/mount_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 968b435ab..a1c9f4f79 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index c0a803b2d..7c5348cce 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index 8669f3a38..cc7c32c9b 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go index 7cc8398e6..38aee765a 100644 --- a/pkg/sentry/fs/offset.go +++ b/pkg/sentry/fs/offset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 5a30af419..036c0f733 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go index b74f6ed8c..91a9a8ffd 100644 --- a/pkg/sentry/fs/path.go +++ b/pkg/sentry/fs/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path_test.go b/pkg/sentry/fs/path_test.go index 7ab070855..391b010a7 100644 --- a/pkg/sentry/fs/path_test.go +++ b/pkg/sentry/fs/path_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index 4dfec03a4..f8be06dc3 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/device/device.go b/pkg/sentry/fs/proc/device/device.go index 6194afe88..04b687bcf 100644 --- a/pkg/sentry/fs/proc/device/device.go +++ b/pkg/sentry/fs/proc/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index a69cbaa0e..b4896053f 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index dada8f982..5ebb33703 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go index 4b3448245..f659e590a 100644 --- a/pkg/sentry/fs/proc/file.go +++ b/pkg/sentry/fs/proc/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index 49b92fd8a..c050a00be 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 061824b8c..63f737ff4 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 6fac251d2..78f3a1dc0 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 53dfd59ef..b31258eed 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 81dcc153a..0b0e87528 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 8cd6fe9d3..45f2a1211 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net_test.go b/pkg/sentry/fs/proc/net_test.go index a31a20494..94677cc1d 100644 --- a/pkg/sentry/fs/proc/net_test.go +++ b/pkg/sentry/fs/proc/net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 07029a7bb..33030bebf 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index 50d0271f9..d025069df 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 51cae5e37..0499ba65b 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index d90e3e736..f9a2ca38e 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index bf7650211..f2bbef375 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 384b4ffe1..54562508d 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index beb25be20..801eb6a1e 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 7ba392346..0ce9d30f1 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 748ca4320..404faea0a 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index a7e4cf0a6..f70399686 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index f3a9b81df..80c7ce0b4 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index 00f6a2afd..b6d49d5e9 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 075e13b01..0a911b155 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/file.go b/pkg/sentry/fs/ramfs/file.go index 0b94d92a1..b7fc98ffc 100644 --- a/pkg/sentry/fs/ramfs/file.go +++ b/pkg/sentry/fs/ramfs/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 83cbcab23..d77688a34 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 9ac00eb18..8c81478c8 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 1c54d9991..a21fac2c7 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/test/test.go b/pkg/sentry/fs/ramfs/test/test.go index fb669558f..11bff7729 100644 --- a/pkg/sentry/fs/ramfs/test/test.go +++ b/pkg/sentry/fs/ramfs/test/test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index 1fb335f74..29a70f698 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 68e2929d5..d5567d9e1 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index b4ac85a27..da2df7e1d 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/save.go b/pkg/sentry/fs/save.go index bf2a85143..90988d385 100644 --- a/pkg/sentry/fs/save.go +++ b/pkg/sentry/fs/save.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/seek.go b/pkg/sentry/fs/seek.go index 1268726c2..72f3fb632 100644 --- a/pkg/sentry/fs/seek.go +++ b/pkg/sentry/fs/seek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sync.go b/pkg/sentry/fs/sync.go index 9738a8f22..6dcc2fe8d 100644 --- a/pkg/sentry/fs/sync.go +++ b/pkg/sentry/fs/sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/device.go b/pkg/sentry/fs/sys/device.go index 54e414d1b..38ecd0c18 100644 --- a/pkg/sentry/fs/sys/device.go +++ b/pkg/sentry/fs/sys/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index 2cf3a6f98..e64aa0edc 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 625525540..5ce33f87f 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index 7b9697668..7cc1942c7 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 767db95a0..7423e816c 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/device.go b/pkg/sentry/fs/tmpfs/device.go index e588b3440..aade93c26 100644 --- a/pkg/sentry/fs/tmpfs/device.go +++ b/pkg/sentry/fs/tmpfs/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 342688f81..1f9d69909 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index f064eb1ac..b5830d3df 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index ca620e65e..7c91e248b 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 1e4fe47d2..42a7d7b9c 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 38be6db46..91b782540 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 7c0c0b0c1..e32b05c1d 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index d9f8f02f3..0c412eb21 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go index c0fa2b407..d5d1caafc 100644 --- a/pkg/sentry/fs/tty/inode.go +++ b/pkg/sentry/fs/tty/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 31804571e..484366f85 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index ae7540eff..dad0cad79 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 01dc8d1ac..a09ca0119 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 4a0d4fdb9..9de3168bf 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 3cb135124..79f9d76d7 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index 32e1b1556..ad535838f 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/getcpu_amd64.s b/pkg/sentry/hostcpu/getcpu_amd64.s index 7f6247d81..409db1450 100644 --- a/pkg/sentry/hostcpu/getcpu_amd64.s +++ b/pkg/sentry/hostcpu/getcpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/hostcpu/hostcpu.go b/pkg/sentry/hostcpu/hostcpu.go index fa46499ad..3adc847bb 100644 --- a/pkg/sentry/hostcpu/hostcpu.go +++ b/pkg/sentry/hostcpu/hostcpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/hostcpu_test.go b/pkg/sentry/hostcpu/hostcpu_test.go index a82e1a271..38de0e1f6 100644 --- a/pkg/sentry/hostcpu/hostcpu_test.go +++ b/pkg/sentry/hostcpu/hostcpu_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go index 370381f41..d05e96f15 100644 --- a/pkg/sentry/inet/context.go +++ b/pkg/sentry/inet/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 30ca4e0c0..8206377cc 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index bc10926ee..05c1a1792 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 45088c988..1ea2cee36 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go index c49a6b852..19f15fd36 100644 --- a/pkg/sentry/kernel/auth/auth.go +++ b/pkg/sentry/kernel/auth/auth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go index 5b8164c49..88d6243aa 100644 --- a/pkg/sentry/kernel/auth/capability_set.go +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index 914589b28..f7e945599 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index f18f7dac9..de33f1953 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go index 37522b018..e5bed44d7 100644 --- a/pkg/sentry/kernel/auth/id.go +++ b/pkg/sentry/kernel/auth/id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index bd0090e0f..43f439825 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go index 889291d96..8f1a189ec 100644 --- a/pkg/sentry/kernel/auth/id_map_functions.go +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index d359f3f31..5bb9c44c0 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 261ca6f7a..b629521eb 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index a8eb114c0..9c13ecfcc 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index dabb32f49..7f3e2004a 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index bc869fc13..d89c1b745 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index a4ada0e78..26dc59a85 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go index 71326b62f..14e8996d9 100644 --- a/pkg/sentry/kernel/eventfd/eventfd_test.go +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index f77339cae..aa4aac109 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index cad0b0a20..715f4714d 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go index 95123aef3..b49996137 100644 --- a/pkg/sentry/kernel/fd_map_test.go +++ b/pkg/sentry/kernel/fd_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index f3f05e8f5..3cf0db280 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 54b1982a0..ea69d433b 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 726c26990..ea506a29b 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 5eef49f59..9ceb9bd92 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go index bbb476544..8eafe810b 100644 --- a/pkg/sentry/kernel/kdefs/kdefs.go +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5d6856f3c..bad558d48 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go index bb2d5102d..a0a69b498 100644 --- a/pkg/sentry/kernel/kernel_state.go +++ b/pkg/sentry/kernel/kernel_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index f7a183a1d..f05ef1b64 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto index abc565054..43b8deb76 100644 --- a/pkg/sentry/kernel/memevent/memory_events.proto +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index bb5db0309..373e11772 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go index 6d90ed033..72be6702f 100644 --- a/pkg/sentry/kernel/pending_signals_state.go +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index a82e45c3f..fa8045910 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go index 8d383577a..eec5c5de8 100644 --- a/pkg/sentry/kernel/pipe/device.go +++ b/pkg/sentry/kernel/pipe/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 23d692da1..4b0e00b85 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index cc1ebf4f6..eda551594 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index ced2559a7..126054826 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index 49ef8c8ac..3b9895927 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 1fa5e9a32..f27379969 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 82607367b..63efc5bbe 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index d93324b53..6fea9769c 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index 0ab958529..40b5acca3 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 9fe28f435..20bac2b70 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 1f3de58e3..46b03c700 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go index 0a97603f0..69aee9127 100644 --- a/pkg/sentry/kernel/sched/cpuset.go +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go index 8a6e12958..a036ed513 100644 --- a/pkg/sentry/kernel/sched/cpuset_test.go +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go index f1de1da60..e59909baf 100644 --- a/pkg/sentry/kernel/sched/sched.go +++ b/pkg/sentry/kernel/sched/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index d77c05e2f..37dd3e4c9 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index aa07946cf..232a276dc 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index f9eb382e9..5f886bf31 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index a9b4e7647..78a5b4063 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go index b0dacdbe0..bbc653ed8 100644 --- a/pkg/sentry/kernel/shm/device.go +++ b/pkg/sentry/kernel/shm/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 77973951e..8d0d14e45 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index e3a2a777a..b066df132 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 3649f5e4d..3f1ac9898 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 4c7811b6c..19b711e9c 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 826809a70..981455d46 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 6531bd5d2..2aecf3eea 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go index 71ca75555..3b29d3c6a 100644 --- a/pkg/sentry/kernel/table_test.go +++ b/pkg/sentry/kernel/table_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 4f0b7fe3f..e22ec768d 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index d2052921e..24230af89 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 6dc7b938e..e5027e551 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index de3aef40d..755fe0370 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index d2df7e9d1..45b8d2b04 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 1b760aba4..a9b74da8e 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 65969ca9b..44fbb487c 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 62ebbcb0d..5a11ca3df 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index b0921b2eb..8f90ed786 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 1769da210..f4c881c2d 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index 4df2e53d3..fc7cefc1f 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 49ac933b7..596b9aa16 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 19dcc963a..3b3cdc24a 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index e2925a708..fe24f7542 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 6c8d7d316..c82a32c78 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index feaf6cae4..36846484c 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index f0373c375..0318adb35 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go index 82ef858a1..3f37f505d 100644 --- a/pkg/sentry/kernel/task_test.go +++ b/pkg/sentry/kernel/task_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 2b4954869..c8e973bd5 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index dfff7b52d..d7652f57c 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 4e3d19e97..bdb907905 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go index ac4dc01d8..3675ea20d 100644 --- a/pkg/sentry/kernel/time/context.go +++ b/pkg/sentry/kernel/time/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 52e0dfba1..ca0f4ba2e 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 2167f3efe..6bff80f13 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go index 2e7fed4d8..f3a3ed543 100644 --- a/pkg/sentry/kernel/timekeeper_state.go +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 34a5cec27..71674c21c 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index 7e0fe0d21..ed5f0c031 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 971e8bc59..0ec858a4a 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go index a9e84673f..72bb0f93c 100644 --- a/pkg/sentry/kernel/version.go +++ b/pkg/sentry/kernel/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go index 75e97bf92..bf413eb7d 100644 --- a/pkg/sentry/limits/context.go +++ b/pkg/sentry/limits/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index 02c8b60e3..ba0b7d4fd 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/limits/limits_test.go b/pkg/sentry/limits/limits_test.go index dd6f80750..d41f62554 100644 --- a/pkg/sentry/limits/limits_test.go +++ b/pkg/sentry/limits/limits_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 8e6a24341..511db6733 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 849be5a3d..9b1e81dc9 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 54534952b..06a3c7156 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 62b39e52b..d1417c4f1 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index a06e27ac9..437cc5da1 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go index dc71e1c2d..b327f0e1e 100644 --- a/pkg/sentry/loader/vdso_state.go +++ b/pkg/sentry/loader/vdso_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index c9483905d..33cf16f91 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go index 10668d404..49ee34548 100644 --- a/pkg/sentry/memmap/mapping_set_test.go +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index cdc5f2b27..05349a77f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memutil/memutil.go b/pkg/sentry/memutil/memutil.go index 4f245cf3c..286d50ca4 100644 --- a/pkg/sentry/memutil/memutil.go +++ b/pkg/sentry/memutil/memutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go index 32c27eb2f..8d9fc64fb 100644 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ b/pkg/sentry/memutil/memutil_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 27554f163..7488f7c4a 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index b42156d45..87942af0e 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go index 1a5e56f8e..192a6f744 100644 --- a/pkg/sentry/mm/aio_context_state.go +++ b/pkg/sentry/mm/aio_context_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index 56d0490f0..d341b9c07 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 6741db594..6600ddd78 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a4b5cb443..b248b76e7 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 32d5e2ff6..5ef1ba0b1 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 3299ae164..aab697f9e 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index b47aa7263..f2db43196 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 9febb25ac..5690fe6b4 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go index 5840b257c..0bf1cdb51 100644 --- a/pkg/sentry/mm/proc_pid_maps.go +++ b/pkg/sentry/mm/proc_pid_maps.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 36fed8f1c..6e7080a84 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index bab137a5a..3bc48c7e7 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 5d7bd33bd..e511472f4 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index b0622b0c3..a721cc456 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index b81e861f1..dafdbd0e4 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go index 0d200a5e2..cca21a23e 100644 --- a/pkg/sentry/platform/context.go +++ b/pkg/sentry/platform/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index f278c8d63..97da31e70 100644 --- a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem_state.go b/pkg/sentry/platform/filemem/filemem_state.go index e28e021c9..964e2aaaa 100644 --- a/pkg/sentry/platform/filemem/filemem_state.go +++ b/pkg/sentry/platform/filemem/filemem_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem_test.go b/pkg/sentry/platform/filemem/filemem_test.go index 4b165dc48..9becec25f 100644 --- a/pkg/sentry/platform/filemem/filemem_test.go +++ b/pkg/sentry/platform/filemem/filemem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem_unsafe.go b/pkg/sentry/platform/filemem/filemem_unsafe.go index a23b9825a..776aed74d 100644 --- a/pkg/sentry/platform/filemem/filemem_unsafe.go +++ b/pkg/sentry/platform/filemem/filemem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index ca4f42087..9c83f41eb 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt_test.go b/pkg/sentry/platform/interrupt/interrupt_test.go index 7c49eeea6..fb3284395 100644 --- a/pkg/sentry/platform/interrupt/interrupt_test.go +++ b/pkg/sentry/platform/interrupt/interrupt_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index c4293c517..72e897a9a 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go index f5cebd5b3..b25cad155 100644 --- a/pkg/sentry/platform/kvm/allocator.go +++ b/pkg/sentry/platform/kvm/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index ecc33d7dd..9f1c9510b 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index b364e3ef7..f013d1dc9 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 0881bd5f5..ec017f6c2 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 61ca61dcb..cd00a47f2 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 8650cd78f..e79a30ef2 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 216d4b4b6..747a95997 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index aac84febf..be902be88 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/host_map.go b/pkg/sentry/platform/kvm/host_map.go index fc16ad2de..ee6a1a42d 100644 --- a/pkg/sentry/platform/kvm/host_map.go +++ b/pkg/sentry/platform/kvm/host_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 0c4dff308..d4f50024d 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index 3d56ed895..70d0ac63b 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index 476e783a0..c0a0af92d 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index ca44c31b3..8c53c6f06 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 52448839f..45eeb96ff 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 9f60b6b31..fc7ad258f 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
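The change carried by these hunks is purely mechanical: every license header that read "Copyright 2018 Google Inc." becomes "Copyright 2018 Google LLC", with no other edits to the files. As an illustration only (this is not the tool used to generate the patch; the starting directory, file extensions, and program name are assumptions), a tree-wide header rewrite of this shape could be scripted in a few lines of Go:

// relicense.go: hedged sketch only, not the actual tooling behind this patch.
// It walks an assumed source tree and rewrites the first matching copyright
// line from "Google Inc." to "Google LLC", leaving everything else untouched.
package main

import (
	"bytes"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// hasAnySuffix reports whether path ends in one of the given suffixes.
func hasAnySuffix(path string, suffixes ...string) bool {
	for _, s := range suffixes {
		if strings.HasSuffix(path, s) {
			return true
		}
	}
	return false
}

func main() {
	root := "pkg/sentry" // assumed starting directory; adjust as needed
	oldHdr := []byte("// Copyright 2018 Google Inc.")
	newHdr := []byte("// Copyright 2018 Google LLC")

	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Only touch the kinds of files that appear in these hunks:
		// Go sources, assembly, and protobuf definitions.
		if info.IsDir() || !hasAnySuffix(path, ".go", ".s", ".proto") {
			return nil
		}
		data, err := os.ReadFile(path)
		if err != nil {
			return err
		}
		if !bytes.Contains(data, oldHdr) {
			return nil
		}
		// Rewrite the header in place, preserving the original file mode.
		updated := bytes.Replace(data, oldHdr, newHdr, 1)
		if err := os.WriteFile(path, updated, info.Mode()); err != nil {
			return err
		}
		fmt.Println("updated", path)
		return nil
	})
	if err != nil {
		fmt.Fprintln(os.Stderr, "walk failed:", err)
		os.Exit(1)
	}
}

Running such a program from the repository root and then invoking git diff would yield one-line hunks of exactly the form shown above and below.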
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index bcd29a947..e0aec42b8 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 8b9041f13..50e513f3b 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 86323c891..4f5b01321 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 81a98656d..b908cae6a 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 8a614e25d..0d496561d 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 39286a0af..fcba33813 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s index 3b5ad8817..f1da41a44 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index 0d3fbe043..0343e9267 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go index 7875bd3e9..935e0eb93 100644 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go index 6398e5e01..1bcc1f8e9 100644 --- a/pkg/sentry/platform/mmap_min_addr.go +++ b/pkg/sentry/platform/mmap_min_addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 8a1620d93..f16588e6e 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/sentry/platform/procid/procid.go index 5f861908f..3f49ab093 100644 --- a/pkg/sentry/platform/procid/procid.go +++ b/pkg/sentry/platform/procid/procid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s index 5b1ba1f24..fd88ce82e 100644 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/sentry/platform/procid/procid_net_test.go index 2d1605a08..e8dcc479d 100644 --- a/pkg/sentry/platform/procid/procid_net_test.go +++ b/pkg/sentry/platform/procid/procid_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/sentry/platform/procid/procid_test.go index 5e44da36f..7a57c7cdc 100644 --- a/pkg/sentry/platform/procid/procid_test.go +++ b/pkg/sentry/platform/procid/procid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 4f20716f7..00d92b092 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 46a8bda8e..7a3cb8f49 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s index 9bf87b6f6..63f98e40d 100644 --- a/pkg/sentry/platform/ptrace/stub_amd64.s +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go index c868a2d68..48c16c4a1 100644 --- a/pkg/sentry/platform/ptrace/stub_unsafe.go +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 6d5ad6b71..6a9da5db8 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index c38dc1ff8..d23a1133e 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 53adadadd..7523487e7 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go index 697431472..0c9263060 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index fe41641d3..ca6c4ac97 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index f09d045eb..18137e55d 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 84819f132..67242b92b 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index a3e992e0d..4a9affe64 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index 08c15ad65..d48fbd2d1 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go index ffa7eaf77..11c49855f 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/main.go +++ b/pkg/sentry/platform/ring0/gen_offsets/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index 62e67005e..e70eafde2 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 0d2b0f7dc..ab562bca7 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go index cfb3ad853..faf4240e5 100644 --- a/pkg/sentry/platform/ring0/kernel_unsafe.go +++ b/pkg/sentry/platform/ring0/kernel_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index 989e3e383..2b95a0141 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index 6f143ea5a..98a130525 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index ca5fd456b..753d31ef8 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go index 049fd0247..ee6e90a11 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go index aca778913..f48647b3a 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index ff5787f89..c7207ec18 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 878463018..746f614e5 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index a7f2ad9a4..2f82c4353 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index dca3f69ef..3e5dc7dc7 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index ca49d20f8..6bd8c3584 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index fa068e35e..0d9a51aa5 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go index afa4d473a..c4c71d23e 100644 --- a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go index 4991031c5..10c51e88d 100644 --- a/pkg/sentry/platform/ring0/ring0.go +++ b/pkg/sentry/platform/ring0/ring0.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index f489fcecb..7c88010d8 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s index 69947dec3..873ffa046 100644 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ b/pkg/sentry/platform/safecopy/atomic_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s index 7d1019f60..488b6e666 100644 --- a/pkg/sentry/platform/safecopy/memclr_amd64.s +++ b/pkg/sentry/platform/safecopy/memclr_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s index 96ef2eefc..0bf26fd7b 100644 --- a/pkg/sentry/platform/safecopy/memcpy_amd64.s +++ b/pkg/sentry/platform/safecopy/memcpy_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go index 90a2aad7b..c60f73103 100644 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ b/pkg/sentry/platform/safecopy/safecopy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go index 67df36121..1a682d28a 100644 --- a/pkg/sentry/platform/safecopy/safecopy_test.go +++ b/pkg/sentry/platform/safecopy/safecopy_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go index 72f243f8d..df1c35b66 100644 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s index a65cb0c26..06614f1b4 100644 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go index 0b58f6497..e91ff66ae 100644 --- a/pkg/sentry/safemem/block_unsafe.go +++ b/pkg/sentry/safemem/block_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go index fd917648b..6cb52439f 100644 --- a/pkg/sentry/safemem/io.go +++ b/pkg/sentry/safemem/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go index edac4c1d7..2eda8c3bb 100644 --- a/pkg/sentry/safemem/io_test.go +++ b/pkg/sentry/safemem/io_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go index 2f8002004..090932d3e 100644 --- a/pkg/sentry/safemem/safemem.go +++ b/pkg/sentry/safemem/safemem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go index 3e83b3851..fddcaf714 100644 --- a/pkg/sentry/safemem/seq_test.go +++ b/pkg/sentry/safemem/seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go index e0d29a0b3..83a6b7183 100644 --- a/pkg/sentry/safemem/seq_unsafe.go +++ b/pkg/sentry/safemem/seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 29bcf55ab..6b5d5f993 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index a455b919f..5913d47a8 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index db97e95f2..d44f5e88a 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/epsocket/device.go b/pkg/sentry/socket/epsocket/device.go index 17f2c9559..3cc138eb0 100644 --- a/pkg/sentry/socket/epsocket/device.go +++ b/pkg/sentry/socket/epsocket/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 47c575e7b..e90ef4835 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index dbc232d26..686554437 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/epsocket/save_restore.go index 2613f90de..34d9a7cf0 100644 --- a/pkg/sentry/socket/epsocket/save_restore.go +++ b/pkg/sentry/socket/epsocket/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index e4ed52fc8..c0081c819 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/device.go b/pkg/sentry/socket/hostinet/device.go index a9a673316..c5133f3bb 100644 --- a/pkg/sentry/socket/hostinet/device.go +++ b/pkg/sentry/socket/hostinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/hostinet.go b/pkg/sentry/socket/hostinet/hostinet.go index 67c6c8066..7858892ab 100644 --- a/pkg/sentry/socket/hostinet/hostinet.go +++ b/pkg/sentry/socket/hostinet/hostinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/save_restore.go b/pkg/sentry/socket/hostinet/save_restore.go index 0821a794a..3827f082a 100644 --- a/pkg/sentry/socket/hostinet/save_restore.go +++ b/pkg/sentry/socket/hostinet/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index e82624b44..e4e950fbb 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index f8bb75636..59c8910ca 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index f64809d39..4ce73c1f1 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index b902d7ec9..a95172cba 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index 1c5d4c3a5..20b9a6e37 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/port/port_test.go b/pkg/sentry/socket/netlink/port/port_test.go index 34565e2f9..49b3b48ab 100644 --- a/pkg/sentry/socket/netlink/port/port_test.go +++ b/pkg/sentry/socket/netlink/port/port_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 5d0a04a07..06786bd50 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 70322b9ed..7e70b09b2 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0c03997f2..4d4130a4c 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index f4c8489b1..9c749b888 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go index f7b63436e..d2b9f9222 100644 --- a/pkg/sentry/socket/rpcinet/device.go +++ b/pkg/sentry/socket/rpcinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index f88a908ed..73c255c33 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go index 10b0dedc2..6c98e6acb 100644 --- a/pkg/sentry/socket/rpcinet/rpcinet.go +++ b/pkg/sentry/socket/rpcinet/rpcinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index c7e761d54..44fa5c620 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index bcb89fb34..cb8344ec6 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go index 9a896c623..d04fb2069 100644 --- a/pkg/sentry/socket/rpcinet/stack_unsafe.go +++ b/pkg/sentry/socket/rpcinet/stack_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 31f8d42d7..a235c5249 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/device.go b/pkg/sentry/socket/unix/device.go index e8bcc7a9f..41820dbb3 100644 --- a/pkg/sentry/socket/unix/device.go +++ b/pkg/sentry/socket/unix/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 06333e14b..7d6434696 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 566e3d57b..4c913effc 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectioned_state.go b/pkg/sentry/socket/unix/transport/connectioned_state.go index 7e6c73dcc..608a6a97a 100644 --- a/pkg/sentry/socket/unix/transport/connectioned_state.go +++ b/pkg/sentry/socket/unix/transport/connectioned_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 86cd05199..cd4633106 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index c4d7d863c..5b4dfab68 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2934101a2..157133b65 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 668363864..3543dd81f 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 43e88a713..70b33f190 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index afa21672a..7f047b808 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state_unsafe.go b/pkg/sentry/state/state_unsafe.go index 3ff7d24c8..f02e12b2a 100644 --- a/pkg/sentry/state/state_unsafe.go +++ b/pkg/sentry/state/state_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/clone.go b/pkg/sentry/strace/clone.go index b82ca1ad1..e18ce84dc 100644 --- a/pkg/sentry/strace/clone.go +++ b/pkg/sentry/strace/clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/futex.go b/pkg/sentry/strace/futex.go index 3da108cb7..ceb3dc21d 100644 --- a/pkg/sentry/strace/futex.go +++ b/pkg/sentry/strace/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 1df148e7d..99714f12c 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/open.go b/pkg/sentry/strace/open.go index 839d5eda7..5a72a940c 100644 --- a/pkg/sentry/strace/open.go +++ b/pkg/sentry/strace/open.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/ptrace.go b/pkg/sentry/strace/ptrace.go index fcdb7e9f4..c572aafb4 100644 --- a/pkg/sentry/strace/ptrace.go +++ b/pkg/sentry/strace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 26831edd6..375418dc1 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index f7bfa3a1f..4286f0df7 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto index 914e8c7b0..f1fc539d6 100644 --- a/pkg/sentry/strace/strace.proto +++ b/pkg/sentry/strace/strace.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 8be4fa318..9eeb18a03 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index 01dd6fa71..b90d191b7 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 013b385bc..9fd002955 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index f01483cd3..d1e0833fc 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 4465549ad..75e87f5ec 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index bfb541634..a033b7c70 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 54e4afa9e..355071131 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index 89c81ac90..cf972dc28 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index e69dfc77a..62272efcd 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 60fe5a133..903172890 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 64704bb88..a70f35be0 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index d35dcecbe..cf04428bc 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 29c0d7a39..4b441b31b 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index 4fd0ed794..8d594aa83 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 725204dff..26a505782 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 97b51ba7c..ad3bfd761 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 1a98328dc..f8d9c43fd 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 57cedccc1..bf0df7302 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 2b544f145..3652c429e 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index b9bdefadb..bf0958435 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index a1242acd3..c7b39ede8 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index be31e6b17..452dff058 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 0be2d195a..b2e5a5449 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index d806b58ab..2f16e1791 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index 82e42b589..ab07c77f9 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index ff9e46077..e679a6694 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 4323a4df4..969acaa36 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index a8983705b..4ed52c4a7 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 48ff1d5f0..b13d48b98 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index ecdec5d3a..a539354c5 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 5fa5ddce6..0a7551742 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 619a14d7c..9c433c45d 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 902d210db..826c6869d 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 6560bac57..5eeb3ba58 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go index 792040c81..7193b7aed 100644 --- a/pkg/sentry/syscalls/linux/sys_syslog.go +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 550f63a43..820ca680e 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 8e6683444..063fbb106 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index c41074d54..6baf4599b 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index 92c6a3d60..f70d13682 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index b95d62320..27ddb3808 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index 899116374..689f2f838 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index caa7b01ea..08e263112 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index e865c6fc0..752ec326d 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/polling.go b/pkg/sentry/syscalls/polling.go index fd90184ef..2b33d6c19 100644 --- a/pkg/sentry/syscalls/polling.go +++ b/pkg/sentry/syscalls/polling.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 1176f858d..bae32d727 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/unimplemented_syscall.proto b/pkg/sentry/syscalls/unimplemented_syscall.proto index d6febf5b1..41579b016 100644 --- a/pkg/sentry/syscalls/unimplemented_syscall.proto +++ b/pkg/sentry/syscalls/unimplemented_syscall.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index cbb95e2d7..c8cf4eca4 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go index 8b6dd5592..a9237630e 100644 --- a/pkg/sentry/time/calibrated_clock_test.go +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clock_id.go b/pkg/sentry/time/clock_id.go index 500102e58..1317a5dad 100644 --- a/pkg/sentry/time/clock_id.go +++ b/pkg/sentry/time/clock_id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clocks.go b/pkg/sentry/time/clocks.go index 9925b407d..e26386520 100644 --- a/pkg/sentry/time/clocks.go +++ b/pkg/sentry/time/clocks.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/muldiv_amd64.s b/pkg/sentry/time/muldiv_amd64.s index 291940b1d..bfcb8c724 100644 --- a/pkg/sentry/time/muldiv_amd64.s +++ b/pkg/sentry/time/muldiv_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index 594b4874b..f3ad58454 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go index 7394fc5ee..4a0c4e880 100644 --- a/pkg/sentry/time/parameters_test.go +++ b/pkg/sentry/time/parameters_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler.go b/pkg/sentry/time/sampler.go index cf581b5fa..445690d49 100644 --- a/pkg/sentry/time/sampler.go +++ b/pkg/sentry/time/sampler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/time/sampler_test.go b/pkg/sentry/time/sampler_test.go index caf7e5c53..ec0e442b6 100644 --- a/pkg/sentry/time/sampler_test.go +++ b/pkg/sentry/time/sampler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler_unsafe.go b/pkg/sentry/time/sampler_unsafe.go index 7ea19d387..0f8eb4fc8 100644 --- a/pkg/sentry/time/sampler_unsafe.go +++ b/pkg/sentry/time/sampler_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/tsc_amd64.s b/pkg/sentry/time/tsc_amd64.s index 4cc604392..e53d477f7 100644 --- a/pkg/sentry/time/tsc_amd64.s +++ b/pkg/sentry/time/tsc_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index e48fabc2d..399d98c29 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go index ed7b04b9e..cbd7cfe19 100644 --- a/pkg/sentry/usage/cpu.go +++ b/pkg/sentry/usage/cpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go index 49faa507d..8e27a0a88 100644 --- a/pkg/sentry/usage/io.go +++ b/pkg/sentry/usage/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 92a478d85..7e065cb76 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/memory_unsafe.go b/pkg/sentry/usage/memory_unsafe.go index f990a7750..a3ae668a5 100644 --- a/pkg/sentry/usage/memory_unsafe.go +++ b/pkg/sentry/usage/memory_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/usage.go b/pkg/sentry/usage/usage.go index 3b3118659..ab327f8e2 100644 --- a/pkg/sentry/usage/usage.go +++ b/pkg/sentry/usage/usage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index 75346d854..c71d05afe 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go index fc94bee80..2a75aa60c 100644 --- a/pkg/sentry/usermem/addr.go +++ b/pkg/sentry/usermem/addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go index cf9d785ed..bd6a1ec8a 100644 --- a/pkg/sentry/usermem/addr_range_seq_test.go +++ b/pkg/sentry/usermem/addr_range_seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go index 13b2998b3..f5fd446fa 100644 --- a/pkg/sentry/usermem/addr_range_seq_unsafe.go +++ b/pkg/sentry/usermem/addr_range_seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go index 01a746404..274f568d0 100644 --- a/pkg/sentry/usermem/bytes_io.go +++ b/pkg/sentry/usermem/bytes_io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go index efd71fcbc..8bdf3a508 100644 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ b/pkg/sentry/usermem/bytes_io_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 5d8a1c558..1d6c0b4d6 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go index 563560da8..1991a9641 100644 --- a/pkg/sentry/usermem/usermem_test.go +++ b/pkg/sentry/usermem/usermem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go index 2484b0d82..9ec90f9ff 100644 --- a/pkg/sentry/usermem/usermem_x86.go +++ b/pkg/sentry/usermem/usermem_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 5b620693d..75b11237f 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_amd64.s b/pkg/sleep/commit_amd64.s index d525e5b79..d08df7f37 100644 --- a/pkg/sleep/commit_amd64.s +++ b/pkg/sleep/commit_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_asm.go b/pkg/sleep/commit_asm.go index 39a55df7e..90eef4cbc 100644 --- a/pkg/sleep/commit_asm.go +++ b/pkg/sleep/commit_asm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go index 584866cd8..967d22e24 100644 --- a/pkg/sleep/commit_noasm.go +++ b/pkg/sleep/commit_noasm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/empty.s b/pkg/sleep/empty.s index 8aca31bee..85d52cd9c 100644 --- a/pkg/sleep/empty.s +++ b/pkg/sleep/empty.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go index bc1738371..8feb9ffc2 100644 --- a/pkg/sleep/sleep_test.go +++ b/pkg/sleep/sleep_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go index b12cce681..45fb6f0ea 100644 --- a/pkg/sleep/sleep_unsafe.go +++ b/pkg/sleep/sleep_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/decode.go b/pkg/state/decode.go index 3ef59610b..54b5ad8b8 100644 --- a/pkg/state/decode.go +++ b/pkg/state/decode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/state/encode.go b/pkg/state/encode.go index fd052db12..577aaf051 100644 --- a/pkg/state/encode.go +++ b/pkg/state/encode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/encode_unsafe.go b/pkg/state/encode_unsafe.go index d96ba56d4..be94742a8 100644 --- a/pkg/state/encode_unsafe.go +++ b/pkg/state/encode_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/map.go b/pkg/state/map.go index c3d165501..0035d7250 100644 --- a/pkg/state/map.go +++ b/pkg/state/map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/object.proto b/pkg/state/object.proto index c78efed2a..d3b46ea97 100644 --- a/pkg/state/object.proto +++ b/pkg/state/object.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/printer.go b/pkg/state/printer.go index 2c8ce60a5..aee4b69fb 100644 --- a/pkg/state/printer.go +++ b/pkg/state/printer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/state.go b/pkg/state/state.go index 23a0b5922..4b141777e 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/state_test.go b/pkg/state/state_test.go index 38ad9da9c..22bcad9e1 100644 --- a/pkg/state/state_test.go +++ b/pkg/state/state_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile.go b/pkg/state/statefile/statefile.go index 9c86c1934..99158fd02 100644 --- a/pkg/state/statefile/statefile.go +++ b/pkg/state/statefile/statefile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile_test.go b/pkg/state/statefile/statefile_test.go index fa3fb9f2c..b4f400e01 100644 --- a/pkg/state/statefile/statefile_test.go +++ b/pkg/state/statefile/statefile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/state/stats.go b/pkg/state/stats.go index ddcc49f78..17ca258fc 100644 --- a/pkg/state/stats.go +++ b/pkg/state/stats.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/atomicptr_unsafe.go b/pkg/sync/atomicptr_unsafe.go index f12e9cb67..d943b7ff4 100644 --- a/pkg/sync/atomicptr_unsafe.go +++ b/pkg/sync/atomicptr_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/atomicptrtest/atomicptr_test.go b/pkg/sync/atomicptrtest/atomicptr_test.go index b458382b1..3262785ce 100644 --- a/pkg/sync/atomicptrtest/atomicptr_test.go +++ b/pkg/sync/atomicptrtest/atomicptr_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go index 0c992d5a4..cd7a02dca 100644 --- a/pkg/sync/memmove_unsafe.go +++ b/pkg/sync/memmove_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go index 968665078..1593b9e5d 100644 --- a/pkg/sync/norace_unsafe.go +++ b/pkg/sync/norace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go index d143a21c7..473eaddc6 100644 --- a/pkg/sync/race_unsafe.go +++ b/pkg/sync/race_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go index a18e1229a..bea31adc5 100644 --- a/pkg/sync/seqatomic_unsafe.go +++ b/pkg/sync/seqatomic_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/seqatomictest/seqatomic_test.go b/pkg/sync/seqatomictest/seqatomic_test.go index b785d2344..f5e1fbfff 100644 --- a/pkg/sync/seqatomictest/seqatomic_test.go +++ b/pkg/sync/seqatomictest/seqatomic_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go index 8e3304d69..732e856a4 100644 --- a/pkg/sync/seqcount.go +++ b/pkg/sync/seqcount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go index fa4abed1d..b14a8878e 100644 --- a/pkg/sync/seqcount_test.go +++ b/pkg/sync/seqcount_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/sync.go b/pkg/sync/sync.go index 36d4c4dee..22c5348d7 100644 --- a/pkg/sync/sync.go +++ b/pkg/sync/sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index 22009a799..74bbe9f5b 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index b9786b48f..20e756edb 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index dba6cb7de..6a66e23a2 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 5bc74e65e..4228707f4 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror_test.go b/pkg/syserror/syserror_test.go index fb7d8d5ee..0f0da5781 100644 --- a/pkg/syserror/syserror_test.go +++ b/pkg/syserror/syserror_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index b64dce720..81428770b 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 79b7c77ee..05a730a05 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go index c5dd2819f..d3a9a0f88 100644 --- a/pkg/tcpip/buffer/prependable.go +++ b/pkg/tcpip/buffer/prependable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index cea4e3657..24479ea40 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go index 02c264593..74a0a96fc 100644 --- a/pkg/tcpip/buffer/view_test.go +++ b/pkg/tcpip/buffer/view_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index 206531f20..5dfb3ca1d 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/arp.go b/pkg/tcpip/header/arp.go index ae373f112..22b259ccb 100644 --- a/pkg/tcpip/header/arp.go +++ b/pkg/tcpip/header/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go index e67c50f50..12f208fde 100644 --- a/pkg/tcpip/header/checksum.go +++ b/pkg/tcpip/header/checksum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index 99c29b750..77365bc41 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/gue.go b/pkg/tcpip/header/gue.go index aac4593c5..2ad13955a 100644 --- a/pkg/tcpip/header/gue.go +++ b/pkg/tcpip/header/gue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index af1e94b7f..3ac89cdae 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go index 7d35caff7..e317975e8 100644 --- a/pkg/tcpip/header/icmpv6.go +++ b/pkg/tcpip/header/icmpv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/interfaces.go b/pkg/tcpip/header/interfaces.go index 042006983..ac327d8a5 100644 --- a/pkg/tcpip/header/interfaces.go +++ b/pkg/tcpip/header/interfaces.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index 29570cc34..1b882d3d8 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 66c778fe1..d985b745d 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6_fragment.go b/pkg/tcpip/header/ipv6_fragment.go index 44b28b326..e36d5177b 100644 --- a/pkg/tcpip/header/ipv6_fragment.go +++ b/pkg/tcpip/header/ipv6_fragment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipversion_test.go b/pkg/tcpip/header/ipversion_test.go index 3ae9b7e4a..8301ba5cf 100644 --- a/pkg/tcpip/header/ipversion_test.go +++ b/pkg/tcpip/header/ipversion_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 6689a6dc5..567a21167 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp_test.go b/pkg/tcpip/header/tcp_test.go index 7854d3523..7cd98df3b 100644 --- a/pkg/tcpip/header/tcp_test.go +++ b/pkg/tcpip/header/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go index cf2602e50..31c8ef456 100644 --- a/pkg/tcpip/header/udp.go +++ b/pkg/tcpip/header/udp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index 113cbbf5e..da34032cc 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index ee99ada07..24af428dd 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 52e532ebb..19b007a9e 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index fc3f80c01..e6585be66 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index fc5231831..63b8c4451 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index a0a9d4acd..6a3e956ad 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go index 1f143c0db..89a8a9954 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index de7593d9c..f42ff98db 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index cea3cd6a1..be4a4fa9c 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe.go b/pkg/tcpip/link/sharedmem/pipe/pipe.go index 1a0edbaba..e014324cc 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go index db0737c98..30742ccb1 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go index 480dc4a23..f491d74a2 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/rx.go b/pkg/tcpip/link/sharedmem/pipe/rx.go index ff778cecd..8d641c76f 100644 --- a/pkg/tcpip/link/sharedmem/pipe/rx.go +++ b/pkg/tcpip/link/sharedmem/pipe/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/tx.go b/pkg/tcpip/link/sharedmem/pipe/tx.go index 717f5a4b1..e75175d98 100644 --- a/pkg/tcpip/link/sharedmem/pipe/tx.go +++ b/pkg/tcpip/link/sharedmem/pipe/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/queue_test.go b/pkg/tcpip/link/sharedmem/queue/queue_test.go index 3d5909cef..391165bc3 100644 --- a/pkg/tcpip/link/sharedmem/queue/queue_test.go +++ b/pkg/tcpip/link/sharedmem/queue/queue_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/rx.go b/pkg/tcpip/link/sharedmem/queue/rx.go index c40d62c33..d3a5da08a 100644 --- a/pkg/tcpip/link/sharedmem/queue/rx.go +++ b/pkg/tcpip/link/sharedmem/queue/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/sharedmem/queue/tx.go b/pkg/tcpip/link/sharedmem/queue/tx.go index 39b595e56..845108db1 100644 --- a/pkg/tcpip/link/sharedmem/queue/tx.go +++ b/pkg/tcpip/link/sharedmem/queue/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go index b8e39eca1..3eeab769e 100644 --- a/pkg/tcpip/link/sharedmem/rx.go +++ b/pkg/tcpip/link/sharedmem/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index ce6e86767..27d7eb3b9 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index ad987d382..4b8061b13 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go index f0be2dc73..b91adbaf7 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go index 42a21cb43..37da34831 100644 --- a/pkg/tcpip/link/sharedmem/tx.go +++ b/pkg/tcpip/link/sharedmem/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go index 04f3d494e..3d0d8d852 100644 --- a/pkg/tcpip/link/sniffer/pcap.go +++ b/pkg/tcpip/link/sniffer/pcap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index a30e57a32..1bd174bc3 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go index 1dec41982..e4c589dda 100644 --- a/pkg/tcpip/link/tun/tun_unsafe.go +++ b/pkg/tcpip/link/tun/tun_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index ef8c88561..9ffb7b7e9 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index 0a15c40de..5ebe09664 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 9d0881e11..2e0024925 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 50628e4a2..5894f9114 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/frag_heap.go b/pkg/tcpip/network/fragmentation/frag_heap.go index 6c7faafe4..55615c8e6 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap.go +++ b/pkg/tcpip/network/fragmentation/frag_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/frag_heap_test.go b/pkg/tcpip/network/fragmentation/frag_heap_test.go index a15540634..1b1b72e88 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap_test.go +++ b/pkg/tcpip/network/fragmentation/frag_heap_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 885e3cca2..a5dda0398 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go index fc62a15dd..5bf3463a9 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation_test.go +++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index b57fe82ec..c9ad2bef6 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index 4c137828f..a2bc9707a 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go index eddf7ca4d..07960ddf0 100644 --- a/pkg/tcpip/network/hash/hash.go +++ b/pkg/tcpip/network/hash/hash.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index e3c7af1f9..5c1e88e56 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index ee8172ac8..f82dc098f 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index d4eeeb5d9..d7801ec19 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 2b7067a50..190d548eb 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 81aba0923..14107443b 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index fabbdc8c7..12c818b48 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 25bd998e5..4d0b6ee9c 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 4e24efddb..41ef32921 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index 4ab6a1fa2..72577dfcb 100644 --- a/pkg/tcpip/ports/ports_test.go +++ b/pkg/tcpip/ports/ports_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index c4707736e..67e8f0b9e 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 910d1257f..ab40e9e0b 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/seqnum/seqnum.go b/pkg/tcpip/seqnum/seqnum.go index e507d02f7..f2b988839 100644 --- a/pkg/tcpip/seqnum/seqnum.go +++ b/pkg/tcpip/seqnum/seqnum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 3a147a75f..cb7b7116b 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index e46267f12..651fa17ac 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index dba95369c..3da99ac67 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 0acec2984..b6266eb55 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 6c6400c33..2b4185014 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index d1ec6a660..d4da980a9 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index b6c095efb..f2c6c9a8d 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index a0b3399a8..74bf2c99e 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index a7470d606..c8522ad9e 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 98cc3b120..f09760180 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index bf11c2175..413aee6c6 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go index d283f71c7..361e359d4 100644 --- a/pkg/tcpip/tcpip_test.go +++ b/pkg/tcpip/tcpip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/time.s b/pkg/tcpip/time.s index 8aca31bee..85d52cd9c 100644 --- a/pkg/tcpip/time.s +++ b/pkg/tcpip/time.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 2102e9633..231151bf3 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index 055daa918..b3f54cfe0 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/ping/endpoint_state.go b/pkg/tcpip/transport/ping/endpoint_state.go index a16087304..80721d227 100644 --- a/pkg/tcpip/transport/ping/endpoint_state.go +++ b/pkg/tcpip/transport/ping/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/ping/protocol.go b/pkg/tcpip/transport/ping/protocol.go index 549b1b2d3..1d504773b 100644 --- a/pkg/tcpip/transport/ping/protocol.go +++ b/pkg/tcpip/transport/ping/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index c22ed5ea7..5a88d25d0 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 27dbcace2..800d2409e 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/cubic.go b/pkg/tcpip/transport/tcp/cubic.go index 8cea416d2..003525d86 100644 --- a/pkg/tcpip/transport/tcp/cubic.go +++ b/pkg/tcpip/transport/tcp/cubic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index c88e98977..d3120c1d8 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 707d6be96..673a65c31 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index bed7ec6a6..e32c73aae 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index c80f3c7d6..2f90839e9 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index abdc825cd..753e1419e 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 92ef9c6f7..05ff9e0d7 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go index feb593234..e4f8b7d5a 100644 --- a/pkg/tcpip/transport/tcp/reno.go +++ b/pkg/tcpip/transport/tcp/reno.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack.go b/pkg/tcpip/transport/tcp/sack.go index 05bac08cb..24e48fe7b 100644 --- a/pkg/tcpip/transport/tcp/sack.go +++ b/pkg/tcpip/transport/tcp/sack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 51a3d6aba..fc87a05fd 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go index e3a3405ef..98422fadf 100644 --- a/pkg/tcpip/transport/tcp/segment_heap.go +++ b/pkg/tcpip/transport/tcp/segment_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 6a2d7bc0b..0c637d7ad 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go index 22f0bbf18..46b6d85a6 100644 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 0bd421ff4..eefe93d48 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index d536839af..86bbd643f 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index a61d0ca64..06b0702c5 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 48852ea47..04e046257 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index ca16fc8fa..b08df0fec 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 5b25534f4..0695e8150 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/timer.go b/pkg/tcpip/transport/tcp/timer.go index 938c0bcef..38240d2d5 100644 --- a/pkg/tcpip/transport/tcp/timer.go +++ b/pkg/tcpip/transport/tcp/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go index 5f8f1a64d..f7b2900de 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go index 514722ab7..aaeae9b18 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 840e95302..d777a80d0 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 70a37c7f2..db1e281ad 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 1334fec8a..b3fbed6e4 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index c3f592bd4..58a346cd9 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex.go b/pkg/tmutex/tmutex.go index bd5c681dd..df61d89f5 100644 --- a/pkg/tmutex/tmutex.go +++ b/pkg/tmutex/tmutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go index a9dc9972f..a4537cb3b 100644 --- a/pkg/tmutex/tmutex_test.go +++ b/pkg/tmutex/tmutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index f4800e0d9..deeea078d 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index 6c546825f..ecc670925 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go index fa15cf744..1d69de542 100644 --- a/pkg/unet/unet_unsafe.go +++ b/pkg/unet/unet_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 1ec06dd4c..753366be2 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/urpc/urpc_test.go b/pkg/urpc/urpc_test.go index d9cfc512e..f1b9a85ca 100644 --- a/pkg/urpc/urpc_test.go +++ b/pkg/urpc/urpc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/fdnotifier/fdnotifier.go b/pkg/waiter/fdnotifier/fdnotifier.go index 8bb93e39b..624b1a0c5 100644 --- a/pkg/waiter/fdnotifier/fdnotifier.go +++ b/pkg/waiter/fdnotifier/fdnotifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/fdnotifier/poll_unsafe.go b/pkg/waiter/fdnotifier/poll_unsafe.go index 26bca2b53..8459d4c74 100644 --- a/pkg/waiter/fdnotifier/poll_unsafe.go +++ b/pkg/waiter/fdnotifier/poll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 832b6a5a9..93390b299 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/waiter_test.go b/pkg/waiter/waiter_test.go index c45f22889..60853f9c1 100644 --- a/pkg/waiter/waiter_test.go +++ b/pkg/waiter/waiter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 3250cdcdc..6766953b3 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 51d20d06d..9ebbde424 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index bee82f344..6dd7fadd9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go index 971962c91..d224d08b7 100644 --- a/runsc/boot/debug.go +++ b/runsc/boot/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 595846b10..f954b8c0b 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 9416e3a5c..a3d21d963 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 92a73db9a..378396b9b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go index 82cf00dfb..67f3101fe 100644 --- a/runsc/boot/filter/extra_filters.go +++ b/runsc/boot/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index 76f3f6865..fb95283ab 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index ebd56c553..02a122c95 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index b656883ad..dc7294b1d 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index ea825e571..e52c89fe4 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 510497eba..8ecda6d0e 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c79b95bde..fa3de0133 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 41ff3681b..c342ee005 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 6a2678ac9..89f186139 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index 1e898672b..028bcc1f4 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 7a75a189a..d6058a8a2 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index cde915329..4a4713d4f 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 023b63dc0..7c14857ba 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go index 0b18c5481..e5da021e5 100644 --- a/runsc/cmd/capability.go +++ b/runsc/cmd/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index 3329b308d..dd278b32d 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 023ab2455..d49d0169b 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index 2937ae1c4..a1c3491a3 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 275a96f57..b84185b43 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index cb7d81057..288cbe435 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 92b609c3c..ea1ca1278 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go index f6d164394..4a5b4774a 100644 --- a/runsc/cmd/delete_test.go +++ b/runsc/cmd/delete_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index df65ea31d..df03415ec 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 336edf3f6..9a395e6f1 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go index 623461e78..686c5e150 100644 --- a/runsc/cmd/exec_test.go +++ b/runsc/cmd/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index fd4eee546..3842fdf64 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 7a98d10a2..1f1086250 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index 4d4a5cb0b..fd59b73e6 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go index c207b80da..baba937a8 100644 --- a/runsc/cmd/path.go +++ b/runsc/cmd/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index ac393b48e..5ff6f059c 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index 5d219bfdc..fd76cf975 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 6dc044672..cc99b3503 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index a12adf1a3..274b5d084 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 9a87cf240..b6a12f5d6 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index 6281fc49d..57ee37c86 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 97ea91fff..48bd4c401 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 265014e1b..f8ce8c3d8 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 956349140..121c54554 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/console/console.go b/runsc/console/console.go index 3df184742..9f4f9214d 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 8f019b54a..0b0dfb4cb 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/container.go b/runsc/container/container.go index f76bad1aa..cb4c9b5c1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 662591b3b..243528d35 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/fs.go b/runsc/container/fs.go index 2ed42fd93..41022686b 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/fs_test.go b/runsc/container/fs_test.go index 84bde18fb..87cdb078e 100644 --- a/runsc/container/fs_test.go +++ b/runsc/container/fs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/hook.go b/runsc/container/hook.go index 3d93ca0be..6b9e5550a 100644 --- a/runsc/container/hook.go +++ b/runsc/container/hook.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 1781a4602..4548eb106 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/status.go b/runsc/container/status.go index bf177e78a..234ffb0dd 100644 --- a/runsc/container/status.go +++ b/runsc/container/status.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index cc3b087e1..b5071ada6 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 35698f21f..75a087848 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go index 82cf00dfb..67f3101fe 100644 --- a/runsc/fsgofer/filter/extra_filters.go +++ b/runsc/fsgofer/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go index 169a79ed8..7e142b790 100644 --- a/runsc/fsgofer/filter/extra_filters_msan.go +++ b/runsc/fsgofer/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go index 9e6512d8c..3cd29472a 100644 --- a/runsc/fsgofer/filter/extra_filters_race.go +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index 6f341f688..f50b6bc87 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 9c4864cf1..e03bb7752 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index a500a2976..48860f952 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index e676809ac..99bc25ec1 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/main.go b/runsc/main.go index 62b1f01b3..4a92db7c0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go index 35b19a0b1..354049871 100644 --- a/runsc/sandbox/chroot.go +++ b/runsc/sandbox/chroot.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 86a52c6ae..52fe8fc0f 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 923a52f7f..0fe85cfe1 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 00293d45b..73fab13e1 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index b29802fde..ab14ed1fc 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 64e2172c8..b61f1ca62 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/image.go b/runsc/test/image/image.go index 069d08013..bcb6f876f 100644 --- a/runsc/test/image/image.go +++ b/runsc/test/image/image.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index d89d80a86..763152b47 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/mysql.sql b/runsc/test/image/mysql.sql index dd5bfaa4e..c1271e719 100644 --- a/runsc/test/image/mysql.sql +++ b/runsc/test/image/mysql.sql @@ -1,4 +1,4 @@ -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.rb b/runsc/test/image/ruby.rb index ae5de3419..25d1ac129 100644 --- a/runsc/test/image/ruby.rb +++ b/runsc/test/image/ruby.rb @@ -1,4 +1,4 @@ -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.sh b/runsc/test/image/ruby.sh index 54be2c931..d3a9b5656 100644 --- a/runsc/test/image/ruby.sh +++ b/runsc/test/image/ruby.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/install.sh b/runsc/test/install.sh index c239588d4..32e1e884e 100755 --- a/runsc/test/install.sh +++ b/runsc/test/install.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 3cac674d0..fac8337f4 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/integration.go b/runsc/test/integration/integration.go index 49c3c893a..e15321c87 100644 --- a/runsc/test/integration/integration.go +++ b/runsc/test/integration/integration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 536bb17e0..526b3a7a1 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go index 5cb4b794f..fdb94ff64 100644 --- a/runsc/test/root/cgroup_test.go +++ b/runsc/test/root/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 8831e6a78..0ffaaf87b 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/root.go b/runsc/test/root/root.go index 790f62c29..586ea0fe3 100644 --- a/runsc/test/root/root.go +++ b/runsc/test/root/root.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 7d6a72e5f..3f74e0770 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4d7ac3bc9..1b5a02c0f 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil_race.go b/runsc/test/testutil/testutil_race.go index 59cfdaa7b..9267af150 100644 --- a/runsc/test/testutil/testutil_race.go +++ b/runsc/test/testutil/testutil_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go index 0bd6cad93..110a581ff 100644 --- a/runsc/tools/dockercfg/dockercfg.go +++ b/runsc/tools/dockercfg/dockercfg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go index cc61a7537..eaf5c4970 100644 --- a/tools/go_generics/generics.go +++ b/tools/go_generics/generics.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_stmts/input.go b/tools/go_generics/generics_tests/all_stmts/input.go index 870af3b6c..19184a3fe 100644 --- a/tools/go_generics/generics_tests/all_stmts/input.go +++ b/tools/go_generics/generics_tests/all_stmts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_stmts/output/output.go b/tools/go_generics/generics_tests/all_stmts/output/output.go index e4e670bf1..51582346c 100644 --- a/tools/go_generics/generics_tests/all_stmts/output/output.go +++ b/tools/go_generics/generics_tests/all_stmts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/input.go b/tools/go_generics/generics_tests/all_types/input.go index 3a8643e3d..ed6e97c29 100644 --- a/tools/go_generics/generics_tests/all_types/input.go +++ b/tools/go_generics/generics_tests/all_types/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/lib/lib.go b/tools/go_generics/generics_tests/all_types/lib/lib.go index d3911d12d..7e73e678e 100644 --- a/tools/go_generics/generics_tests/all_types/lib/lib.go +++ b/tools/go_generics/generics_tests/all_types/lib/lib.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/output/output.go b/tools/go_generics/generics_tests/all_types/output/output.go index b89840936..ec09a6be4 100644 --- a/tools/go_generics/generics_tests/all_types/output/output.go +++ b/tools/go_generics/generics_tests/all_types/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/consts/input.go b/tools/go_generics/generics_tests/consts/input.go index dabf76e1e..394bcc262 100644 --- a/tools/go_generics/generics_tests/consts/input.go +++ b/tools/go_generics/generics_tests/consts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/consts/output/output.go b/tools/go_generics/generics_tests/consts/output/output.go index 72865607e..91a07fdc2 100644 --- a/tools/go_generics/generics_tests/consts/output/output.go +++ b/tools/go_generics/generics_tests/consts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/imports/input.go b/tools/go_generics/generics_tests/imports/input.go index 66b43fee5..22e6641a6 100644 --- a/tools/go_generics/generics_tests/imports/input.go +++ b/tools/go_generics/generics_tests/imports/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/imports/output/output.go b/tools/go_generics/generics_tests/imports/output/output.go index 5f20d43ce..2555c0004 100644 --- a/tools/go_generics/generics_tests/imports/output/output.go +++ b/tools/go_generics/generics_tests/imports/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/input.go b/tools/go_generics/generics_tests/remove_typedef/input.go index c02307d32..d9c9b8530 100644 --- a/tools/go_generics/generics_tests/remove_typedef/input.go +++ b/tools/go_generics/generics_tests/remove_typedef/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/output/output.go b/tools/go_generics/generics_tests/remove_typedef/output/output.go index d20a89abd..f111a9426 100644 --- a/tools/go_generics/generics_tests/remove_typedef/output/output.go +++ b/tools/go_generics/generics_tests/remove_typedef/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/simple/input.go b/tools/go_generics/generics_tests/simple/input.go index 670161d6e..711687cf5 100644 --- a/tools/go_generics/generics_tests/simple/input.go +++ b/tools/go_generics/generics_tests/simple/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/simple/output/output.go b/tools/go_generics/generics_tests/simple/output/output.go index 75b5467cd..139c9bf9d 100644 --- a/tools/go_generics/generics_tests/simple/output/output.go +++ b/tools/go_generics/generics_tests/simple/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/globals_visitor.go b/tools/go_generics/globals/globals_visitor.go index fc0de4381..daaa17b1d 100644 --- a/tools/go_generics/globals/globals_visitor.go +++ b/tools/go_generics/globals/globals_visitor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/scope.go b/tools/go_generics/globals/scope.go index 18743bdee..b75a91689 100644 --- a/tools/go_generics/globals/scope.go +++ b/tools/go_generics/globals/scope.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/go_generics_unittest.sh b/tools/go_generics/go_generics_unittest.sh index 699e1f631..e7553a071 100755 --- a/tools/go_generics/go_generics_unittest.sh +++ b/tools/go_generics/go_generics_unittest.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/go_generics/imports.go b/tools/go_generics/imports.go index 97267098b..57f7c3dce 100644 --- a/tools/go_generics/imports.go +++ b/tools/go_generics/imports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/merge.go b/tools/go_generics/merge.go index ebe7cf4e4..2f83facf8 100644 --- a/tools/go_generics/merge.go +++ b/tools/go_generics/merge.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/remove.go b/tools/go_generics/remove.go index 2a66de762..139d03955 100644 --- a/tools/go_generics/remove.go +++ b/tools/go_generics/remove.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template.go b/tools/go_generics/rules_tests/template.go index 73c024f0e..f3f31ae8e 100644 --- a/tools/go_generics/rules_tests/template.go +++ b/tools/go_generics/rules_tests/template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template_test.go b/tools/go_generics/rules_tests/template_test.go index 76c4cdb64..3a38c8629 100644 --- a/tools/go_generics/rules_tests/template_test.go +++ b/tools/go_generics/rules_tests/template_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 5646b879a..9e2c8e106 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh index d89db1f99..7d44dad37 100755 --- a/tools/workspace_status.sh +++ b/tools/workspace_status.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/barrier.h b/vdso/barrier.h index db8185b2e..7866af414 100644 --- a/vdso/barrier.h +++ b/vdso/barrier.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/check_vdso.py b/vdso/check_vdso.py index 9a3142ab8..6f7d7e7ec 100644 --- a/vdso/check_vdso.py +++ b/vdso/check_vdso.py @@ -1,4 +1,4 @@ -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/compiler.h b/vdso/compiler.h index a661516c3..d65f148fb 100644 --- a/vdso/compiler.h +++ b/vdso/compiler.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/cycle_clock.h b/vdso/cycle_clock.h index 93c5f2c0d..dfb5b427d 100644 --- a/vdso/cycle_clock.h +++ b/vdso/cycle_clock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/seqlock.h b/vdso/seqlock.h index b527bdbca..ab2f3fda3 100644 --- a/vdso/seqlock.h +++ b/vdso/seqlock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/syscalls.h b/vdso/syscalls.h index fd79c4642..0be8a7f9b 100644 --- a/vdso/syscalls.h +++ b/vdso/syscalls.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso.cc b/vdso/vdso.cc index db3bdef01..f30dc26a2 100644 --- a/vdso/vdso.cc +++ b/vdso/vdso.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso_time.cc b/vdso/vdso_time.cc index 5d5c8de65..a59771bff 100644 --- a/vdso/vdso_time.cc +++ b/vdso/vdso_time.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso_time.h b/vdso/vdso_time.h index 71d6e2f64..464dadff2 100644 --- a/vdso/vdso_time.h +++ b/vdso/vdso_time.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
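The patch above is a mechanical, tree-wide substitution of "Google LLC" for "Google Inc." in license headers. How the change was actually generated is not recorded here; the small Go program below is only a sketch of one way such a rewrite could be produced, and the file name, flag name, and extension list are assumptions for illustration rather than anything taken from the repository.

// relicense.go: illustrative sketch only. The real header change may have been
// produced by a different tool (for example a one-line sed invocation); the
// flag and extension list here are assumptions, not repository conventions.
package main

import (
	"bytes"
	"flag"
	"log"
	"os"
	"path/filepath"
	"strings"
)

var root = flag.String("root", ".", "source tree to rewrite")

func main() {
	flag.Parse()

	oldHolder := []byte("Copyright 2018 Google Inc.")
	newHolder := []byte("Copyright 2018 Google LLC")

	err := filepath.Walk(*root, func(path string, info os.FileInfo, err error) error {
		if err != nil || info.IsDir() {
			return err
		}
		// Only touch the kinds of files seen in the hunks above.
		switch strings.ToLower(filepath.Ext(path)) {
		case ".go", ".sh", ".py", ".rb", ".sql", ".h", ".cc", ".proto":
		default:
			return nil
		}
		data, err := os.ReadFile(path)
		if err != nil {
			return err
		}
		if !bytes.Contains(data, oldHolder) {
			return nil
		}
		// Headers appear once per file, so a single replacement is enough.
		// Rewrite in place, keeping the original file mode.
		return os.WriteFile(path, bytes.Replace(data, oldHolder, newHolder, 1), info.Mode())
	})
	if err != nil {
		log.Fatalf("rewrite failed: %v", err)
	}
}

Run against a checkout (for example, go run relicense.go -root=.), it rewrites only files whose headers still carry the old holder, which matches the shape of the one-line hunks above.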
-- cgit v1.2.3 From b2068cf5a5d43f3898cf389ab2d6151cf61908ac Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Sat, 20 Oct 2018 11:12:26 -0700 Subject: Add more unimplemented syscall events Added events for *ctl syscalls that may have multiple different commands. For runsc, each syscall event is only logged once. For *ctl syscalls, use the cmd as identifier, not only the syscall number. PiperOrigin-RevId: 218015941 Change-Id: Ie3c19131ae36124861e9b492a7dbe1765d9e5e59 --- pkg/abi/linux/ioctl.go | 75 +++++++++++++------ pkg/abi/linux/prctl.go | 99 +++++++++++++++++++++++-- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/tty.go | 30 ++++++++ pkg/sentry/fs/tty/BUILD | 1 + pkg/sentry/fs/tty/master.go | 45 ++++++++++- pkg/sentry/fs/tty/slave.go | 3 +- pkg/sentry/kernel/BUILD | 2 + pkg/sentry/kernel/kernel.go | 17 +++++ pkg/sentry/kernel/pipe/reader_writer.go | 3 +- pkg/sentry/kernel/task.go | 3 + pkg/sentry/socket/epsocket/BUILD | 1 + pkg/sentry/socket/epsocket/epsocket.go | 3 + pkg/sentry/socket/rpcinet/BUILD | 1 + pkg/sentry/socket/rpcinet/socket.go | 5 ++ pkg/sentry/syscalls/BUILD | 18 ----- pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_prctl.go | 39 +++++++++- pkg/sentry/syscalls/linux/sys_shm.go | 1 + pkg/sentry/syscalls/linux/sys_tls.go | 3 + pkg/sentry/syscalls/syscalls.go | 15 +--- pkg/sentry/syscalls/unimplemented_syscall.proto | 27 ------- pkg/sentry/unimpl/BUILD | 30 ++++++++ pkg/sentry/unimpl/events.go | 45 +++++++++++ pkg/sentry/unimpl/unimplemented_syscall.proto | 27 +++++++ runsc/boot/BUILD | 9 ++- runsc/boot/compat.go | 72 ++++++++++++++++-- runsc/boot/compat_amd64.go | 54 ++++++++++++++ runsc/boot/compat_test.go | 66 +++++++++++++++++ 29 files changed, 596 insertions(+), 101 deletions(-) delete mode 100644 pkg/sentry/syscalls/unimplemented_syscall.proto create mode 100644 pkg/sentry/unimpl/BUILD create mode 100644 pkg/sentry/unimpl/events.go create mode 100644 pkg/sentry/unimpl/unimplemented_syscall.proto create mode 100644 runsc/boot/compat_amd64.go create mode 100644 runsc/boot/compat_test.go (limited to 'runsc') diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 9afc3d1ef..191b26e4d 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -18,28 +18,59 @@ package linux // // These are ordered by request number (low byte). 
const ( - TCGETS = 0x00005401 - TCSETS = 0x00005402 - TCSETSW = 0x00005403 - TCSETSF = 0x00005404 - TIOCSCTTY = 0x0000540e - TIOCGPGRP = 0x0000540f - TIOCSPGRP = 0x00005410 - TIOCOUTQ = 0x00005411 - TIOCGWINSZ = 0x00005413 - TIOCSWINSZ = 0x00005414 - TIOCINQ = 0x0000541b - FIONREAD = TIOCINQ - FIONBIO = 0x00005421 - TIOCGPTN = 0x80045430 - TIOCSPTLCK = 0x40045431 - FIONCLEX = 0x00005450 - FIOCLEX = 0x00005451 - FIOASYNC = 0x00005452 - FIOSETOWN = 0x00008901 - SIOCSPGRP = 0x00008902 - FIOGETOWN = 0x00008903 - SIOCGPGRP = 0x00008904 + TCGETS = 0x00005401 + TCSETS = 0x00005402 + TCSETSW = 0x00005403 + TCSETSF = 0x00005404 + TCSBRK = 0x00005409 + TIOCEXCL = 0x0000540c + TIOCNXCL = 0x0000540d + TIOCSCTTY = 0x0000540e + TIOCGPGRP = 0x0000540f + TIOCSPGRP = 0x00005410 + TIOCOUTQ = 0x00005411 + TIOCSTI = 0x00005412 + TIOCGWINSZ = 0x00005413 + TIOCSWINSZ = 0x00005414 + TIOCMGET = 0x00005415 + TIOCMBIS = 0x00005416 + TIOCMBIC = 0x00005417 + TIOCMSET = 0x00005418 + TIOCINQ = 0x0000541b + FIONREAD = TIOCINQ + FIONBIO = 0x00005421 + TIOCSETD = 0x00005423 + TIOCNOTTY = 0x00005422 + TIOCGETD = 0x00005424 + TCSBRKP = 0x00005425 + TIOCSBRK = 0x00005427 + TIOCCBRK = 0x00005428 + TIOCGSID = 0x00005429 + TIOCGPTN = 0x80045430 + TIOCSPTLCK = 0x40045431 + TIOCGDEV = 0x80045432 + TIOCVHANGUP = 0x00005437 + TCFLSH = 0x0000540b + TIOCCONS = 0x0000541d + TIOCSSERIAL = 0x0000541f + TIOCGEXCL = 0x80045440 + TIOCGPTPEER = 0x80045441 + TIOCGICOUNT = 0x0000545d + FIONCLEX = 0x00005450 + FIOCLEX = 0x00005451 + FIOASYNC = 0x00005452 + FIOSETOWN = 0x00008901 + SIOCSPGRP = 0x00008902 + FIOGETOWN = 0x00008903 + SIOCGPGRP = 0x00008904 +) + +// ioctl(2) requests provided by uapi/linux/sockios.h +const ( + SIOCGIFMEM = 0x891f + SIOCGIFPFLAGS = 0x8935 + SIOCGMIIPHY = 0x8947 + SIOCGMIIREG = 0x8948 ) // ioctl(2) requests provided by uapi/linux/android/binder.h diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go index e152c4c27..db3206f36 100644 --- a/pkg/abi/linux/prctl.go +++ b/pkg/abi/linux/prctl.go @@ -22,26 +22,102 @@ const ( // PR_GET_PDEATHSIG will get the process' death signal. PR_GET_PDEATHSIG = 2 + // PR_GET_DUMPABLE will get the process's dumpable flag. + PR_GET_DUMPABLE = 3 + + // PR_SET_DUMPABLE will set the process's dumpable flag. + PR_SET_DUMPABLE = 4 + // PR_GET_KEEPCAPS will get the value of the keep capabilities flag. PR_GET_KEEPCAPS = 7 // PR_SET_KEEPCAPS will set the value of the keep capabilities flag. PR_SET_KEEPCAPS = 8 + // PR_GET_TIMING will get the process's timing method. + PR_GET_TIMING = 13 + + // PR_SET_TIMING will set the process's timing method. + PR_SET_TIMING = 14 + // PR_SET_NAME will set the process' name. PR_SET_NAME = 15 // PR_GET_NAME will get the process' name. PR_GET_NAME = 16 + // PR_GET_SECCOMP will get a process' seccomp mode. + PR_GET_SECCOMP = 21 + + // PR_SET_SECCOMP will set a process' seccomp mode. + PR_SET_SECCOMP = 22 + + // PR_CAPBSET_READ will get the capability bounding set. + PR_CAPBSET_READ = 23 + + // PR_CAPBSET_DROP will set the capability bounding set. + PR_CAPBSET_DROP = 24 + + // PR_GET_TSC will get the the value of the flag determining whether the + // timestamp counter can be read. + PR_GET_TSC = 25 + + // PR_SET_TSC will set the the value of the flag determining whether the + // timestamp counter can be read. + PR_SET_TSC = 26 + + // PR_SET_TIMERSLACK set the process's time slack. + PR_SET_TIMERSLACK = 29 + + // PR_GET_TIMERSLACK get the process's time slack. 
+ PR_GET_TIMERSLACK = 30 + + // PR_TASK_PERF_EVENTS_DISABLE disable all performance counters attached to + // the calling process. + PR_TASK_PERF_EVENTS_DISABLE = 31 + + // PR_TASK_PERF_EVENTS_ENABLE enable all performance counters attached to + // the calling process. + PR_TASK_PERF_EVENTS_ENABLE = 32 + + // PR_MCE_KILL set the machine check memory corruption kill policy for the + // calling thread. + PR_MCE_KILL = 33 + + // PR_MCE_KILL_GET get the machine check memory corruption kill policy for the + // calling thread. + PR_MCE_KILL_GET = 34 + // PR_SET_MM will modify certain kernel memory map descriptor fields of the // calling process. See prctl(2) for more information. PR_SET_MM = 35 + PR_SET_MM_START_CODE = 1 + PR_SET_MM_END_CODE = 2 + PR_SET_MM_START_DATA = 3 + PR_SET_MM_END_DATA = 4 + PR_SET_MM_START_STACK = 5 + PR_SET_MM_START_BRK = 6 + PR_SET_MM_BRK = 7 + PR_SET_MM_ARG_START = 8 + PR_SET_MM_ARG_END = 9 + PR_SET_MM_ENV_START = 10 + PR_SET_MM_ENV_END = 11 + PR_SET_MM_AUXV = 12 // PR_SET_MM_EXE_FILE will supersede the /proc/pid/exe symbolic link with a // new one pointing to a new executable file identified by the file descriptor // provided in arg3 argument. See prctl(2) for more information. PR_SET_MM_EXE_FILE = 13 + PR_SET_MM_MAP = 14 + PR_SET_MM_MAP_SIZE = 15 + + // PR_SET_CHILD_SUBREAPER set the "child subreaper" attribute of the calling + // process. + PR_SET_CHILD_SUBREAPER = 36 + + // PR_GET_CHILD_SUBREAPER get the "child subreaper" attribute of the calling + // process. + PR_GET_CHILD_SUBREAPER = 37 // PR_SET_NO_NEW_PRIVS will set the calling thread's no_new_privs bit. PR_SET_NO_NEW_PRIVS = 38 @@ -49,17 +125,24 @@ const ( // PR_GET_NO_NEW_PRIVS will get the calling thread's no_new_privs bit. PR_GET_NO_NEW_PRIVS = 39 - // PR_SET_SECCOMP will set a process' seccomp mode. - PR_SET_SECCOMP = 22 + // PR_GET_TID_ADDRESS retrieve the clear_child_tid address. + PR_GET_TID_ADDRESS = 40 - // PR_GET_SECCOMP will get a process' seccomp mode. - PR_GET_SECCOMP = 21 + // PR_SET_THP_DISABLE set the state of the "THP disable" flag for the calling + // thread. + PR_SET_THP_DISABLE = 41 - // PR_CAPBSET_READ will get the capability bounding set. - PR_CAPBSET_READ = 23 + // PR_GET_THP_DISABLE get the state of the "THP disable" flag for the calling + // thread. + PR_GET_THP_DISABLE = 42 - // PR_CAPBSET_DROP will set the capability bounding set. - PR_CAPBSET_DROP = 24 + // PR_MPX_ENABLE_MANAGEMENT enable kernel management of Memory Protection + // eXtensions (MPX) bounds tables. + PR_MPX_ENABLE_MANAGEMENT = 43 + + // PR_MPX_DISABLE_MANAGEMENTdisable kernel management of Memory Protection + // eXtensions (MPX) bounds tables. 
+ PR_MPX_DISABLE_MANAGEMENT = 44 ) // From diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 4f264a024..d1eb9bd64 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index cf3639c46..f0bcdc908 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -179,6 +180,35 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. err := ioctlSetWinsize(fd, &winsize) return 0, err + // Unimplemented commands. + case linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + fallthrough default: return 0, syserror.ENOTTY } diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index d4dd20e30..2b45069a6 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index dad0cad79..00bec4c2c 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -20,6 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -149,7 +150,7 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm // Ioctl implements fs.FileOperations.Ioctl. func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch args[1].Uint() { + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. return 0, mf.t.ld.outputQueueReadSize(ctx, io, args) @@ -177,6 +178,48 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args a case linux.TIOCSWINSZ: return 0, mf.t.ld.setWindowSize(ctx, io, args) default: + maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY } } + +// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. 
+func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { + switch cmd { + case linux.TCGETS, + linux.TCSETS, + linux.TCSETSW, + linux.TCSETSF, + linux.TIOCGPGRP, + linux.TIOCSPGRP, + linux.TIOCGWINSZ, + linux.TIOCSWINSZ, + linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + } +} diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 9de3168bf..a696fbb51 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -134,7 +134,7 @@ func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src userme // Ioctl implements fs.FileOperations.Ioctl. func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch args[1].Uint() { + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the input queue read buffer. return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args) @@ -161,6 +161,7 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar // control. return 0, nil default: + maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY } } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e2fb61ba6..389824b25 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -157,6 +157,8 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/time", + "//pkg/sentry/unimpl", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index bad558d48..17425e656 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -40,6 +40,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -58,6 +59,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" + uspb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/state" "gvisor.googlesource.com/gvisor/pkg/tcpip" @@ -595,6 +598,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k default: return nil } @@ -1033,6 +1038,16 @@ func (k *Kernel) SupervisorContext() context.Context { } } +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. 
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + t := TaskFromContext(ctx) + eventchannel.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} + type supervisorContext struct { context.NoopSleeper log.Logger @@ -1073,6 +1088,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k default: return nil } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 63efc5bbe..36be1efc3 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -19,6 +19,7 @@ import ( "math" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -77,7 +78,7 @@ func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Switch on ioctl request. switch int(args[1].Int()) { - case syscall.TIOCINQ: + case linux.FIONREAD: v := rw.queuedSize() if v > math.MaxInt32 { panic(fmt.Sprintf("Impossibly large pipe queued size: %d", v)) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e22ec768d..73ba8bee9 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -30,6 +30,7 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -594,6 +595,8 @@ func (t *Task) Value(key interface{}) interface{} { return t.k case uniqueid.CtxInotifyCookie: return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k default: return nil } diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index dbabc931c..da4aaf510 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index e90ef4835..39a0b9941 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -45,6 +45,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -1184,6 +1185,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc }) return 0, err + case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: + unimpl.EmitUnimplementedEvent(ctx) } return 0, syserror.ENOTTY diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 3ea433360..38fa54283 100644 
--- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/socket/rpcinet/conn", "//pkg/sentry/socket/rpcinet/notifier", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 44fa5c620..788d853c9 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -32,6 +32,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -555,6 +556,10 @@ func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.S }) return 0, err + + case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: + unimpl.EmitUnimplementedEvent(ctx) + default: return 0, syserror.ENOTTY } diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 22a757095..2a9f0915e 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") go_library( name = "syscalls", @@ -13,9 +12,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls", visibility = ["//:sandbox"], deps = [ - ":unimplemented_syscall_go_proto", "//pkg/abi/linux", - "//pkg/eventchannel", "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/kernel", @@ -26,18 +23,3 @@ go_library( "//pkg/waiter", ], ) - -proto_library( - name = "unimplemented_syscall_proto", - srcs = ["unimplemented_syscall.proto"], - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_proto"], -) - -go_proto_library( - name = "unimplemented_syscall_go_proto", - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto", - proto = ":unimplemented_syscall_proto", - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_go_proto"], -) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 75e87f5ec..11bf81f88 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -369,7 +369,7 @@ var AMD64 = &kernel.SyscallTable{ 0xffffffffff600800: 309, // vsyscall getcpu(2) }, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { - syscalls.UnimplementedEvent(t) + t.Kernel().EmitUnimplementedEvent(t) return 0, syserror.ENOSYS }, } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index c7b39ede8..91e852049 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -104,6 +104,22 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Set the underlying executable. 
t.MemoryManager().SetExecutable(file.Dirent) + + case linux.PR_SET_MM_AUXV, + linux.PR_SET_MM_START_CODE, + linux.PR_SET_MM_END_CODE, + linux.PR_SET_MM_START_DATA, + linux.PR_SET_MM_END_DATA, + linux.PR_SET_MM_START_STACK, + linux.PR_SET_MM_START_BRK, + linux.PR_SET_MM_BRK, + linux.PR_SET_MM_ARG_START, + linux.PR_SET_MM_ARG_END, + linux.PR_SET_MM_ENV_START, + linux.PR_SET_MM_ENV_END: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough default: return 0, nil, syscall.EINVAL } @@ -151,8 +167,29 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return 0, nil, t.DropBoundingCapability(cp) + case linux.PR_GET_DUMPABLE, + linux.PR_SET_DUMPABLE, + linux.PR_GET_TIMING, + linux.PR_SET_TIMING, + linux.PR_GET_TSC, + linux.PR_SET_TSC, + linux.PR_TASK_PERF_EVENTS_DISABLE, + linux.PR_TASK_PERF_EVENTS_ENABLE, + linux.PR_GET_TIMERSLACK, + linux.PR_SET_TIMERSLACK, + linux.PR_MCE_KILL, + linux.PR_MCE_KILL_GET, + linux.PR_GET_TID_ADDRESS, + linux.PR_SET_CHILD_SUBREAPER, + linux.PR_GET_CHILD_SUBREAPER, + linux.PR_GET_THP_DISABLE, + linux.PR_SET_THP_DISABLE, + linux.PR_MPX_ENABLE_MANAGEMENT, + linux.PR_MPX_DISABLE_MANAGEMENT: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough default: - t.Warningf("Unsupported prctl %d", option) return 0, nil, syscall.EINVAL } diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index b13d48b98..5f887523a 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -147,6 +147,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // We currently do not support memmory locking anywhere. // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the // same here. + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, nil default: diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index 27ddb3808..40e84825b 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -45,6 +45,9 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys regs.Fs = 0 regs.Fs_base = fsbase + case linux.ARCH_GET_GS, linux.ARCH_SET_GS: + t.Kernel().EmitUnimplementedEvent(t) + fallthrough default: return 0, nil, syscall.EINVAL } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index bae32d727..425ce900c 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -26,10 +26,8 @@ package syscalls import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/eventchannel" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - uspb "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -44,7 +42,7 @@ func Error(err error) kernel.SyscallFn { // syscall event via the event channel and returns the passed error. func ErrorWithEvent(err error) kernel.SyscallFn { return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - UnimplementedEvent(t) + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, err } } @@ -57,16 +55,7 @@ func CapError(c linux.Capability) kernel.SyscallFn { if !t.HasCapability(c) { return 0, nil, syserror.EPERM } - UnimplementedEvent(t) + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS } } - -// UnimplementedEvent emits an UnimplementedSyscall event via the event -// channel. 
-func UnimplementedEvent(t *kernel.Task) { - eventchannel.Emit(&uspb.UnimplementedSyscall{ - Tid: int32(t.ThreadID()), - Registers: t.Arch().StateData().Proto(), - }) -} diff --git a/pkg/sentry/syscalls/unimplemented_syscall.proto b/pkg/sentry/syscalls/unimplemented_syscall.proto deleted file mode 100644 index 41579b016..000000000 --- a/pkg/sentry/syscalls/unimplemented_syscall.proto +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -package gvisor; - -import "pkg/sentry/arch/registers.proto"; - -message UnimplementedSyscall { - // Task ID. - int32 tid = 1; - - // Registers at the time of the call. - Registers registers = 2; -} diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD new file mode 100644 index 000000000..63da5e81f --- /dev/null +++ b/pkg/sentry/unimpl/BUILD @@ -0,0 +1,30 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + +proto_library( + name = "unimplemented_syscall_proto", + srcs = ["unimplemented_syscall.proto"], + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_proto"], +) + +go_proto_library( + name = "unimplemented_syscall_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto", + proto = ":unimplemented_syscall_proto", + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_go_proto"], +) + +go_library( + name = "unimpl", + srcs = ["events.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl", + visibility = ["//:sandbox"], + deps = [ + "//pkg/log", + "//pkg/sentry/context", + ], +) diff --git a/pkg/sentry/unimpl/events.go b/pkg/sentry/unimpl/events.go new file mode 100644 index 000000000..f78f8c981 --- /dev/null +++ b/pkg/sentry/unimpl/events.go @@ -0,0 +1,45 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package unimpl contains interface to emit events about unimplemented +// features. +package unimpl + +import ( + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the events package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxEvents is a Context.Value key for a Events. + CtxEvents contextID = iota +) + +// Events interface defines method to emit unsupported events. 
+type Events interface { + EmitUnimplementedEvent(context.Context) +} + +// EmitUnimplementedEvent emits unsupported syscall event to the context. +func EmitUnimplementedEvent(ctx context.Context) { + e := ctx.Value(CtxEvents) + if e == nil { + log.Warningf("Context.Value(CtxEvents) not present, unimplemented syscall event not reported.") + return + } + e.(Events).EmitUnimplementedEvent(ctx) +} diff --git a/pkg/sentry/unimpl/unimplemented_syscall.proto b/pkg/sentry/unimpl/unimplemented_syscall.proto new file mode 100644 index 000000000..41579b016 --- /dev/null +++ b/pkg/sentry/unimpl/unimplemented_syscall.proto @@ -0,0 +1,27 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +import "pkg/sentry/arch/registers.proto"; + +message UnimplementedSyscall { + // Task ID. + int32 tid = 1; + + // Registers at the time of the call. + Registers registers = 2; +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index f8f848ebf..04cc0e854 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -6,6 +6,7 @@ go_library( name = "boot", srcs = [ "compat.go", + "compat_amd64.go", "config.go", "controller.go", "debug.go", @@ -59,9 +60,9 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/state", "//pkg/sentry/strace", - "//pkg/sentry/syscalls:unimplemented_syscall_go_proto", "//pkg/sentry/syscalls/linux", "//pkg/sentry/time", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", "//pkg/sentry/watchdog", "//pkg/syserror", @@ -87,12 +88,16 @@ go_library( go_test( name = "boot_test", size = "small", - srcs = ["loader_test.go"], + srcs = [ + "compat_test.go", + "loader_test.go", + ], embed = [":boot"], deps = [ "//pkg/control/server", "//pkg/log", "//pkg/p9", + "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/unet", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 6766953b3..d18c2f802 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -17,6 +17,8 @@ package boot import ( "fmt" "os" + "sync" + "syscall" "github.com/golang/protobuf/proto" "gvisor.googlesource.com/gvisor/pkg/abi" @@ -25,7 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/strace" - spb "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto" + spb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" ) func initCompatLogs(fd int) error { @@ -40,15 +42,27 @@ func initCompatLogs(fd int) error { type compatEmitter struct { sink *log.BasicLogger nameMap strace.SyscallMap + + // mu protects the fields below. + mu sync.Mutex + + // trackers map syscall number to the respective tracker instance. + // Protected by 'mu'. + trackers map[uint64]syscallTracker } func newCompatEmitter(logFD int) (*compatEmitter, error) { - // Always logs to default logger. 
nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64) if !ok { return nil, fmt.Errorf("amd64 Linux syscall table not found") } - c := &compatEmitter{sink: log.Log(), nameMap: nameMap} + + c := &compatEmitter{ + // Always logs to default logger. + sink: log.Log(), + nameMap: nameMap, + trackers: make(map[uint64]syscallTracker), + } if logFD > 0 { f := os.NewFile(uintptr(logFD), "user log file") @@ -61,10 +75,33 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) { // Emit implements eventchannel.Emitter. func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { // Only interested in UnimplementedSyscall, skip the rest. - if us, ok := msg.(*spb.UnimplementedSyscall); ok { - regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 - sysnr := regs.OrigRax + us, ok := msg.(*spb.UnimplementedSyscall) + if !ok { + return false, nil + } + regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 + + c.mu.Lock() + defer c.mu.Unlock() + + sysnr := regs.OrigRax + tr := c.trackers[sysnr] + if tr == nil { + switch sysnr { + case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL: + tr = newCmdTracker(0) + + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL: + tr = newCmdTracker(1) + + default: + tr = &onceTracker{} + } + c.trackers[sysnr] = tr + } + if tr.shouldReport(regs) { c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) + tr.onReported(regs) } return false, nil } @@ -74,3 +111,26 @@ func (c *compatEmitter) Close() error { c.sink = nil return nil } + +// syscallTracker interface allows filters to apply differently depending on +// the syscall and arguments. +type syscallTracker interface { + // shouldReport returns true is the syscall should be reported. + shouldReport(regs *rpb.AMD64Registers) bool + + // onReported marks the syscall as reported. + onReported(regs *rpb.AMD64Registers) +} + +// onceTracker reports only a single time, used for most syscalls. +type onceTracker struct { + reported bool +} + +func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool { + return !o.reported +} + +func (o *onceTracker) onReported(_ *rpb.AMD64Registers) { + o.reported = true +} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go new file mode 100644 index 000000000..2bb769a49 --- /dev/null +++ b/runsc/boot/compat_amd64.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" +) + +// cmdTracker reports only a single time for each different command argument in +// the syscall. It's used for generic syscalls like ioctl to report once per +// 'cmd' +type cmdTracker struct { + // argIdx is the syscall argument index where the command is located. 
+ argIdx int + cmds map[uint32]struct{} +} + +func newCmdTracker(argIdx int) *cmdTracker { + return &cmdTracker{argIdx: argIdx, cmds: make(map[uint32]struct{})} +} + +// cmd returns the command based on the syscall argument index. +func (c *cmdTracker) cmd(regs *rpb.AMD64Registers) uint32 { + switch c.argIdx { + case 0: + return uint32(regs.Rdi) + case 1: + return uint32(regs.Rsi) + } + panic(fmt.Sprintf("unsupported syscall argument index %d", c.argIdx)) +} + +func (c *cmdTracker) shouldReport(regs *rpb.AMD64Registers) bool { + _, ok := c.cmds[c.cmd(regs)] + return !ok +} + +func (c *cmdTracker) onReported(regs *rpb.AMD64Registers) { + c.cmds[c.cmd(regs)] = struct{}{} +} diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go new file mode 100644 index 000000000..30b94798a --- /dev/null +++ b/runsc/boot/compat_test.go @@ -0,0 +1,66 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "testing" + + rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" +) + +func TestOnceTracker(t *testing.T) { + o := onceTracker{} + if !o.shouldReport(nil) { + t.Error("first call to checkAndMark, got: false, want: true") + } + o.onReported(nil) + for i := 0; i < 2; i++ { + if o.shouldReport(nil) { + t.Error("after first call to checkAndMark, got: true, want: false") + } + } +} + +func TestCmdTracker(t *testing.T) { + for _, tc := range []struct { + name string + idx int + rdi1 uint64 + rdi2 uint64 + rsi1 uint64 + rsi2 uint64 + want bool + }{ + {name: "same rdi", idx: 0, rdi1: 123, rdi2: 123, want: false}, + {name: "same rsi", idx: 1, rsi1: 123, rsi2: 123, want: false}, + {name: "diff rdi", idx: 0, rdi1: 123, rdi2: 321, want: true}, + {name: "diff rsi", idx: 1, rsi1: 123, rsi2: 321, want: true}, + {name: "cmd is uint32", idx: 0, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, + } { + t.Run(tc.name, func(t *testing.T) { + c := newCmdTracker(tc.idx) + regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1} + if !c.shouldReport(regs) { + t.Error("first call to checkAndMark, got: false, want: true") + } + c.onReported(regs) + + regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2 + if got := c.shouldReport(regs); tc.want != got { + t.Errorf("after first call to checkAndMark, got: %t, want: %t", got, tc.want) + } + }) + } +} -- cgit v1.2.3 From c2c0f9cb7e8320de06ef280c6184bb6aeda71627 Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Sun, 21 Oct 2018 19:41:44 -0700 Subject: Updated cleanup code to be more explicit about ignoring errors. Errors are shown as being ignored by assigning to the blank identifier. 
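A minimal, self-contained illustration of the idiom described in the commit message above (cleanup errors deliberately discarded with the blank identifier). The cleanup type here is a simplified stand-in for the helper in runsc/specutils, not the real implementation; the Release method is an assumption added for completeness, since only Clean appears in the diffs below.

package main

import (
	"errors"
	"fmt"
)

// cleanup runs a function on Clean unless it has been released first.
type cleanup struct {
	f        func()
	released bool
}

func makeCleanup(f func()) *cleanup { return &cleanup{f: f} }

func (c *cleanup) Clean() {
	if !c.released {
		c.f()
	}
}

// Release disarms the cleanup. (Assumed here for completeness; only Clean
// appears in the diffs below.)
func (c *cleanup) Release() { c.released = true }

func destroy() error { return errors.New("nothing to destroy") }

func create() error {
	// Tear down partial state if anything below fails. The teardown error is
	// irrelevant at that point, so it is explicitly ignored with `_ =`.
	cu := makeCleanup(func() { _ = destroy() })
	defer cu.Clean()

	// ... creation steps that may fail would go here ...

	cu.Release() // Success: keep the created state.
	return nil
}

func main() {
	fmt.Println(create()) // <nil>
}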
PiperOrigin-RevId: 218103819 Change-Id: I7cc7b9d8ac503a03de5504ebdeb99ed30a531cf2 --- runsc/cgroup/cgroup.go | 4 +++- runsc/container/container.go | 6 +++++- runsc/sandbox/sandbox.go | 4 +++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index d6058a8a2..0ceeb3f28 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -190,7 +190,9 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { // Mark that cgroup resources are owned by me. log.Debugf("Creating cgroup %q", c.Name) c.Own = true - clean := specutils.MakeCleanup(func() { c.Uninstall() }) + // The Cleanup object cleans up partially created cgroups when an error occurs. + // Errors occurring during cleanup itself are ignored. + clean := specutils.MakeCleanup(func() { _ = c.Uninstall() }) defer clean.Clean() for key, ctrl := range controllers { diff --git a/runsc/container/container.go b/runsc/container/container.go index cb4c9b5c1..9da25a863 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -262,7 +262,9 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo Status: Creating, Owner: os.Getenv("USER"), } - cu := specutils.MakeCleanup(func() { c.Destroy() }) + // The Cleanup object cleans up partially created containers when an error occurs. + // Any errors occurring during cleanup itself are ignored. + cu := specutils.MakeCleanup(func() { _ = c.Destroy() }) defer cu.Clean() // If the metadata annotations indicate that this container should be @@ -424,6 +426,8 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke if err != nil { return 0, fmt.Errorf("error creating container: %v", err) } + // Clean up partially created container if an error occurs. + // Any errors returned by Destroy() itself are ignored. defer c.Destroy() if err := c.Start(conf); err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 0fe85cfe1..df235c5e9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -68,7 +68,9 @@ type Sandbox struct { // sandbox. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} - c := specutils.MakeCleanup(func() { s.destroy() }) + // The Cleanup object cleans up partially created sandboxes when an error occurs. + // Any errors occurring during cleanup itself are ignored. + c := specutils.MakeCleanup(func() { _ = s.destroy() }) defer c.Clean() if cg, ok := cgroup.New(spec); ok { -- cgit v1.2.3 From 75cd70ecc9abfd5daaefea04da5070a0e0d620dd Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 23 Oct 2018 00:19:11 -0700 Subject: Track paths and provide a rename hook. This change also adds extensive testing to the p9 package via mocks. The sanity checks and type checks are moved from the gofer into the core package, where they can be more easily validated.
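The "rename hook" mentioned above is the new Renamed callback: the p9 server invokes it on a server-side File after a successful rename (under its global rename ordering), so implementations can keep any cached path current. A minimal sketch of how a backend might honor that hook follows; the localFile type and its simplified signature (a concrete type instead of p9.File) are illustrative only, patterned on the local_server change further below in this patch.

package main

import (
	"fmt"
	"path"
)

// localFile caches the host path that backs it.
type localFile struct {
	path string
}

// Renamed mirrors the p9.File.Renamed hook added in this change: it is called
// after a successful rename, so the implementation only needs to refresh the
// cached path from the new parent and name.
func (l *localFile) Renamed(newDir *localFile, newName string) {
	l.path = path.Join(newDir.path, newName)
}

func main() {
	newDir := &localFile{path: "/srv/newdir"}
	f := &localFile{path: "/srv/olddir/data.txt"}
	f.Renamed(newDir, "data.txt")
	fmt.Println(f.path) // /srv/newdir/data.txt
}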
PiperOrigin-RevId: 218296768 Change-Id: I4fc3c326e7bf1e0e140a454cbacbcc6fd617ab55 --- WORKSPACE | 20 +- pkg/amutex/BUILD | 4 +- pkg/atomicbitops/BUILD | 4 +- pkg/binary/BUILD | 4 +- pkg/bits/BUILD | 3 +- pkg/compressio/BUILD | 4 +- pkg/control/client/BUILD | 4 +- pkg/control/server/BUILD | 4 +- pkg/dhcp/BUILD | 4 +- pkg/eventchannel/BUILD | 4 +- pkg/fd/BUILD | 4 +- pkg/gate/BUILD | 4 +- pkg/ilist/BUILD | 4 +- pkg/linewriter/BUILD | 4 +- pkg/log/BUILD | 4 +- pkg/metric/BUILD | 4 +- pkg/p9/BUILD | 2 + pkg/p9/buffer_test.go | 31 + pkg/p9/client.go | 6 + pkg/p9/client_file.go | 4 +- pkg/p9/file.go | 151 +- pkg/p9/handlers.go | 697 ++++++-- pkg/p9/local_server/BUILD | 4 +- pkg/p9/local_server/local_server.go | 5 + pkg/p9/messages_test.go | 37 + pkg/p9/p9.go | 24 + pkg/p9/p9test/BUILD | 76 +- pkg/p9/p9test/client_test.go | 2263 ++++++++++++++++++++++--- pkg/p9/p9test/mocks.go | 489 ------ pkg/p9/p9test/p9test.go | 329 ++++ pkg/p9/path_tree.go | 109 ++ pkg/p9/server.go | 228 ++- pkg/p9/transport.go | 10 +- pkg/rand/BUILD | 4 +- pkg/seccomp/BUILD | 4 +- pkg/secio/BUILD | 4 +- pkg/sentry/arch/BUILD | 3 +- pkg/sentry/context/BUILD | 4 +- pkg/sentry/control/BUILD | 4 +- pkg/sentry/device/BUILD | 4 +- pkg/sentry/fs/anon/BUILD | 4 +- pkg/sentry/fs/gofer/BUILD | 4 - pkg/sentry/fs/gofer/context_file.go | 7 - pkg/sentry/fs/gofer/gofer_test.go | 894 ++-------- pkg/sentry/fs/gofer/session.go | 9 +- pkg/sentry/fs/gofer/session_state.go | 4 +- pkg/sentry/fs/proc/device/BUILD | 4 +- pkg/sentry/hostcpu/BUILD | 4 +- pkg/sentry/kernel/kdefs/BUILD | 4 +- pkg/sentry/kernel/memevent/BUILD | 4 +- pkg/sentry/kernel/sched/BUILD | 4 +- pkg/sentry/loader/BUILD | 3 +- pkg/sentry/memutil/BUILD | 4 +- pkg/sentry/platform/interrupt/BUILD | 4 +- pkg/sentry/platform/kvm/BUILD | 3 +- pkg/sentry/platform/kvm/testutil/BUILD | 4 +- pkg/sentry/platform/procid/BUILD | 4 +- pkg/sentry/platform/ptrace/BUILD | 4 +- pkg/sentry/platform/ring0/BUILD | 3 +- pkg/sentry/platform/ring0/gen_offsets/BUILD | 3 +- pkg/sentry/platform/ring0/pagetables/BUILD | 3 +- pkg/sentry/platform/safecopy/BUILD | 4 +- pkg/sentry/safemem/BUILD | 4 +- pkg/sentry/sighandling/BUILD | 4 +- pkg/sentry/socket/rpcinet/BUILD | 4 +- pkg/sentry/socket/rpcinet/conn/BUILD | 4 +- pkg/sentry/socket/rpcinet/notifier/BUILD | 4 +- pkg/sentry/state/BUILD | 4 +- pkg/sentry/strace/BUILD | 4 +- pkg/sentry/syscalls/BUILD | 4 +- pkg/sentry/time/BUILD | 3 +- pkg/sentry/unimpl/BUILD | 4 +- pkg/sentry/uniqueid/BUILD | 4 +- pkg/sentry/watchdog/BUILD | 4 +- pkg/sleep/BUILD | 4 +- pkg/state/BUILD | 5 +- pkg/state/statefile/BUILD | 4 +- pkg/sync/atomicptrtest/BUILD | 3 +- pkg/sync/seqatomictest/BUILD | 3 +- pkg/syserr/BUILD | 4 +- pkg/syserror/BUILD | 4 +- pkg/tcpip/adapters/gonet/BUILD | 4 +- pkg/tcpip/checker/BUILD | 4 +- pkg/tcpip/link/channel/BUILD | 4 +- pkg/tcpip/link/fdbased/BUILD | 4 +- pkg/tcpip/link/loopback/BUILD | 4 +- pkg/tcpip/link/rawfile/BUILD | 4 +- pkg/tcpip/link/sharedmem/BUILD | 4 +- pkg/tcpip/link/sharedmem/pipe/BUILD | 4 +- pkg/tcpip/link/sharedmem/queue/BUILD | 4 +- pkg/tcpip/link/sniffer/BUILD | 4 +- pkg/tcpip/link/tun/BUILD | 4 +- pkg/tcpip/link/waitable/BUILD | 4 +- pkg/tcpip/network/BUILD | 4 +- pkg/tcpip/network/arp/BUILD | 4 +- pkg/tcpip/network/hash/BUILD | 4 +- pkg/tcpip/network/ipv4/BUILD | 4 +- pkg/tcpip/network/ipv6/BUILD | 4 +- pkg/tcpip/ports/BUILD | 4 +- pkg/tcpip/sample/tun_tcp_connect/BUILD | 4 +- pkg/tcpip/sample/tun_tcp_echo/BUILD | 4 +- pkg/tcpip/transport/tcp/testing/context/BUILD | 4 +- pkg/tcpip/transport/tcpconntrack/BUILD | 4 +- 
pkg/tmutex/BUILD | 4 +- pkg/unet/BUILD | 4 +- pkg/urpc/BUILD | 4 +- pkg/waiter/fdnotifier/BUILD | 4 +- runsc/boot/BUILD | 4 +- runsc/boot/filter/BUILD | 4 +- runsc/cgroup/BUILD | 4 +- runsc/cmd/BUILD | 4 +- runsc/console/BUILD | 4 +- runsc/container/BUILD | 4 +- runsc/fsgofer/BUILD | 4 +- runsc/fsgofer/filter/BUILD | 4 +- runsc/fsgofer/fsgofer.go | 98 +- runsc/fsgofer/fsgofer_test.go | 78 +- runsc/sandbox/BUILD | 4 +- runsc/specutils/BUILD | 4 +- runsc/test/image/BUILD | 4 +- runsc/test/integration/BUILD | 4 +- runsc/test/root/BUILD | 4 +- runsc/test/testutil/BUILD | 4 +- runsc/tools/dockercfg/BUILD | 4 +- tools/go_generics/BUILD | 4 +- tools/go_generics/globals/BUILD | 4 +- tools/go_generics/rules_tests/BUILD | 3 +- tools/go_stateify/BUILD | 4 +- 128 files changed, 3834 insertions(+), 2147 deletions(-) create mode 100644 pkg/p9/buffer_test.go delete mode 100644 pkg/p9/p9test/mocks.go create mode 100644 pkg/p9/p9test/p9test.go create mode 100644 pkg/p9/path_tree.go (limited to 'runsc') diff --git a/WORKSPACE b/WORKSPACE index 48e0d3436..841a23e06 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -15,7 +15,7 @@ go_register_toolchains(go_version="1.11.1") load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository") gazelle_dependencies() -# Add dependencies on external repositories. +# External repositories, in sorted order. go_repository( name = "com_github_cenkalti_backoff", importpath = "github.com/cenkalti/backoff", @@ -28,6 +28,12 @@ go_repository( commit = "886344bea0798d02ff3fae16a922be5f6b26cee0" ) +go_repository( + name = "com_github_golang_mock", + importpath = "github.com/golang/mock", + commit = "600781dde9cca80734169b9e969d9054ccc57937", +) + go_repository( name = "com_github_google_go-cmp", importpath = "github.com/google/go-cmp", @@ -58,6 +64,12 @@ go_repository( commit = "b2d941ef6a780da2d9982c1fb28d77ad97f54fc7", ) +go_repository( + name = "com_github_syndtr_gocapability", + importpath = "github.com/syndtr/gocapability", + commit = "d98352740cb2c55f81556b63d4a1ec64c5a319c2", +) + go_repository( name = "com_github_vishvananda_netlink", importpath = "github.com/vishvananda/netlink", @@ -81,9 +93,3 @@ go_repository( importpath = "golang.org/x/sys", commit = "0dd5e194bbf5eb84a39666eb4c98a4d007e4203a", ) - -go_repository( - name = "com_github_syndtr_gocapability", - importpath = "github.com/syndtr/gocapability", - commit = "d98352740cb2c55f81556b63d4a1ec64c5a319c2", -) diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 84e6b79a5..815ee3a69 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "amutex", srcs = ["amutex.go"], diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index a8dd17825..235188531 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "atomicbitops", srcs = [ diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD index 586d05634..571151f72 100644 --- a/pkg/binary/BUILD +++ b/pkg/binary/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "binary", srcs = ["binary.go"], diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD index 
8c943b615..46794bdb8 100644 --- a/pkg/bits/BUILD +++ b/pkg/bits/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_library( diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index d70f982c1..72952d735 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "compressio", srcs = ["compressio.go"], diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD index d58cd1b71..32853875d 100644 --- a/pkg/control/client/BUILD +++ b/pkg/control/client/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "client", srcs = [ diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index c3f74a532..ba2b1be9f 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "server", srcs = ["server.go"], diff --git a/pkg/dhcp/BUILD b/pkg/dhcp/BUILD index 711a72c99..c97dfc14b 100644 --- a/pkg/dhcp/BUILD +++ b/pkg/dhcp/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "dhcp", srcs = [ diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 9d531ce12..18348ef54 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "eventchannel", srcs = [ diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD index 435b6fa34..06cfd445e 100644 --- a/pkg/fd/BUILD +++ b/pkg/fd/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fd", srcs = ["fd.go"], diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index 872eff531..9a87a3a31 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "gate", srcs = [ diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index 1bd71b800..a67aa2cff 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ilist", srcs = [ diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index 6c3795432..3f28ba867 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") 
+package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "linewriter", srcs = ["linewriter.go"], diff --git a/pkg/log/BUILD b/pkg/log/BUILD index fc9281079..bf85b4494 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "log", srcs = [ diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index c0cd40c7b..d96e5563b 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "metric", srcs = ["metric.go"], diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 1cf5c6458..2c224e65b 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -15,6 +15,7 @@ go_library( "handlers.go", "messages.go", "p9.go", + "path_tree.go", "pool.go", "server.go", "transport.go", @@ -32,6 +33,7 @@ go_test( name = "p9_test", size = "small", srcs = [ + "buffer_test.go", "client_test.go", "messages_test.go", "p9_test.go", diff --git a/pkg/p9/buffer_test.go b/pkg/p9/buffer_test.go new file mode 100644 index 000000000..97eceefa7 --- /dev/null +++ b/pkg/p9/buffer_test.go @@ -0,0 +1,31 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package p9 + +import ( + "testing" +) + +func TestBufferOverrun(t *testing.T) { + buf := &buffer{ + // This header indicates that a large string should follow, but + // it is only two bytes. Reading a string should cause an + // overrun. + data: []byte{0x0, 0x16}, + } + if s := buf.ReadString(); s != "" { + t.Errorf("overrun read got %s, want empty", s) + } +} diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 3ebfab82a..67887874a 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -116,6 +116,7 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client msize: largestFixedSize, } } + // Compute a payload size and round to 512 (normal block size) // if it's larger than a single block. payloadSize := messageSize - largestFixedSize @@ -299,3 +300,8 @@ func (c *Client) sendRecv(t message, r message) error { func (c *Client) Version() uint32 { return c.version } + +// Close closes the underlying socket. +func (c *Client) Close() error { + return c.socket.Close() +} diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 066639fda..992d1daf7 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -172,6 +172,9 @@ func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error { } // Remove implements File.Remove. +// +// N.B. This method is no longer part of the file interface and should be +// considered deprecated. func (c *clientFile) Remove() error { // Avoid double close. 
if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) { @@ -181,7 +184,6 @@ func (c *clientFile) Remove() error { // Send the remove message. if err := c.client.sendRecv(&Tremove{FID: c.fid}, &Rremove{}); err != nil { - log.Warningf("Tremove failed, losing FID %v: %v", c.fid, err) return err } diff --git a/pkg/p9/file.go b/pkg/p9/file.go index d2e89e373..55ceb52e1 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -31,35 +31,63 @@ type Attacher interface { // File is a set of operations corresponding to a single node. // -// Functions below MUST return syscall.Errno values. -// TODO: Enforce that with the type. +// Note that on the server side, the server logic places constraints on +// concurrent operations to make things easier. This may reduce the need for +// complex, error-prone locking and logic in the backend. These are documented +// for each method. // -// These must be implemented in all circumstances. +// There are three different types of guarantees provided: +// +// none: There is no concurrency guarantee. The method may be invoked +// concurrently with any other method on any other file. +// +// read: The method is guaranteed to be exclusive of any write or global +// operation that is mutating the state of the directory tree starting at this +// node. For example, this means creating new files, symlinks, directories or +// renaming a directory entry (or renaming in to this target), but the method +// may be called concurrently with other read methods. +// +// write: The method is guaranteed to be exclusive of any read, write or global +// operation that is mutating the state of the directory tree starting at this +// node, as described in read above. There may however, be other write +// operations executing concurrently on other components in the directory tree. +// +// global: The method is guaranteed to be exclusive of any read, write or +// global operation. type File interface { // Walk walks to the path components given in names. // // Walk returns QIDs in the same order that the names were passed in. // // An empty list of arguments should return a copy of the current file. + // + // On the server, Walk has a read concurrency guarantee. Walk(names []string) ([]QID, File, error) + // WalkGetAttr walks to the next file and returns its maximal set of + // attributes. + // + // Server-side p9.Files may return syscall.ENOSYS to indicate that Walk + // and GetAttr should be used separately to satisfy this request. + // + // On the server, WalkGetAttr has a read concurrency guarantee. + WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) + // StatFS returns information about the file system associated with // this file. + // + // On the server, StatFS has no concurrency guarantee. StatFS() (FSStat, error) // GetAttr returns attributes of this node. + // + // On the server, GetAttr has a read concurrency guarantee. GetAttr(req AttrMask) (QID, AttrMask, Attr, error) // SetAttr sets attributes on this node. - SetAttr(valid SetAttrMask, attr SetAttr) error - - // Remove removes the file. // - // This is deprecated in favor of UnlinkAt below. - Remove() error - - // Rename renames the file. - Rename(directory File, name string) error + // On the server, SetAttr has a write concurrency guarantee. + SetAttr(valid SetAttrMask, attr SetAttr) error // Close is called when all references are dropped on the server side, // and Close should be called by the client to drop all references. 
@@ -67,65 +95,93 @@ type File interface { // For server-side implementations of Close, the error is ignored. // // Close must be called even when Open has not been called. + // + // On the server, Close has no concurrency guarantee. Close() error - // Open is called prior to using read/write. + // Open must be called prior to using Read, Write or Readdir. Once Open + // is called, some operations, such as Walk, will no longer work. // - // The *fd.FD may be nil. If an *fd.FD is provided, ownership now - // belongs to the caller and the FD must be non-blocking. + // On the client, Open should be called only once. The fd return is + // optional, and may be nil. // - // If Open returns a non-nil *fd.FD, it should do so for all possible - // OpenFlags. If Open returns a nil *fd.FD, it should similarly return - // a nil *fd.FD for all possible OpenFlags. + // On the server, Open has a read concurrency guarantee. If an *fd.FD + // is provided, ownership now belongs to the caller. Open is guaranteed + // to be called only once. // - // This can be assumed to be one-shot only. + // N.B. The server must resolve any lazy paths when open is called. + // After this point, read and write may be called on files with no + // deletion check, so resolving in the data path is not viable. Open(mode OpenFlags) (*fd.FD, QID, uint32, error) - // Read reads from this file. + // Read reads from this file. Open must be called first. // // This may return io.EOF in addition to syscall.Errno values. // - // Preconditions: Open has been called and returned success. + // On the server, ReadAt has a read concurrency guarantee. See Open for + // additional requirements regarding lazy path resolution. ReadAt(p []byte, offset uint64) (int, error) - // Write writes to this file. + // Write writes to this file. Open must be called first. // // This may return io.EOF in addition to syscall.Errno values. // - // Preconditions: Open has been called and returned success. + // On the server, WriteAt has a read concurrency guarantee. See Open + // for additional requirements regarding lazy path resolution. WriteAt(p []byte, offset uint64) (int, error) - // FSync syncs this node. + // FSync syncs this node. Open must be called first. // - // Preconditions: Open has been called and returned success. + // On the server, FSync has a read concurrency guarantee. FSync() error // Create creates a new regular file and opens it according to the - // flags given. + // flags given. This file is already Open. + // + // N.B. On the client, the returned file is a reference to the current + // file, which now represents the created file. This is not the case on + // the server. These semantics are very subtle and can easily lead to + // bugs, but are a consequence of the 9P create operation. // // See p9.File.Open for a description of *fd.FD. + // + // On the server, Create has a write concurrency guarantee. Create(name string, flags OpenFlags, permissions FileMode, uid UID, gid GID) (*fd.FD, File, QID, uint32, error) // Mkdir creates a subdirectory. + // + // On the server, Mkdir has a write concurrency guarantee. Mkdir(name string, permissions FileMode, uid UID, gid GID) (QID, error) // Symlink makes a new symbolic link. - Symlink(oldname string, newname string, uid UID, gid GID) (QID, error) + // + // On the server, Symlink has a write concurrency guarantee. + Symlink(oldName string, newName string, uid UID, gid GID) (QID, error) // Link makes a new hard link. 
- Link(target File, newname string) error + // + // On the server, Link has a write concurrency guarantee. + Link(target File, newName string) error // Mknod makes a new device node. + // + // On the server, Mknod has a write concurrency guarantee. Mknod(name string, permissions FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) + // Rename renames the file. + // + // Rename will never be called on the server, and RenameAt will always + // be used instead. + Rename(newDir File, newName string) error + // RenameAt renames a given file to a new name in a potentially new // directory. // - // oldname must be a name relative to this file, which must be a - // directory. newname is a name relative to newdir. + // oldName must be a name relative to this file, which must be a + // directory. newName is a name relative to newDir. // - // This is deprecated in favor of Rename. - RenameAt(oldname string, newdir File, newname string) error + // On the server, RenameAt has a global concurrency guarantee. + RenameAt(oldName string, newDir File, newName string) error // UnlinkAt the given named file. // @@ -133,16 +189,20 @@ type File interface { // // Flags are implementation-specific (e.g. O_DIRECTORY), but are // generally Linux unlinkat(2) flags. + // + // On the server, UnlinkAt has a write concurrency guarantee. UnlinkAt(name string, flags uint32) error // Readdir reads directory entries. // // This may return io.EOF in addition to syscall.Errno values. // - // Preconditions: Open has been called and returned success. + // On the server, Readdir has a read concurrency guarantee. Readdir(offset uint64, count uint32) ([]Dirent, error) // Readlink reads the link target. + // + // On the server, Readlink has a read concurrency guarantee. Readlink() (string, error) // Flush is called prior to Close. @@ -150,16 +210,11 @@ type File interface { // Whereas Close drops all references to the file, Flush cleans up the // file state. Behavior is implementation-specific. // - // Flush is not related to flush(9p). Flush is an extension to 9P2000.L, + // Flush is not related to flush(9p). Flush is an extension to 9P2000.L, // see version.go. - Flush() error - - // WalkGetAttr walks to the next file and returns its maximal set of - // attributes. // - // Server-side p9.Files may return syscall.ENOSYS to indicate that Walk - // and GetAttr should be used separately to satisfy this request. - WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) + // On the server, Flush has a read concurrency guarantee. + Flush() error // Connect establishes a new host-socket backed connection with a // socket. A File does not need to be opened before it can be connected @@ -170,8 +225,22 @@ type File interface { // // The returned FD must be non-blocking. // - // flags indicates the requested type of socket. + // Flags indicates the requested type of socket. + // + // On the server, Connect has a read concurrency guarantee. Connect(flags ConnectFlags) (*fd.FD, error) + + // Renamed is called when this node is renamed. + // + // This may not fail. The file will hold a reference to its parent + // within the p9 package, and is therefore safe to use for the lifetime + // of this File (until Close is called). + // + // This method should not be called by clients, who should use the + // relevant Rename methods. (Although the method will be a no-op.) + // + // On the server, Renamed has a global concurrency guarantee. 
+ Renamed(newDir File, newName string) } // DefaultWalkGetAttr implements File.WalkGetAttr to return ENOSYS for server-side Files. diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 959dff31d..0d7a6138f 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -15,6 +15,7 @@ package p9 import ( + "fmt" "io" "os" "path" @@ -22,22 +23,43 @@ import ( "sync/atomic" "syscall" + "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/log" ) -// newErr returns a new error message from an error. -func newErr(err error) *Rlerror { +const maximumNameLength = 255 + +// ExtractErrno extracts a syscall.Errno from a error, best effort. +func ExtractErrno(err error) syscall.Errno { + switch err { + case os.ErrNotExist: + return syscall.ENOENT + case os.ErrExist: + return syscall.EEXIST + case os.ErrPermission: + return syscall.EACCES + case os.ErrInvalid: + return syscall.EINVAL + } + + // Attempt to unwrap. switch e := err.(type) { case syscall.Errno: - return &Rlerror{Error: uint32(e)} + return e case *os.PathError: - return newErr(e.Err) + return ExtractErrno(e.Err) case *os.SyscallError: - return newErr(e.Err) - default: - log.Warningf("unknown error: %v", err) - return &Rlerror{Error: uint32(syscall.EIO)} + return ExtractErrno(e.Err) } + + // Default case. + log.Warningf("unknown error: %v", err) + return syscall.EIO +} + +// newErr returns a new error message from an error. +func newErr(err error) *Rlerror { + return &Rlerror{Error: uint32(ExtractErrno(err))} } // handler is implemented for server-handled messages. @@ -85,13 +107,15 @@ func (t *Tflush) handle(cs *connState) message { return &Rflush{} } -// isSafeName returns true iff the name does not contain directory characters. -// -// We permit walks only on safe names and store the sequence of paths used for -// any given walk in each FID. (This is immutable.) We use this to mark -// relevant FIDs as moved when a successful rename occurs. -func isSafeName(name string) bool { - return name != "" && !strings.Contains(name, "/") && name != "." && name != ".." +// checkSafeName validates the name and returns nil or returns an error. +func checkSafeName(name string) error { + if name == "" || strings.Contains(name, "/") || name == "." || name == ".." { + return syscall.EINVAL + } + if len(name) > maximumNameLength { + return syscall.ENAMETOOLONG + } + return nil } // handle implements handler.handle. @@ -110,22 +134,54 @@ func (t *Tremove) handle(cs *connState) message { } defer ref.DecRef() + // Frustratingly, because we can't be guaranteed that a rename is not + // occurring simultaneously with this removal, we need to acquire the + // global rename lock for this kind of remove operation to ensure that + // ref.parent does not change out from underneath us. + // + // This is why Tremove is a bad idea, and clients should generally use + // Tunlinkat. All p9 clients will use Tunlinkat. + err := ref.safelyGlobal(func() error { + // Is this a root? Can't remove that. + if ref.isRoot() { + return syscall.EINVAL + } + + // N.B. this remove operation is permitted, even if the file is open. + // See also rename below for reasoning. + + // Is this file already deleted? + if ref.isDeleted() { + return syscall.EINVAL + } + + // Retrieve the file's proper name. + name := ref.parent.pathNode.nameFor(ref) + + // Attempt the removal. + if err := ref.parent.file.UnlinkAt(name, 0); err != nil { + return err + } + + // Mark all relevant fids as deleted. 
We don't need to lock any + // individual nodes because we already hold the global lock. + ref.parent.markChildDeleted(name) + return nil + }) + // "The remove request asks the file server both to remove the file // represented by fid and to clunk the fid, even if the remove fails." // // "It is correct to consider remove to be a clunk with the side effect // of removing the file if permissions allow." // https://swtch.com/plan9port/man/man9/remove.html - err := ref.file.Remove() - - // Clunk the FID regardless of Remove error. if !cs.DeleteFID(t.FID) { return newErr(syscall.EBADF) } - if err != nil { return newErr(err) } + return &Rremove{} } @@ -168,9 +224,12 @@ func (t *Tattach) handle(cs *connState) message { // Build a transient reference. root := &fidRef{ + server: cs.server, + parent: nil, file: sf, refs: 1, - walkable: attr.Mode.IsDir(), + mode: attr.Mode.FileType(), + pathNode: &cs.server.pathTree, } defer root.DecRef() @@ -183,20 +242,24 @@ func (t *Tattach) handle(cs *connState) message { // We want the same traversal checks to apply on attach, so always // attach at the root and use the regular walk paths. names := strings.Split(t.Auth.AttachName, "/") - _, target, _, attr, err := doWalk(cs, root, names) + _, newRef, _, attr, err := doWalk(cs, root, names) if err != nil { return newErr(err) } + defer newRef.DecRef() // Insert the FID. - cs.InsertFID(t.FID, &fidRef{ - file: target, - walkable: attr.Mode.IsDir(), - }) - + cs.InsertFID(t.FID, newRef) return &Rattach{} } +// CanOpen returns whether this file open can be opened, read and written to. +// +// This includes everything except symlinks and sockets. +func CanOpen(mode FileMode) bool { + return mode.IsRegular() || mode.IsDir() || mode.IsNamedPipe() || mode.IsBlockDevice() || mode.IsCharacterDevice() +} + // handle implements handler.handle. func (t *Tlopen) handle(cs *connState) message { // Lookup the FID. @@ -210,13 +273,35 @@ func (t *Tlopen) handle(cs *connState) message { defer ref.openedMu.Unlock() // Has it been opened already? - if ref.opened { + if ref.opened || !CanOpen(ref.mode) { return newErr(syscall.EINVAL) } - // Do the open. - osFile, qid, ioUnit, err := ref.file.Open(t.Flags) - if err != nil { + // Are flags valid? + if t.Flags&^OpenFlagsModeMask != 0 { + return newErr(syscall.EINVAL) + } + + // Is this an attempt to open a directory as writable? Don't accept. + if ref.mode.IsDir() && t.Flags != ReadOnly { + return newErr(syscall.EINVAL) + } + + var ( + qid QID + ioUnit uint32 + osFile *fd.FD + ) + if err := ref.safelyRead(func() (err error) { + // Has it been deleted already? + if ref.isDeleted() { + return syscall.EINVAL + } + + // Do the open. + osFile, qid, ioUnit, err = ref.file.Open(t.Flags) + return err + }); err != nil { return newErr(err) } @@ -229,8 +314,8 @@ func (t *Tlopen) handle(cs *connState) message { func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -240,20 +325,48 @@ func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { } defer ref.DecRef() - // Do the create. - osFile, nsf, qid, ioUnit, err := ref.file.Create(t.Name, t.OpenFlags, t.Permissions, uid, t.GID) - if err != nil { + var ( + osFile *fd.FD + nsf File + qid QID + ioUnit uint32 + newRef *fidRef + ) + if err := ref.safelyWrite(func() (err error) { + // Don't allow creation from non-directories or deleted directories. 
+ if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the create. + osFile, nsf, qid, ioUnit, err = ref.file.Create(t.Name, t.OpenFlags, t.Permissions, uid, t.GID) + if err != nil { + return err + } + + newRef = &fidRef{ + server: cs.server, + parent: ref, + file: nsf, + opened: true, + openFlags: t.OpenFlags, + mode: ModeRegular, + pathNode: ref.pathNode.pathNodeFor(t.Name), + } + ref.pathNode.addChild(newRef, t.Name) + ref.IncRef() // Acquire parent reference. + return nil + }); err != nil { return nil, err } // Replace the FID reference. - // - // The new file will be opened already. - cs.InsertFID(t.FID, &fidRef{ - file: nsf, - opened: true, - openFlags: t.OpenFlags, - }) + cs.InsertFID(t.FID, newRef) return &Rlcreate{Rlopen: Rlopen{QID: qid, IoUnit: ioUnit, File: osFile}}, nil } @@ -278,8 +391,8 @@ func (t *Tsymlink) handle(cs *connState) message { func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -289,9 +402,22 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { } defer ref.DecRef() - // Do the symlink. - qid, err := ref.file.Symlink(t.Target, t.Name, uid, t.GID) - if err != nil { + var qid QID + if err := ref.safelyWrite(func() (err error) { + // Don't allow symlinks from non-directories or deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the symlink. + qid, err = ref.file.Symlink(t.Target, t.Name, uid, t.GID) + return err + }); err != nil { return nil, err } @@ -301,8 +427,8 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { // handle implements handler.handle. func (t *Tlink) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.Name) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.Name); err != nil { + return newErr(err) } // Lookup the FID. @@ -319,8 +445,20 @@ func (t *Tlink) handle(cs *connState) message { } defer refTarget.DecRef() - // Do the link. - if err := ref.file.Link(refTarget.file, t.Name); err != nil { + if err := ref.safelyWrite(func() (err error) { + // Don't allow create links from non-directories or deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the link. + return ref.file.Link(refTarget.file, t.Name) + }); err != nil { return newErr(err) } @@ -330,8 +468,11 @@ func (t *Tlink) handle(cs *connState) message { // handle implements handler.handle. func (t *Trenameat) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.OldName) || !isSafeName(t.NewName) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.OldName); err != nil { + return newErr(err) + } + if err := checkSafeName(t.NewName); err != nil { + return newErr(err) } // Lookup the FID. @@ -348,8 +489,32 @@ func (t *Trenameat) handle(cs *connState) message { } defer refTarget.DecRef() - // Do the rename. - if err := ref.file.RenameAt(t.OldName, refTarget.file, t.NewName); err != nil { + // Perform the rename holding the global lock. 
+ if err := ref.safelyGlobal(func() (err error) { + // Don't allow renaming across deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() || refTarget.isDeleted() || !refTarget.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Is this the same file? If yes, short-circuit and return success. + if ref.pathNode == refTarget.pathNode && t.OldName == t.NewName { + return nil + } + + // Attempt the actual rename. + if err := ref.file.RenameAt(t.OldName, refTarget.file, t.NewName); err != nil { + return err + } + + // Update the path tree. + ref.renameChildTo(t.OldName, refTarget, t.NewName) + return nil + }); err != nil { return newErr(err) } @@ -359,8 +524,8 @@ func (t *Trenameat) handle(cs *connState) message { // handle implements handler.handle. func (t *Tunlinkat) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.Name) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.Name); err != nil { + return newErr(err) } // Lookup the FID. @@ -370,8 +535,40 @@ func (t *Tunlinkat) handle(cs *connState) message { } defer ref.DecRef() - // Do the unlink. - if err := ref.file.UnlinkAt(t.Name, t.Flags); err != nil { + if err := ref.safelyWrite(func() (err error) { + // Don't allow deletion from non-directories or deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Before we do the unlink itself, we need to ensure that there + // are no operations in flight on associated path node. The + // child's path node lock must be held to ensure that the + // unlink at marking the child deleted below is atomic with + // respect to any other read or write operations. + // + // This is one case where we have a lock ordering issue, but + // since we always acquire deeper in the hierarchy, we know + // that we are free of lock cycles. + childPathNode := ref.pathNode.pathNodeFor(t.Name) + childPathNode.mu.Lock() + defer childPathNode.mu.Unlock() + + // Do the unlink. + err = ref.file.UnlinkAt(t.Name, t.Flags) + if err != nil { + return err + } + + // Mark the path as deleted. + ref.markChildDeleted(t.Name) + return nil + }); err != nil { return newErr(err) } @@ -381,8 +578,8 @@ func (t *Tunlinkat) handle(cs *connState) message { // handle implements handler.handle. func (t *Trename) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.Name) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.Name); err != nil { + return newErr(err) } // Lookup the FID. @@ -399,8 +596,43 @@ func (t *Trename) handle(cs *connState) message { } defer refTarget.DecRef() - // Call the rename method. - if err := ref.file.Rename(refTarget.file, t.Name); err != nil { + if err := ref.safelyGlobal(func() (err error) { + // Don't allow a root rename. + if ref.isRoot() { + return syscall.EINVAL + } + + // Don't allow renaming deleting entries, or target non-directories. + if ref.isDeleted() || refTarget.isDeleted() || !refTarget.mode.IsDir() { + return syscall.EINVAL + } + + // If the parent is deleted, but we not, something is seriously wrong. + // It's fail to die at this point with an assertion failure. + if ref.parent.isDeleted() { + panic(fmt.Sprintf("parent %+v deleted, child %+v is not", ref.parent, ref)) + } + + // N.B. The rename operation is allowed to proceed on open files. 
It + // does impact the state of its parent, but this is merely a sanity + // check in any case, and the operation is safe. There may be other + // files corresponding to the same path that are renamed anyways. + + // Check for the exact same file and short-circuit. + oldName := ref.parent.pathNode.nameFor(ref) + if ref.parent.pathNode == refTarget.pathNode && oldName == t.Name { + return nil + } + + // Call the rename method on the parent. + if err := ref.parent.file.RenameAt(oldName, refTarget.file, t.Name); err != nil { + return err + } + + // Update the path tree. + ref.parent.renameChildTo(oldName, refTarget, t.Name) + return nil + }); err != nil { return newErr(err) } @@ -416,9 +648,19 @@ func (t *Treadlink) handle(cs *connState) message { } defer ref.DecRef() - // Do the read. - target, err := ref.file.Readlink() - if err != nil { + var target string + if err := ref.safelyRead(func() (err error) { + // Don't allow readlink on deleted files. There is no need to + // check if this file is opened because symlinks cannot be + // opened. + if ref.isDeleted() || !ref.mode.IsSymlink() { + return syscall.EINVAL + } + + // Do the read. + target, err = ref.file.Readlink() + return err + }); err != nil { return newErr(err) } @@ -434,26 +676,30 @@ func (t *Tread) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - openFlags, opened := ref.OpenFlags() - if !opened { - return newErr(syscall.EINVAL) - } - - // Can it be read? Check permissions. - if openFlags&OpenFlagsModeMask == WriteOnly { - return newErr(syscall.EPERM) - } - // Constrain the size of the read buffer. if int(t.Count) > int(maximumLength) { return newErr(syscall.ENOBUFS) } - // Do the read. - data := make([]byte, t.Count) - n, err := ref.file.ReadAt(data, t.Offset) - if err != nil && err != io.EOF { + var ( + data = make([]byte, t.Count) + n int + ) + if err := ref.safelyRead(func() (err error) { + // Has it been opened already? + openFlags, opened := ref.OpenFlags() + if !opened { + return syscall.EINVAL + } + + // Can it be read? Check permissions. + if openFlags&OpenFlagsModeMask == WriteOnly { + return syscall.EPERM + } + + n, err = ref.file.ReadAt(data, t.Offset) + return err + }); err != nil && err != io.EOF { return newErr(err) } @@ -469,20 +715,22 @@ func (t *Twrite) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - openFlags, opened := ref.OpenFlags() - if !opened { - return newErr(syscall.EINVAL) - } + var n int + if err := ref.safelyRead(func() (err error) { + // Has it been opened already? + openFlags, opened := ref.OpenFlags() + if !opened { + return syscall.EINVAL + } - // Can it be write? Check permissions. - if openFlags&OpenFlagsModeMask == ReadOnly { - return newErr(syscall.EPERM) - } + // Can it be write? Check permissions. + if openFlags&OpenFlagsModeMask == ReadOnly { + return syscall.EPERM + } - // Do the write. - n, err := ref.file.WriteAt(t.Data, t.Offset) - if err != nil { + n, err = ref.file.WriteAt(t.Data, t.Offset) + return err + }); err != nil { return newErr(err) } @@ -500,8 +748,8 @@ func (t *Tmknod) handle(cs *connState) message { func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -511,9 +759,22 @@ func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { } defer ref.DecRef() - // Do the mknod. 
- qid, err := ref.file.Mknod(t.Name, t.Permissions, t.Major, t.Minor, uid, t.GID) - if err != nil { + var qid QID + if err := ref.safelyWrite(func() (err error) { + // Don't allow mknod on deleted files. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the mknod. + qid, err = ref.file.Mknod(t.Name, t.Permissions, t.Major, t.Minor, uid, t.GID) + return err + }); err != nil { return nil, err } @@ -531,8 +792,8 @@ func (t *Tmkdir) handle(cs *connState) message { func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -542,9 +803,22 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { } defer ref.DecRef() - // Do the mkdir. - qid, err := ref.file.Mkdir(t.Name, t.Permissions, uid, t.GID) - if err != nil { + var qid QID + if err := ref.safelyWrite(func() (err error) { + // Don't allow mkdir on deleted files. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the mkdir. + qid, err = ref.file.Mkdir(t.Name, t.Permissions, uid, t.GID) + return err + }); err != nil { return nil, err } @@ -560,9 +834,20 @@ func (t *Tgetattr) handle(cs *connState) message { } defer ref.DecRef() - // Get attributes. - qid, valid, attr, err := ref.file.GetAttr(t.AttrMask) - if err != nil { + // We allow getattr on deleted files. Depending on the backing + // implementation, it's possible that races exist that might allow + // fetching attributes of other files. But we need to generally allow + // refreshing attributes and this is a minor leak, if at all. + + var ( + qid QID + valid AttrMask + attr Attr + ) + if err := ref.safelyRead(func() (err error) { + qid, valid, attr, err = ref.file.GetAttr(t.AttrMask) + return err + }); err != nil { return newErr(err) } @@ -578,8 +863,18 @@ func (t *Tsetattr) handle(cs *connState) message { } defer ref.DecRef() - // Set attributes. - if err := ref.file.SetAttr(t.Valid, t.SetAttr); err != nil { + if err := ref.safelyWrite(func() error { + // We don't allow setattr on files that have been deleted. + // This might be technically incorrect, as it's possible that + // there were multiple links and you can still change the + // corresponding inode information. + if ref.isDeleted() { + return syscall.EINVAL + } + + // Set the attributes. + return ref.file.SetAttr(t.Valid, t.SetAttr) + }); err != nil { return newErr(err) } @@ -621,14 +916,25 @@ func (t *Treaddir) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - if _, opened := ref.OpenFlags(); !opened { - return newErr(syscall.EINVAL) - } + var entries []Dirent + if err := ref.safelyRead(func() (err error) { + // Don't allow reading deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Has it been opened already? + if _, opened := ref.OpenFlags(); !opened { + return syscall.EINVAL + } - // Read the entries. - entries, err := ref.file.Readdir(t.Offset, t.Count) - if err != nil && err != io.EOF { + // Read the entries. 
+ entries, err = ref.file.Readdir(t.Offset, t.Count) + if err != nil && err != io.EOF { + return err + } + return nil + }); err != nil { return newErr(err) } @@ -644,13 +950,15 @@ func (t *Tfsync) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - if _, opened := ref.OpenFlags(); !opened { - return newErr(syscall.EINVAL) - } + if err := ref.safelyRead(func() (err error) { + // Has it been opened already? + if _, opened := ref.OpenFlags(); !opened { + return syscall.EINVAL + } - err := ref.file.FSync() - if err != nil { + // Perform the sync. + return ref.file.FSync() + }); err != nil { return newErr(err) } @@ -671,6 +979,11 @@ func (t *Tstatfs) handle(cs *connState) message { return newErr(err) } + // Constrain the name length. + if st.NameLength > maximumNameLength { + st.NameLength = maximumNameLength + } + return &Rstatfs{st} } @@ -682,7 +995,7 @@ func (t *Tflushf) handle(cs *connState) message { } defer ref.DecRef() - if err := ref.file.Flush(); err != nil { + if err := ref.safelyRead(ref.file.Flush); err != nil { return newErr(err) } @@ -726,12 +1039,14 @@ func walkOne(qids []QID, from File, names []string) ([]QID, File, AttrMask, Attr // doWalk walks from a given fidRef. // -// This enforces that all intermediate nodes are walkable (directories). -func doWalk(cs *connState, ref *fidRef, names []string) (qids []QID, sf File, valid AttrMask, attr Attr, err error) { +// This enforces that all intermediate nodes are walkable (directories). The +// fidRef returned (newRef) has a reference associated with it that is now +// owned by the caller and must be handled appropriately. +func doWalk(cs *connState, ref *fidRef, names []string) (qids []QID, newRef *fidRef, valid AttrMask, attr Attr, err error) { // Check the names. for _, name := range names { - if !isSafeName(name) { - err = syscall.EINVAL + err = checkSafeName(name) + if err != nil { return } } @@ -745,44 +1060,88 @@ func doWalk(cs *connState, ref *fidRef, names []string) (qids []QID, sf File, va // Is this an empty list? Handle specially. We don't actually need to // validate anything since this is always permitted. if len(names) == 0 { - return walkOne(nil, ref.file, nil) - } - - // Is it walkable? - if !ref.walkable { - err = syscall.EINVAL - return + var sf File // Temporary. + if err := ref.maybeParent().safelyRead(func() (err error) { + // Clone the single element. + qids, sf, valid, attr, err = walkOne(nil, ref.file, nil) + if err != nil { + return err + } + + newRef = &fidRef{ + server: cs.server, + parent: ref.parent, + file: sf, + mode: ref.mode, + pathNode: ref.pathNode, + + // For the clone case, the cloned fid must + // preserve the deleted property of the + // original FID. + deleted: ref.deleted, + } + if !ref.isRoot() { + if !newRef.isDeleted() { + // Add only if a non-root node; the same node. + ref.parent.pathNode.addChild(newRef, ref.parent.pathNode.nameFor(ref)) + } + ref.parent.IncRef() // Acquire parent reference. + } + // doWalk returns a reference. + newRef.IncRef() + return nil + }); err != nil { + return nil, nil, AttrMask{}, Attr{}, err + } + return qids, newRef, valid, attr, nil } - from := ref.file // Start at the passed ref. - // Do the walk, one element at a time. + walkRef := ref + walkRef.IncRef() for i := 0; i < len(names); i++ { - qids, sf, valid, attr, err = walkOne(qids, from, names[i:i+1]) - - // Close the intermediate file. Note that we don't close the - // first file because in that case we are walking from the - // existing reference. 
- if i > 0 { - from.Close() - } - from = sf // Use the new file. - - // Was there an error walking? - if err != nil { - return nil, nil, AttrMask{}, Attr{}, err - } - // We won't allow beyond past symlinks; stop here if this isn't // a proper directory and we have additional paths to walk. - if !valid.Mode || (!attr.Mode.IsDir() && i < len(names)-1) { - from.Close() // Not using the file object. + if !walkRef.mode.IsDir() { + walkRef.DecRef() // Drop walk reference; no lock required. return nil, nil, AttrMask{}, Attr{}, syscall.EINVAL } + + var sf File // Temporary. + if err := walkRef.safelyRead(func() (err error) { + qids, sf, valid, attr, err = walkOne(qids, walkRef.file, names[i:i+1]) + if err != nil { + return err + } + + // Note that we don't need to acquire a lock on any of + // these individual instances. That's because they are + // not actually addressable via a FID. They are + // anonymous. They exist in the tree for tracking + // purposes. + newRef := &fidRef{ + server: cs.server, + parent: walkRef, + file: sf, + mode: attr.Mode.FileType(), + pathNode: walkRef.pathNode.pathNodeFor(names[i]), + } + walkRef.pathNode.addChild(newRef, names[i]) + // We allow our walk reference to become the new parent + // reference here and so we don't IncRef. Instead, just + // set walkRef to the newRef above and acquire a new + // walk reference. + walkRef = newRef + walkRef.IncRef() + return nil + }); err != nil { + walkRef.DecRef() // Drop the old walkRef. + return nil, nil, AttrMask{}, Attr{}, err + } } // Success. - return qids, sf, valid, attr, nil + return qids, walkRef, valid, attr, nil } // handle implements handler.handle. @@ -795,17 +1154,14 @@ func (t *Twalk) handle(cs *connState) message { defer ref.DecRef() // Do the walk. - qids, sf, _, attr, err := doWalk(cs, ref, t.Names) + qids, newRef, _, _, err := doWalk(cs, ref, t.Names) if err != nil { return newErr(err) } + defer newRef.DecRef() // Install the new FID. - cs.InsertFID(t.NewFID, &fidRef{ - file: sf, - walkable: attr.Mode.IsDir(), - }) - + cs.InsertFID(t.NewFID, newRef) return &Rwalk{QIDs: qids} } @@ -819,17 +1175,14 @@ func (t *Twalkgetattr) handle(cs *connState) message { defer ref.DecRef() // Do the walk. - qids, sf, valid, attr, err := doWalk(cs, ref, t.Names) + qids, newRef, valid, attr, err := doWalk(cs, ref, t.Names) if err != nil { return newErr(err) } + defer newRef.DecRef() // Install the new FID. - cs.InsertFID(t.NewFID, &fidRef{ - file: sf, - walkable: attr.Mode.IsDir(), - }) - + cs.InsertFID(t.NewFID, newRef) return &Rwalkgetattr{QIDs: qids, Valid: valid, Attr: attr} } @@ -878,9 +1231,17 @@ func (t *Tlconnect) handle(cs *connState) message { } defer ref.DecRef() - // Do the connect. - osFile, err := ref.file.Connect(t.Flags) - if err != nil { + var osFile *fd.FD + if err := ref.safelyRead(func() (err error) { + // Don't allow connecting to deleted files. + if ref.isDeleted() || !ref.mode.IsSocket() { + return syscall.EINVAL + } + + // Do the connect. 
+ osFile, err = ref.file.Connect(t.Flags) + return err + }); err != nil { return newErr(err) } diff --git a/pkg/p9/local_server/BUILD b/pkg/p9/local_server/BUILD index 8229e6308..b17ebb79d 100644 --- a/pkg/p9/local_server/BUILD +++ b/pkg/p9/local_server/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "local_server", srcs = ["local_server.go"], diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index 1e6aaa762..69b90c6cd 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -318,6 +318,11 @@ func (l *local) Connect(p9.ConnectFlags) (*fd.FD, error) { return nil, syscall.ECONNREFUSED } +// Renamed implements p9.File.Renamed. +func (l *local) Renamed(parent p9.File, newName string) { + l.path = path.Join(parent.(*local).path, newName) +} + func main() { log.SetLevel(log.Debug) diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index dfb41bb76..c0d65d82c 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -15,6 +15,7 @@ package p9 import ( + "fmt" "reflect" "testing" ) @@ -186,6 +187,13 @@ func TestEncodeDecode(t *testing.T) { &Rxattrwalk{ Size: 1, }, + &Txattrcreate{ + FID: 1, + Name: "a", + AttrSize: 2, + Flags: 3, + }, + &Rxattrcreate{}, &Treaddir{ Directory: 1, Offset: 2, @@ -389,3 +397,32 @@ func TestEncodeDecode(t *testing.T) { } } } + +func TestMessageStrings(t *testing.T) { + for typ, fn := range messageRegistry { + name := fmt.Sprintf("%+v", typ) + t.Run(name, func(t *testing.T) { + defer func() { // Ensure no panic. + if r := recover(); r != nil { + t.Errorf("printing %s failed: %v", name, r) + } + }() + m := fn() + _ = fmt.Sprintf("%v", m) + err := ErrInvalidMsgType{typ} + _ = err.Error() + }) + } +} + +func TestRegisterDuplicate(t *testing.T) { + defer func() { + if r := recover(); r == nil { + // We expect a panic. + t.FailNow() + } + }() + + // Register a duplicate. + register(&Rlerror{}) +} diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 3b0993ecd..be644e7bf 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -984,6 +984,30 @@ func (s *SetAttr) Encode(b *buffer) { b.Write64(s.MTimeNanoSeconds) } +// Apply applies this to the given Attr. +func (a *Attr) Apply(mask SetAttrMask, attr SetAttr) { + if mask.Permissions { + a.Mode = a.Mode&^PermissionsMask | (attr.Permissions & PermissionsMask) + } + if mask.UID { + a.UID = attr.UID + } + if mask.GID { + a.GID = attr.GID + } + if mask.Size { + a.Size = attr.Size + } + if mask.ATime { + a.ATimeSeconds = attr.ATimeSeconds + a.ATimeNanoSeconds = attr.ATimeNanoSeconds + } + if mask.MTime { + a.MTimeSeconds = attr.MTimeSeconds + a.MTimeNanoSeconds = attr.MTimeNanoSeconds + } +} + // Dirent is used for readdir. type Dirent struct { // QID is the entry QID. 
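The Attr.Apply helper added in the hunk above merges a SetAttr update into an existing Attr: only the fields selected by the SetAttrMask are copied over, and everything else is preserved. As a minimal sketch only (not part of this change), a p9.File backend that caches its attributes could call it from its SetAttr handler; the cachedFile type and its attr field here are hypothetical:

// SetAttr applies the masked fields of the request to the cached attributes.
// cachedFile is a hypothetical p9.File implementation that keeps a p9.Attr.
func (f *cachedFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
	// Apply copies only the fields named in valid (permissions, UID, GID,
	// size, atime, mtime); unmasked fields keep their cached values.
	f.attr.Apply(valid, attr)
	return nil
}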
diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index d6f428e11..7c4b875ce 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -1,16 +1,60 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +alias( + name = "mockgen", + actual = "@com_github_golang_mock//mockgen:mockgen", +) -go_test( - name = "p9test_test", - size = "small", - srcs = ["client_test.go"], - embed = [":p9test"], +MOCK_SRC_PACKAGE = "gvisor.googlesource.com/gvisor/pkg/p9" + +# mockgen_reflect is a source file that contains mock generation code that +# imports the p9 package and generates a specification via reflection. The +# usual generation path must be split into two distinct parts because the full +# source tree is not available to all build targets. Only declared depencies +# are available (and even then, not the Go source files). +genrule( + name = "mockgen_reflect", + testonly = 1, + outs = ["mockgen_reflect.go"], + cmd = ( + "$(location :mockgen) " + + "-package p9test " + + "-prog_only " + MOCK_SRC_PACKAGE + " " + + "Attacher,File > $@" + ), + tools = [":mockgen"], +) + +# mockgen_exec is the binary that includes the above reflection generator. +# Running this binary will emit an encoded version of the p9 Attacher and File +# structures. This is consumed by the mocks genrule, below. +go_binary( + name = "mockgen_exec", + testonly = 1, + srcs = ["mockgen_reflect.go"], deps = [ - "//pkg/fd", "//pkg/p9", - "//pkg/unet", + "@com_github_golang_mock//mockgen/model:go_default_library", + ], +) + +# mocks consumes the encoded output above, and generates the full source for a +# set of mocks. These are included directly in the p9test library. +genrule( + name = "mocks", + testonly = 1, + outs = ["mocks.go"], + cmd = ( + "$(location :mockgen) " + + "-package p9test " + + "-exec_only $(location :mockgen_exec) " + MOCK_SRC_PACKAGE + " File > $@" + ), + tools = [ + ":mockgen", + ":mockgen_exec", ], ) @@ -18,11 +62,27 @@ go_library( name = "p9test", srcs = [ "mocks.go", + "p9test.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/p9/p9test", visibility = ["//:sandbox"], + deps = [ + "//pkg/fd", + "//pkg/log", + "//pkg/p9", + "//pkg/unet", + "@com_github_golang_mock//gomock:go_default_library", + ], +) + +go_test( + name = "client_test", + size = "small", + srcs = ["client_test.go"], + embed = [":p9test"], deps = [ "//pkg/fd", "//pkg/p9", + "@com_github_golang_mock//gomock:go_default_library", ], ) diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index db562b9ba..242d81b95 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -15,360 +15,2059 @@ package p9test import ( - "io/ioutil" + "bytes" + "fmt" + "io" + "math/rand" "os" "reflect" + "strings" + "sync" "syscall" "testing" + "time" + "github.com/golang/mock/gomock" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" ) -func TestDonateFD(t *testing.T) { - // Temporary file. - osFile, err := ioutil.TempFile("", "p9") +func TestPanic(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + // Create a new root. + d := h.NewDirectory(nil)(nil) + defer d.Close() // Needed manually. + h.Attacher.EXPECT().Attach().Return(d, nil).Do(func() { + // Panic here, and ensure that we get back EFAULT. + panic("handler") + }) + + // Attach to the client. 
+	if _, err := c.Attach("/"); err != syscall.EFAULT {
+		t.Fatalf("got attach err %v, want EFAULT", err)
+	}
+}
+
+func TestAttachNoLeak(t *testing.T) {
+	h, c := NewHarness(t)
+	defer h.Finish()
+
+	// Create a new root.
+	d := h.NewDirectory(nil)(nil)
+	h.Attacher.EXPECT().Attach().Return(d, nil).Times(1)
+
+	// Attach to the client.
+	f, err := c.Attach("/")
+	if err != nil {
+		t.Fatalf("got attach err %v, want nil", err)
+	}
+
+	// Don't close the file. This should be closed automatically when the
+	// client disconnects. The mock asserts that everything is closed
+	// exactly once. This statement just removes the unused variable error.
+	_ = f
+}
+
+func TestBadAttach(t *testing.T) {
+	h, c := NewHarness(t)
+	defer h.Finish()
+
+	// Return an error on attach.
+	h.Attacher.EXPECT().Attach().Return(nil, syscall.EINVAL).Times(1)
+
+	// Attach to the client.
+	if _, err := c.Attach("/"); err != syscall.EINVAL {
+		t.Fatalf("got attach err %v, want syscall.EINVAL", err)
+	}
+}
+
+func TestWalkAttach(t *testing.T) {
+	h, c := NewHarness(t)
+	defer h.Finish()
+
+	// Create a new root.
+	d := h.NewDirectory(map[string]Generator{
+		"a": h.NewDirectory(map[string]Generator{
+			"b": h.NewFile(),
+		}),
+	})(nil)
+	h.Attacher.EXPECT().Attach().Return(d, nil).Times(1)
+
+	// Attach to the client as a non-root, and ensure that the walk above
+	// occurs as expected. We should get back b, and all references should
+	// be dropped when the file is closed.
+	f, err := c.Attach("/a/b")
+	if err != nil {
+		t.Fatalf("got attach err %v, want nil", err)
+	}
+	defer f.Close()
+
+	// Check that it's a regular file.
+	if _, _, attr, err := f.GetAttr(p9.AttrMaskAll()); err != nil {
+		t.Errorf("got err %v, want nil", err)
+	} else if !attr.Mode.IsRegular() {
+		t.Errorf("got mode %v, want regular file", attr.Mode)
+	}
+}
+
+// newTypeMap returns a new type map dictionary.
+func newTypeMap(h *Harness) map[string]Generator {
+	return map[string]Generator{
+		"directory":        h.NewDirectory(map[string]Generator{}),
+		"file":             h.NewFile(),
+		"symlink":          h.NewSymlink(),
+		"block-device":     h.NewBlockDevice(),
+		"character-device": h.NewCharacterDevice(),
+		"named-pipe":       h.NewNamedPipe(),
+		"socket":           h.NewSocket(),
+	}
+}
+
+// newRoot returns a new root filesystem.
+//
+// This is set up in a deterministic way for testing most operations.
+//
+// The represented file system looks like:
+// - file
+// - symlink
+// - directory
+// ...
+// + one
+//   - file
+//   - symlink
+//   - directory
+//   ...
+//   + two
+//     - file
+//     - symlink
+//     - directory
+//     ...
+// + three
+//   - file
+//   - symlink
+//   - directory
+//   ...
+func newRoot(h *Harness, c *p9.Client) (*Mock, p9.File) {
+	root := newTypeMap(h)
+	one := newTypeMap(h)
+	two := newTypeMap(h)
+	three := newTypeMap(h)
+	one["two"] = h.NewDirectory(two)      // Will be nested in one.
+	root["one"] = h.NewDirectory(one)     // Top level.
+	root["three"] = h.NewDirectory(three) // Alternate top-level.
+
+	// Create a new root.
+	rootBackend := h.NewDirectory(root)(nil)
+	h.Attacher.EXPECT().Attach().Return(rootBackend, nil)
+
+	// Attach to the client.
+ r, err := c.Attach("/") + if err != nil { + h.t.Fatalf("got attach err %v, want nil", err) + } + + return rootBackend, r +} + +func allInvalidNames(from string) []string { + return []string{ + from + "/other", + from + "/..", + from + "/.", + from + "/", + "other/" + from, + "/" + from, + "./" + from, + "../" + from, + ".", + "..", + "/", + "", + } +} + +func TestWalkInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Run relevant tests. + for name := range newTypeMap(h) { + // These are all the various ways that one might attempt to + // construct compound paths. They should all be rejected, as + // any compound that contains a / is not allowed, as well as + // the singular paths of '.' and '..'. + if _, _, err := root.Walk([]string{".", name}); err != syscall.EINVAL { + t.Errorf("Walk through . %s wanted EINVAL, got %v", name, err) + } + if _, _, err := root.Walk([]string{"..", name}); err != syscall.EINVAL { + t.Errorf("Walk through . %s wanted EINVAL, got %v", name, err) + } + if _, _, err := root.Walk([]string{name, "."}); err != syscall.EINVAL { + t.Errorf("Walk through %s . wanted EINVAL, got %v", name, err) + } + if _, _, err := root.Walk([]string{name, ".."}); err != syscall.EINVAL { + t.Errorf("Walk through %s .. wanted EINVAL, got %v", name, err) + } + for _, invalidName := range allInvalidNames(name) { + if _, _, err := root.Walk([]string{invalidName}); err != syscall.EINVAL { + t.Errorf("Walk through %s wanted EINVAL, got %v", invalidName, err) + } + } + wantErr := syscall.EINVAL + if name == "directory" { + // We can attempt a walk through a directory. However, + // we should never see a file named "other", so we + // expect this to return ENOENT. + wantErr = syscall.ENOENT + } + if _, _, err := root.Walk([]string{name, "other"}); err != wantErr { + t.Errorf("Walk through %s/other wanted %v, got %v", name, wantErr, err) + } + + // Do a successful walk. + _, f, err := root.Walk([]string{name}) + if err != nil { + t.Errorf("Walk to %s wanted nil, got %v", name, err) + } + defer f.Close() + local := h.Pop(f) + + // Check that the file matches. + _, localMask, localAttr, localErr := local.GetAttr(p9.AttrMaskAll()) + if _, mask, attr, err := f.GetAttr(p9.AttrMaskAll()); mask != localMask || attr != localAttr || err != localErr { + t.Errorf("GetAttr got (%v, %v, %v), wanted (%v, %v, %v)", + mask, attr, err, localMask, localAttr, localErr) + } + + // Ensure we can't walk backwards. + if _, _, err := f.Walk([]string{"."}); err != syscall.EINVAL { + t.Errorf("Walk through %s/. wanted EINVAL, got %v", name, err) + } + if _, _, err := f.Walk([]string{".."}); err != syscall.EINVAL { + t.Errorf("Walk through %s/.. wanted EINVAL, got %v", name, err) + } + } +} + +// fileGenerator is a function to generate files via walk or create. +// +// Examples are: +// - walkHelper +// - walkAndOpenHelper +// - createHelper +type fileGenerator func(*Harness, string, p9.File) (*Mock, *Mock, p9.File) + +// walkHelper walks to the given file. +// +// The backends of the parent and walked file are returned, as well as the +// walked client file. 
+func walkHelper(h *Harness, name string, dir p9.File) (parentBackend *Mock, walkedBackend *Mock, walked p9.File) {
+	_, parent, err := dir.Walk(nil)
+	if err != nil {
+		h.t.Fatalf("got walk err %v, want nil", err)
+	}
+	defer parent.Close()
+	parentBackend = h.Pop(parent)
+
+	_, walked, err = parent.Walk([]string{name})
+	if err != nil {
+		h.t.Fatalf("got walk err %v, want nil", err)
+	}
+	walkedBackend = h.Pop(walked)
+
+	return parentBackend, walkedBackend, walked
+}
+
+// walkAndOpenHelper additionally opens the walked file, if possible.
+func walkAndOpenHelper(h *Harness, name string, dir p9.File) (*Mock, *Mock, p9.File) {
+	parentBackend, walkedBackend, walked := walkHelper(h, name, dir)
+	if p9.CanOpen(walkedBackend.Attr.Mode) {
+		// Open for all file types that we can. We stick to a read-only
+		// open here because directories may not be opened otherwise.
+		walkedBackend.EXPECT().Open(p9.ReadOnly).Times(1)
+		if _, _, _, err := walked.Open(p9.ReadOnly); err != nil {
+			h.t.Errorf("got open err %v, want nil", err)
+		}
+	} else {
+		// ... or assert an error for others.
+		if _, _, _, err := walked.Open(p9.ReadOnly); err != syscall.EINVAL {
+			h.t.Errorf("got open err %v, want EINVAL", err)
+		}
+	}
+	return parentBackend, walkedBackend, walked
+}
+
+// createHelper creates the given file and returns the parent directory,
+// created file and client file, which must be closed when done.
+func createHelper(h *Harness, name string, dir p9.File) (*Mock, *Mock, p9.File) {
+	// Clone the directory first, since Create replaces the existing file.
+	// We change the type after calling create.
+	_, dirThenFile, err := dir.Walk(nil)
+	if err != nil {
+		h.t.Fatalf("got walk err %v, want nil", err)
+	}
+
+	// Create a new server-side file. On the server-side, a new file is
+	// returned from a create call. The client will reuse the same file,
+	// but we still expect the normal chain of closes. This complicates
+	// things a bit because the "parent" will always chain to the cloned
+	// dir above.
+	dirBackend := h.Pop(dirThenFile)   // New backend directory.
+	newFile := h.NewFile()(dirBackend) // New file with backend parent.
+	dirBackend.EXPECT().Create(name, gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, newFile, newFile.QID, uint32(0), nil)
+
+	// Create via the client.
+	_, dirThenFile, _, _, err = dirThenFile.Create(name, p9.ReadOnly, 0, 0, 0)
 	if err != nil {
-		t.Fatalf("could not create temporary file: %v", err)
+		h.t.Fatalf("got create err %v, want nil", err)
+	}
+
+	// Ensure subsequent walks succeed.
+	dirBackend.AddChild(name, h.NewFile())
+	return dirBackend, newFile, dirThenFile
+}
+
+// deprecatedRemover allows us to access the deprecated Remove operation within
+// the p9.File client object.
+type deprecatedRemover interface {
+	Remove() error
+}
+
+// checkDeleted asserts that relevant methods fail for an unlinked file.
+//
+// This function will close the file at the end.
+func checkDeleted(h *Harness, file p9.File) {
+	defer file.Close() // See doc.
+
+	if _, _, _, err := file.Open(p9.ReadOnly); err != syscall.EINVAL {
+		h.t.Errorf("open while deleted, got %v, want EINVAL", err)
+	}
+	if _, _, _, _, err := file.Create("created", p9.ReadOnly, 0, 0, 0); err != syscall.EINVAL {
+		h.t.Errorf("create while deleted, got %v, want EINVAL", err)
+	}
+	if _, err := file.Symlink("old", "new", 0, 0); err != syscall.EINVAL {
+		h.t.Errorf("symlink while deleted, got %v, want EINVAL", err)
+	}
+	// N.B.
This link is technically invalid, but if a call to link is
+	// actually made in the backend then the mock will panic.
+	if err := file.Link(file, "new"); err != syscall.EINVAL {
+		h.t.Errorf("link while deleted, got %v, want EINVAL", err)
+	}
+	if err := file.RenameAt("src", file, "dst"); err != syscall.EINVAL {
+		h.t.Errorf("renameAt while deleted, got %v, want EINVAL", err)
+	}
+	if err := file.UnlinkAt("file", 0); err != syscall.EINVAL {
+		h.t.Errorf("unlinkAt while deleted, got %v, want EINVAL", err)
+	}
+	if err := file.Rename(file, "dst"); err != syscall.EINVAL {
+		h.t.Errorf("rename while deleted, got %v, want EINVAL", err)
+	}
+	if _, err := file.Readlink(); err != syscall.EINVAL {
+		h.t.Errorf("readlink while deleted, got %v, want EINVAL", err)
+	}
+	if _, err := file.Mkdir("dir", p9.ModeDirectory, 0, 0); err != syscall.EINVAL {
+		h.t.Errorf("mkdir while deleted, got %v, want EINVAL", err)
+	}
+	if _, err := file.Mknod("dir", p9.ModeDirectory, 0, 0, 0, 0); err != syscall.EINVAL {
+		h.t.Errorf("mknod while deleted, got %v, want EINVAL", err)
+	}
+	if _, err := file.Readdir(0, 1); err != syscall.EINVAL {
+		h.t.Errorf("readdir while deleted, got %v, want EINVAL", err)
+	}
+	if _, err := file.Connect(p9.ConnectFlags(0)); err != syscall.EINVAL {
+		h.t.Errorf("connect while deleted, got %v, want EINVAL", err)
+	}
+
+	// The remove method is technically deprecated, but we want to ensure
+	// that it still checks for deleted appropriately. We must first clone
+	// the file because remove is equivalent to close.
+	_, newFile, err := file.Walk(nil)
+	if err == syscall.EBUSY {
+		// We can't walk from here because this reference is open
+		// already. Okay, we will also have unopened cases through
+		// TestUnlink, just skip the remove operation for now.
+		return
+	} else if err != nil {
+		h.t.Fatalf("clone failed, got %v, want nil", err)
+	}
+	if err := newFile.(deprecatedRemover).Remove(); err != syscall.EINVAL {
+		h.t.Errorf("remove while deleted, got %v, want EINVAL", err)
+	}
+}
+
+// deleter is a function to remove a file.
+type deleter func(parent p9.File, name string) error
+
+// unlinkAt is a deleter.
+func unlinkAt(parent p9.File, name string) error {
+	// Call unlink. Note that a filesystem may normally impose additional
+	// constraints on unlinkat success, such as ensuring that a directory is
+	// empty, requiring AT_REMOVEDIR in flags to remove a directory, etc.
+	// None of that is required internally (entire trees can be marked
+	// deleted when this operation succeeds), so the mock will succeed.
+	return parent.UnlinkAt(name, 0)
+}
+
+// remove is a deleter.
+func remove(parent p9.File, name string) error {
+	// See notes above re: remove.
+	_, newFile, err := parent.Walk([]string{name})
+	if err != nil {
+		// Should not be expected.
+		return err
+	}
+
+	// Do the actual remove.
+	if err := newFile.(deprecatedRemover).Remove(); err != nil {
+		return err
+	}
+
+	// Ensure that the remove closed the file.
+	if err := newFile.(deprecatedRemover).Remove(); err != syscall.EBADF {
+		return syscall.EBADF // Propagate this code.
+	}
+
+	return nil
+}
+
+// unlinkHelper unlinks the noted path, and ensures that all relevant
+// operations on that path, acquired from multiple paths, start failing.
+func unlinkHelper(h *Harness, root p9.File, targetNames []string, targetGen fileGenerator, deleteFn deleter) {
+	// name is the file to be unlinked.
+	name := targetNames[len(targetNames)-1]
+
+	// Walk to the directory containing the target.
+ _, parent, err := root.Walk(targetNames[:len(targetNames)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer parent.Close() + parentBackend := h.Pop(parent) + + // Walk to or generate the target file. + _, _, target := targetGen(h, name, parent) + defer checkDeleted(h, target) + + // Walk to a second reference. + _, second, err := parent.Walk([]string{name}) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer checkDeleted(h, second) + + // Walk to a third reference, from the start. + _, third, err := root.Walk(targetNames) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer checkDeleted(h, third) + + // This will be translated in the backend to an unlinkat. + parentBackend.EXPECT().UnlinkAt(name, uint32(0)).Return(nil) + + // Actually perform the deletion. + if err := deleteFn(parent, name); err != nil { + h.t.Fatalf("got delete err %v, want nil", err) + } +} + +func unlinkTest(t *testing.T, targetNames []string, targetGen fileGenerator) { + t.Run(fmt.Sprintf("unlinkAt(%s)", strings.Join(targetNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + unlinkHelper(h, root, targetNames, targetGen, unlinkAt) + }) + t.Run(fmt.Sprintf("remove(%s)", strings.Join(targetNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + unlinkHelper(h, root, targetNames, targetGen, remove) + }) +} + +func TestUnlink(t *testing.T) { + // Unlink all files. + for name := range newTypeMap(nil) { + unlinkTest(t, []string{name}, walkHelper) + unlinkTest(t, []string{name}, walkAndOpenHelper) + unlinkTest(t, []string{"one", name}, walkHelper) + unlinkTest(t, []string{"one", name}, walkAndOpenHelper) + unlinkTest(t, []string{"one", "two", name}, walkHelper) + unlinkTest(t, []string{"one", "two", name}, walkAndOpenHelper) + } + + // Unlink a directory. + unlinkTest(t, []string{"one"}, walkHelper) + unlinkTest(t, []string{"one"}, walkAndOpenHelper) + unlinkTest(t, []string{"one", "two"}, walkHelper) + unlinkTest(t, []string{"one", "two"}, walkAndOpenHelper) + + // Unlink created files. + unlinkTest(t, []string{"created"}, createHelper) + unlinkTest(t, []string{"one", "created"}, createHelper) + unlinkTest(t, []string{"one", "two", "created"}, createHelper) +} + +func TestUnlinkAtInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.UnlinkAt(invalidName, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +// expectRenamed asserts an ordered sequence of rename calls, based on all the +// elements in elements being the source, and the first element therein +// changing to dstName, parented at dstParent. +func expectRenamed(file *Mock, elements []string, dstParent *Mock, dstName string) *gomock.Call { + if len(elements) > 0 { + // Recurse to the parent, if necessary. + call := expectRenamed(file.parent, elements[:len(elements)-1], dstParent, dstName) + + // Recursive case: this element is unchanged, but should have + // it's hook called after the parent. + return file.EXPECT().Renamed(file.parent, elements[len(elements)-1]).Do(func(p p9.File, _ string) { + file.parent = p.(*Mock) + }).After(call) + } + + // Base case: this is the changed element. 
+ return file.EXPECT().Renamed(dstParent, dstName).Do(func(p p9.File, name string) { + file.parent = p.(*Mock) + }) +} + +// renamer is a rename function. +type renamer func(h *Harness, srcParent, dstParent p9.File, origName, newName string, selfRename bool) error + +// renameAt is a renamer. +func renameAt(_ *Harness, srcParent, dstParent p9.File, srcName, dstName string, selfRename bool) error { + return srcParent.RenameAt(srcName, dstParent, dstName) +} + +// rename is a renamer. +func rename(h *Harness, srcParent, dstParent p9.File, srcName, dstName string, selfRename bool) error { + _, f, err := srcParent.Walk([]string{srcName}) + if err != nil { + return err + } + defer f.Close() + if !selfRename { + backend := h.Pop(f) + backend.EXPECT().Renamed(gomock.Any(), dstName).Do(func(p p9.File, name string) { + backend.parent = p.(*Mock) // Required for close ordering. + }) + } + return f.Rename(dstParent, dstName) +} + +// renameHelper executes a rename, and asserts that all relevant elements +// receive expected notifications. If overwriting a file, this includes +// ensuring that the target has been appropriately marked as unlinked. +func renameHelper(h *Harness, root p9.File, srcNames []string, dstNames []string, target fileGenerator, renameFn renamer) { + // Walk to the directory containing the target. + srcQID, targetParent, err := root.Walk(srcNames[:len(srcNames)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer targetParent.Close() + targetParentBackend := h.Pop(targetParent) + + // Walk to or generate the target file. + _, targetBackend, src := target(h, srcNames[len(srcNames)-1], targetParent) + defer src.Close() + + // Walk to a second reference. + _, second, err := targetParent.Walk([]string{srcNames[len(srcNames)-1]}) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer second.Close() + secondBackend := h.Pop(second) + + // Walk to a third reference, from the start. + _, third, err := root.Walk(srcNames) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer third.Close() + thirdBackend := h.Pop(third) + + // Find the common suffix to identify the rename parent. + var ( + renameDestPath []string + renameSrcPath []string + selfRename bool + ) + for i := 1; i <= len(srcNames) && i <= len(dstNames); i++ { + if srcNames[len(srcNames)-i] != dstNames[len(dstNames)-i] { + // Take the full prefix of dstNames up until this + // point, including the first mismatched name. The + // first mismatch must be the renamed entry. + renameDestPath = dstNames[:len(dstNames)-i+1] + renameSrcPath = srcNames[:len(srcNames)-i+1] + + // Does the renameDestPath fully contain the + // renameSrcPath here? If yes, then this is a mismatch. + // We can't rename the src to some subpath of itself. + if len(renameDestPath) > len(renameSrcPath) && + reflect.DeepEqual(renameDestPath[:len(renameSrcPath)], renameSrcPath) { + renameDestPath = nil + renameSrcPath = nil + continue + } + break + } + } + if len(renameSrcPath) == 0 || len(renameDestPath) == 0 { + // This must be a rename to self, or a tricky look-alike. This + // happens iff we fail to find a suitable divergence in the two + // paths. It's a true self move if the path length is the same. + renameDestPath = dstNames + renameSrcPath = srcNames + selfRename = len(srcNames) == len(dstNames) + } + + // Walk to the source parent. 
+ _, srcParent, err := root.Walk(renameSrcPath[:len(renameSrcPath)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer srcParent.Close() + srcParentBackend := h.Pop(srcParent) + + // Walk to the destination parent. + _, dstParent, err := root.Walk(renameDestPath[:len(renameDestPath)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer dstParent.Close() + dstParentBackend := h.Pop(dstParent) + + // expectedErr is the result of the rename operation. + var expectedErr error + + // Walk to the target file, if one exists. + dstQID, dst, err := root.Walk(renameDestPath) + if err == nil { + if !selfRename && srcQID[0].Type == dstQID[0].Type { + // If there is a destination file, and is it of the + // same type as the source file, then we expect the + // rename to succeed. We expect the destination file to + // be deleted, so we run a deletion test on it in this + // case. + defer checkDeleted(h, dst) + } else { + if !selfRename { + // If the type is different than the + // destination, then we expect the rename to + // fail. We expect ensure that this is + // returned. + expectedErr = syscall.EINVAL + } else { + // This is the file being renamed to itself. + // This is technically allowed and a no-op, but + // all the triggers will fire. + } + dst.Close() + } + } + dstName := renameDestPath[len(renameDestPath)-1] // Renamed element. + srcName := renameSrcPath[len(renameSrcPath)-1] // Renamed element. + if expectedErr == nil && !selfRename { + // Expect all to be renamed appropriately. Note that if this is + // a final file being renamed, then we expect the file to be + // called with the new parent. If not, then we expect the + // rename hook to be called, but the parent will remain + // unchanged. + elements := srcNames[len(renameSrcPath):] + expectRenamed(targetBackend, elements, dstParentBackend, dstName) + expectRenamed(secondBackend, elements, dstParentBackend, dstName) + expectRenamed(thirdBackend, elements, dstParentBackend, dstName) + + // The target parent has also been opened, and may be moved + // directly or indirectly. + if len(elements) > 1 { + expectRenamed(targetParentBackend, elements[:len(elements)-1], dstParentBackend, dstName) + } + } + + // Expect the rename if it's not the same file. Note that like unlink, + // renames are always translated to the at variant in the backend. + if !selfRename { + srcParentBackend.EXPECT().RenameAt(srcName, dstParentBackend, dstName).Return(expectedErr) + } + + // Perform the actual rename; everything has been lined up. + if err := renameFn(h, srcParent, dstParent, srcName, dstName, selfRename); err != expectedErr { + h.t.Fatalf("got rename err %v, want %v", err, expectedErr) + } +} + +func renameTest(t *testing.T, srcNames []string, dstNames []string, target fileGenerator) { + t.Run(fmt.Sprintf("renameAt(%s->%s)", strings.Join(srcNames, "/"), strings.Join(dstNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + renameHelper(h, root, srcNames, dstNames, target, renameAt) + }) + t.Run(fmt.Sprintf("rename(%s->%s)", strings.Join(srcNames, "/"), strings.Join(dstNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + renameHelper(h, root, srcNames, dstNames, target, rename) + }) +} + +func TestRename(t *testing.T) { + // In-directory rename, simple case. + for name := range newTypeMap(nil) { + // Within the root. 
+ renameTest(t, []string{name}, []string{"renamed"}, walkHelper) + renameTest(t, []string{name}, []string{"renamed"}, walkAndOpenHelper) + + // Within a subdirectory. + renameTest(t, []string{"one", name}, []string{"one", "renamed"}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", "renamed"}, walkAndOpenHelper) + } + + // ... with created files. + renameTest(t, []string{"created"}, []string{"renamed"}, createHelper) + renameTest(t, []string{"one", "created"}, []string{"one", "renamed"}, createHelper) + + // Across directories. + for name := range newTypeMap(nil) { + // Down one level. + renameTest(t, []string{"one", name}, []string{"one", "two", "renamed"}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", "two", "renamed"}, walkAndOpenHelper) + + // Up one level. + renameTest(t, []string{"one", "two", name}, []string{"one", "renamed"}, walkHelper) + renameTest(t, []string{"one", "two", name}, []string{"one", "renamed"}, walkAndOpenHelper) + + // Across at the same level. + renameTest(t, []string{"one", name}, []string{"three", "renamed"}, walkHelper) + renameTest(t, []string{"one", name}, []string{"three", "renamed"}, walkAndOpenHelper) + } + + // ... with created files. + renameTest(t, []string{"one", "created"}, []string{"one", "two", "renamed"}, createHelper) + renameTest(t, []string{"one", "two", "created"}, []string{"one", "renamed"}, createHelper) + renameTest(t, []string{"one", "created"}, []string{"three", "renamed"}, createHelper) + + // Renaming parents. + for name := range newTypeMap(nil) { + // Rename a parent. + renameTest(t, []string{"one", name}, []string{"renamed", name}, walkHelper) + renameTest(t, []string{"one", name}, []string{"renamed", name}, walkAndOpenHelper) + + // Rename a super parent. + renameTest(t, []string{"one", "two", name}, []string{"renamed", name}, walkHelper) + renameTest(t, []string{"one", "two", name}, []string{"renamed", name}, walkAndOpenHelper) + } + + // ... with created files. + renameTest(t, []string{"one", "created"}, []string{"renamed", "created"}, createHelper) + renameTest(t, []string{"one", "two", "created"}, []string{"renamed", "created"}, createHelper) + + // Over existing files, including itself. + for name := range newTypeMap(nil) { + for other := range newTypeMap(nil) { + // Overwrite the noted file (may be itself). + renameTest(t, []string{"one", name}, []string{"one", other}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", other}, walkAndOpenHelper) + + // Overwrite other files in another directory. + renameTest(t, []string{"one", name}, []string{"one", "two", other}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", "two", other}, walkAndOpenHelper) + } + + // Overwrite by moving the parent. + renameTest(t, []string{"three", name}, []string{"one", name}, walkHelper) + renameTest(t, []string{"three", name}, []string{"one", name}, walkAndOpenHelper) + + // Create over the types. 
+ renameTest(t, []string{"one", "created"}, []string{"one", name}, createHelper) + renameTest(t, []string{"one", "created"}, []string{"one", "two", name}, createHelper) + renameTest(t, []string{"three", "created"}, []string{"one", name}, createHelper) + } +} + +func TestRenameInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.Rename(root, invalidName); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestRenameAtInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.RenameAt(invalidName, root, "okay"); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + if err := root.RenameAt("okay", root, invalidName); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestReadlink(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, f, err := root.Walk([]string{name}) + if err != nil { + t.Fatalf("walk failed: got %v, wanted nil", err) + } + defer f.Close() + backend := h.Pop(f) + + const symlinkTarget = "symlink-target" + + if backend.Attr.Mode.IsSymlink() { + // This should only go through on symlinks. + backend.EXPECT().Readlink().Return(symlinkTarget, nil) + } + + // Attempt a Readlink operation. + target, err := f.Readlink() + if err != nil && err != syscall.EINVAL { + t.Errorf("readlink got %v, wanted EINVAL", err) + } else if err == nil && target != symlinkTarget { + t.Errorf("readlink got %v, wanted %v", target, symlinkTarget) + } + }) + } +} + +// fdTest is a wrapper around operations that may send file descriptors. This +// asserts that the file descriptors are working as intended. +func fdTest(t *testing.T, sendFn func(*fd.FD) *fd.FD) { + // Create a pipe that we can read from. + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("unable to create pipe: %v", err) + } + defer r.Close() + defer w.Close() + + // Attempt to send the write end. + wFD, err := fd.NewFromFile(w) + if err != nil { + t.Fatalf("unable to convert file: %v", err) + } + defer wFD.Close() // This is a copy. + + // Send wFD and receive newFD. + newFD := sendFn(wFD) + defer newFD.Close() + + // Attempt to write. + const message = "hello" + if _, err := newFD.Write([]byte(message)); err != nil { + t.Fatalf("write got %v, wanted nil", err) + } + + // Should see the message on our end. + buffer := []byte(message) + if _, err := io.ReadFull(r, buffer); err != nil { + t.Fatalf("read got %v, wanted nil", err) + } + if string(buffer) != message { + t.Errorf("got message %v, wanted %v", string(buffer), message) + } +} + +func TestConnect(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Catch all the non-socket cases. + if !backend.Attr.Mode.IsSocket() { + // This has been set up to fail if Connect is called. 
+ if _, err := f.Connect(p9.ConnectFlags(0)); err != syscall.EINVAL { + t.Errorf("connect got %v, wanted EINVAL", err) + } + return + } + + // Ensure the fd exchange works. + fdTest(t, func(send *fd.FD) *fd.FD { + backend.EXPECT().Connect(p9.ConnectFlags(0)).Return(send, nil) + recv, err := backend.Connect(p9.ConnectFlags(0)) + if err != nil { + t.Fatalf("connect got %v, wanted nil", err) + } + return recv + }) + }) + } +} + +func TestReaddir(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Catch all the non-directory cases. + if !backend.Attr.Mode.IsDir() { + // This has also been set up to fail if Readdir is called. + if _, err := f.Readdir(0, 1); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + return + } + + // Ensure that readdir works for directories. + if _, err := f.Readdir(0, 1); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + if _, _, _, err := f.Open(p9.ReadWrite); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + if _, _, _, err := f.Open(p9.WriteOnly); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + backend.EXPECT().Open(p9.ReadOnly).Times(1) + if _, _, _, err := f.Open(p9.ReadOnly); err != nil { + t.Errorf("readdir got %v, wanted nil", err) + } + backend.EXPECT().Readdir(uint64(0), uint32(1)).Times(1) + if _, err := f.Readdir(0, 1); err != nil { + t.Errorf("readdir got %v, wanted nil", err) + } + }) + } +} + +func TestOpen(t *testing.T) { + type openTest struct { + name string + mode p9.OpenFlags + err error + match func(p9.FileMode) bool + } + + cases := []openTest{ + { + name: "invalid", + mode: ^p9.OpenFlagsModeMask, + err: syscall.EINVAL, + match: func(p9.FileMode) bool { return true }, + }, + { + name: "not-openable-read-only", + mode: p9.ReadOnly, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) }, + }, + { + name: "not-openable-write-only", + mode: p9.WriteOnly, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) }, + }, + { + name: "not-openable-read-write", + mode: p9.ReadWrite, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) }, + }, + { + name: "directory-read-only", + mode: p9.ReadOnly, + err: nil, + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + }, + { + name: "directory-read-write", + mode: p9.ReadWrite, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + }, + { + name: "directory-write-only", + mode: p9.WriteOnly, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + }, + { + name: "read-only", + mode: p9.ReadOnly, + err: nil, + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) }, + }, + { + name: "write-only", + mode: p9.WriteOnly, + err: nil, + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() }, + }, + { + name: "read-write", + mode: p9.ReadWrite, + err: nil, + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() }, + }, + } + + // Open(mode OpenFlags) (*fd.FD, QID, uint32, error) + // - only works on Regular, NamedPipe, BLockDevice, CharacterDevice + // - returning a file works as expected + for name := range newTypeMap(nil) { + for _, tc := range cases 
{ + t.Run(fmt.Sprintf("%s-%s", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Does this match the case? + if !tc.match(backend.Attr.Mode) { + t.SkipNow() + } + + // Ensure open-required operations fail. + if _, err := f.ReadAt([]byte("hello"), 0); err != syscall.EINVAL { + t.Errorf("readAt got %v, wanted EINVAL", err) + } + if _, err := f.WriteAt(make([]byte, 6), 0); err != syscall.EINVAL { + t.Errorf("writeAt got %v, wanted EINVAL", err) + } + if err := f.FSync(); err != syscall.EINVAL { + t.Errorf("fsync got %v, wanted EINVAL", err) + } + if _, err := f.Readdir(0, 1); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + + // Attempt the given open. + if tc.err != nil { + // We expect an error, just test and return. + if _, _, _, err := f.Open(tc.mode); err != tc.err { + t.Fatalf("open with mode %v got %v, want %v", tc.mode, err, tc.err) + } + return + } + + // Run an FD test, since we expect success. + fdTest(t, func(send *fd.FD) *fd.FD { + backend.EXPECT().Open(tc.mode).Return(send, p9.QID{}, uint32(0), nil).Times(1) + recv, _, _, err := f.Open(tc.mode) + if err != tc.err { + t.Fatalf("open with mode %v got %v, want %v", tc.mode, err, tc.err) + } + return recv + }) + + // If the open was successful, attempt another one. + if _, _, _, err := f.Open(tc.mode); err != syscall.EINVAL { + t.Errorf("second open with mode %v got %v, want EINVAL", tc.mode, err) + } + + // Ensure that all illegal operations fail. + if _, _, err := f.Walk(nil); err != syscall.EINVAL && err != syscall.EBUSY { + t.Errorf("walk got %v, wanted EINVAL or EBUSY", err) + } + if _, _, _, _, err := f.WalkGetAttr(nil); err != syscall.EINVAL && err != syscall.EBUSY { + t.Errorf("walkgetattr got %v, wanted EINVAL or EBUSY", err) + } + }) + } + } +} + +func TestClose(t *testing.T) { + type closeTest struct { + name string + closeFn func(backend *Mock, f p9.File) + } + + cases := []closeTest{ + { + name: "close", + closeFn: func(_ *Mock, f p9.File) { + f.Close() + }, + }, + { + name: "remove", + closeFn: func(backend *Mock, f p9.File) { + // Allow the rename call in the parent, automatically translated. + backend.parent.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Times(1) + f.(deprecatedRemover).Remove() + }, + }, } - os.Remove(osFile.Name()) - hfi, err := osFile.Stat() - if err != nil { - osFile.Close() - t.Fatalf("stat failed: %v", err) + for name := range newTypeMap(nil) { + for _, tc := range cases { + t.Run(fmt.Sprintf("%s(%s)", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + + // Close via the prescribed method. + tc.closeFn(backend, f) + + // Everything should fail with EBADF. 
+ if _, _, err := f.Walk(nil); err != syscall.EBADF { + t.Errorf("walk got %v, wanted EBADF", err) + } + if _, err := f.StatFS(); err != syscall.EBADF { + t.Errorf("statfs got %v, wanted EBADF", err) + } + if _, _, _, err := f.GetAttr(p9.AttrMaskAll()); err != syscall.EBADF { + t.Errorf("getattr got %v, wanted EBADF", err) + } + if err := f.SetAttr(p9.SetAttrMask{}, p9.SetAttr{}); err != syscall.EBADF { + t.Errorf("setattrk got %v, wanted EBADF", err) + } + if err := f.Rename(root, "new-name"); err != syscall.EBADF { + t.Errorf("rename got %v, wanted EBADF", err) + } + if err := f.Close(); err != syscall.EBADF { + t.Errorf("close got %v, wanted EBADF", err) + } + if _, _, _, err := f.Open(p9.ReadOnly); err != syscall.EBADF { + t.Errorf("open got %v, wanted EBADF", err) + } + if _, err := f.ReadAt([]byte("hello"), 0); err != syscall.EBADF { + t.Errorf("readAt got %v, wanted EBADF", err) + } + if _, err := f.WriteAt(make([]byte, 6), 0); err != syscall.EBADF { + t.Errorf("writeAt got %v, wanted EBADF", err) + } + if err := f.FSync(); err != syscall.EBADF { + t.Errorf("fsync got %v, wanted EBADF", err) + } + if _, _, _, _, err := f.Create("new-file", p9.ReadWrite, 0, 0, 0); err != syscall.EBADF { + t.Errorf("create got %v, wanted EBADF", err) + } + if _, err := f.Mkdir("new-directory", 0, 0, 0); err != syscall.EBADF { + t.Errorf("mkdir got %v, wanted EBADF", err) + } + if _, err := f.Symlink("old-name", "new-name", 0, 0); err != syscall.EBADF { + t.Errorf("symlink got %v, wanted EBADF", err) + } + if err := f.Link(root, "new-name"); err != syscall.EBADF { + t.Errorf("link got %v, wanted EBADF", err) + } + if _, err := f.Mknod("new-block-device", 0, 0, 0, 0, 0); err != syscall.EBADF { + t.Errorf("mknod got %v, wanted EBADF", err) + } + if err := f.RenameAt("old-name", root, "new-name"); err != syscall.EBADF { + t.Errorf("renameAt got %v, wanted EBADF", err) + } + if err := f.UnlinkAt("name", 0); err != syscall.EBADF { + t.Errorf("unlinkAt got %v, wanted EBADF", err) + } + if _, err := f.Readdir(0, 1); err != syscall.EBADF { + t.Errorf("readdir got %v, wanted EBADF", err) + } + if _, err := f.Readlink(); err != syscall.EBADF { + t.Errorf("readlink got %v, wanted EBADF", err) + } + if err := f.Flush(); err != syscall.EBADF { + t.Errorf("flush got %v, wanted EBADF", err) + } + if _, _, _, _, err := f.WalkGetAttr(nil); err != syscall.EBADF { + t.Errorf("walkgetattr got %v, wanted EBADF", err) + } + if _, err := f.Connect(p9.ConnectFlags(0)); err != syscall.EBADF { + t.Errorf("connect got %v, wanted EBADF", err) + } + }) + } + } +} + +// onlyWorksOnOpenThings is a helper test method for operations that should +// only work on files that have been explicitly opened. +func onlyWorksOnOpenThings(h *Harness, t *testing.T, name string, root p9.File, mode p9.OpenFlags, expectedErr error, fn func(backend *Mock, f p9.File, shouldSucceed bool) error) { + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Does it work before opening? + if err := fn(backend, f, false); err != syscall.EINVAL { + t.Errorf("operation got %v, wanted EINVAL", err) } - osFileStat := hfi.Sys().(*syscall.Stat_t) - f, err := fd.NewFromFile(osFile) - // osFile should always be closed. - osFile.Close() - if err != nil { - t.Fatalf("unable to create file: %v", err) + // Is this openable? + if !p9.CanOpen(backend.Attr.Mode) { + return // Nothing to do. + } + + // If this is a directory, we can't handle writing. 
+ if backend.Attr.Mode.IsDir() && (mode == p9.ReadWrite || mode == p9.WriteOnly) { + return // Skip. + } + + // Open the file. + backend.EXPECT().Open(mode) + if _, _, _, err := f.Open(mode); err != nil { + t.Fatalf("open got %v, wanted nil", err) + } + + // Attempt the operation. + if err := fn(backend, f, expectedErr == nil); err != expectedErr { + t.Fatalf("operation got %v, wanted %v", err, expectedErr) + } +} + +func TestRead(t *testing.T) { + type readTest struct { + name string + mode p9.OpenFlags + err error + } + + cases := []readTest{ + { + name: "read-only", + mode: p9.ReadOnly, + err: nil, + }, + { + name: "read-write", + mode: p9.ReadWrite, + err: nil, + }, + { + name: "write-only", + mode: p9.WriteOnly, + err: syscall.EPERM, + }, + } + + for name := range newTypeMap(nil) { + for _, tc := range cases { + t.Run(fmt.Sprintf("%s-%s", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + const message = "hello" + + onlyWorksOnOpenThings(h, t, name, root, tc.mode, tc.err, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if !shouldSucceed { + _, err := f.ReadAt([]byte(message), 0) + return err + } + + // Prepare for the call to readAt in the backend. + backend.EXPECT().ReadAt(gomock.Any(), uint64(0)).Do(func(p []byte, offset uint64) { + copy(p, message) + }).Return(len(message), nil) + + // Make the client call. + p := make([]byte, 2*len(message)) // Double size. + n, err := f.ReadAt(p, 0) + + // Sanity check result. + if err != nil { + return err + } + if n != len(message) { + t.Fatalf("message length incorrect, got %d, want %d", n, len(message)) + } + if !bytes.Equal(p[:n], []byte(message)) { + t.Fatalf("message incorrect, got %v, want %v", p, []byte(message)) + } + return nil // Success. + }) + }) + } + } +} + +func TestWrite(t *testing.T) { + type writeTest struct { + name string + mode p9.OpenFlags + err error } - // Craft attacher to attach to the mocked file which will return our - // temporary file. - fileMock := &FileMock{ - OpenMock: OpenMock{File: f}, - GetAttrMock: GetAttrMock{ - // The mode must be valid always. - Valid: p9.AttrMask{Mode: true}, + cases := []writeTest{ + { + name: "read-only", + mode: p9.ReadOnly, + err: syscall.EPERM, + }, + { + name: "read-write", + mode: p9.ReadWrite, + err: nil, + }, + { + name: "write-only", + mode: p9.WriteOnly, + err: nil, }, } - attacher := &AttachMock{ - File: fileMock, + + for name := range newTypeMap(nil) { + for _, tc := range cases { + t.Run(fmt.Sprintf("%s-%s", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + const message = "hello" + + onlyWorksOnOpenThings(h, t, name, root, tc.mode, tc.err, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if !shouldSucceed { + _, err := f.WriteAt([]byte(message), 0) + return err + } + + // Prepare for the call to readAt in the backend. + var output []byte // Saved by Do below. + backend.EXPECT().WriteAt(gomock.Any(), uint64(0)).Do(func(p []byte, offset uint64) { + output = p + }).Return(len(message), nil) + + // Make the client call. + n, err := f.WriteAt([]byte(message), 0) + + // Sanity check result. + if err != nil { + return err + } + if n != len(message) { + t.Fatalf("message length incorrect, got %d, want %d", n, len(message)) + } + if !bytes.Equal(output, []byte(message)) { + t.Fatalf("message incorrect, got %v, want %v", output, []byte(message)) + } + return nil // Success. 
+ }) + }) + } } +} - // Make socket pair. - serverSocket, clientSocket, err := unet.SocketPair(false) - if err != nil { - t.Fatalf("socketpair got err %v wanted nil", err) +func TestFSync(t *testing.T) { + for name := range newTypeMap(nil) { + for _, mode := range []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} { + t.Run(fmt.Sprintf("%s-%s", mode, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnOpenThings(h, t, name, root, mode, nil, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().FSync().Times(1) + } + return f.FSync() + }) + }) + } } - defer clientSocket.Close() - server := p9.NewServer(attacher) - go server.Handle(serverSocket) - client, err := p9.NewClient(clientSocket, 1024*1024 /* 1M message size */, p9.HighestVersionString()) - if err != nil { - t.Fatalf("new client got %v, expected nil", err) +} + +func TestFlush(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + backend.EXPECT().Flush() + f.Flush() + }) } +} - // Attach to the mocked file. - cFile, err := client.Attach("") - if err != nil { - t.Fatalf("attach failed: %v", err) +// onlyWorksOnDirectories is a helper test method for operations that should +// only work on unopened directories, such as create, mkdir and symlink. +func onlyWorksOnDirectories(h *Harness, t *testing.T, name string, root p9.File, fn func(backend *Mock, f p9.File, shouldSucceed bool) error) { + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Only directories support mknod. + if !backend.Attr.Mode.IsDir() { + if err := fn(backend, f, false); err != syscall.EINVAL { + t.Errorf("operation got %v, wanted EINVAL", err) + } + return // Nothing else to do. } - // Try to open the mocked file. - clientHostFile, _, _, err := cFile.Open(0) - if err != nil { - t.Fatalf("open failed: %v", err) + // Should succeed. + if err := fn(backend, f, true); err != nil { + t.Fatalf("operation got %v, wanted nil", err) } - var clientStat syscall.Stat_t - if err := syscall.Fstat(clientHostFile.FD(), &clientStat); err != nil { - t.Fatalf("stat failed: %v", err) + + // Open the directory. + backend.EXPECT().Open(p9.ReadOnly).Times(1) + if _, _, _, err := f.Open(p9.ReadOnly); err != nil { + t.Fatalf("open got %v, wanted nil", err) } - // Compare inode nums to make sure it's the same file. - if clientStat.Ino != osFileStat.Ino { - t.Errorf("fd donation failed") + // Should not work again. + if err := fn(backend, f, false); err != syscall.EINVAL { + t.Fatalf("operation got %v, wanted EINVAL", err) } } -// TestClient is a megatest. -// -// This allows us to probe various edge cases, while changing the state of the -// underlying server in expected ways. The test slowly builds server state and -// is documented inline. -// -// We wind up with the following, after probing edge cases: -// -// FID 1: ServerFile (sf). -// FID 2: Directory (d). -// FID 3: File (f). -// FID 4: Symlink (s). -// -// Although you should use the FID method on the individual files. 
-func TestClient(t *testing.T) { +func TestCreate(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if !shouldSucceed { + _, _, _, _, err := f.Create("new-file", p9.ReadWrite, 0, 1, 2) + return err + } + + // If the create is going to succeed, then we + // need to create a new backend file, and we + // clone to ensure that we don't close the + // original. + _, newF, err := f.Walk(nil) + if err != nil { + t.Fatalf("clone got %v, wanted nil", err) + } + defer newF.Close() + newBackend := h.Pop(newF) + + // Run a regular FD test to validate that path. + fdTest(t, func(send *fd.FD) *fd.FD { + // Return the send FD on success. + newFile := h.NewFile()(backend) // New file with the parent backend. + newBackend.EXPECT().Create("new-file", p9.ReadWrite, p9.FileMode(0), p9.UID(1), p9.GID(2)).Return(send, newFile, p9.QID{}, uint32(0), nil) + + // Receive the fd back. + recv, _, _, _, err := newF.Create("new-file", p9.ReadWrite, 0, 1, 2) + if err != nil { + t.Fatalf("create got %v, wanted nil", err) + } + return recv + }) + + // The above will fail via normal test flow, so + // we can assume that it passed. + return nil + }) + }) + } +} + +func TestCreateInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if _, _, _, _, err := root.Create(invalidName, p9.ReadWrite, 0, 0, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestMkdir(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Mkdir("new-directory", p9.FileMode(0), p9.UID(1), p9.GID(2)) + } + _, err := f.Mkdir("new-directory", 0, 1, 2) + return err + }) + }) + } +} + +func TestMkdirInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if _, err := root.Mkdir(invalidName, 0, 0, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestSymlink(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Symlink("old-name", "new-name", p9.UID(1), p9.GID(2)) + } + _, err := f.Symlink("old-name", "new-name", 1, 2) + return err + }) + }) + } +} + +func TestSyminkInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + // We need only test for invalid names in the new name, + // the target can be an arbitrary string and we don't + // need to sanity check it. 
+ if _, err := root.Symlink("old-name", invalidName, 0, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestLink(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Link(gomock.Any(), "new-link") + } + return f.Link(f, "new-link") + }) + }) + } +} + +func TestLinkInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.Link(root, invalidName); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestMknod(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Mknod("new-block-device", p9.FileMode(0), uint32(1), uint32(2), p9.UID(3), p9.GID(4)).Times(1) + } + _, err := f.Mknod("new-block-device", 0, 1, 2, 3, 4) + return err + }) + }) + } +} + +// concurrentFn is a specification of a concurrent operation. This is used to +// drive the concurrency tests below. +type concurrentFn struct { + name string + match func(p9.FileMode) bool + op func(h *Harness, backend *Mock, f p9.File, callback func()) +} + +func concurrentTest(t *testing.T, name string, fn1, fn2 concurrentFn, sameDir, expectedOkay bool) { var ( - // Sentinel error. - sentinelErr = syscall.Errno(4383) - - // Backend mocks. - a = &AttachMock{} - sf = &FileMock{} - d = &FileMock{} - f = &FileMock{} - s = &FileMock{} - - // Client Files for the above. - sfFile p9.File + names1 []string + names2 []string ) + if sameDir { + // Use the same file one directory up. + names1, names2 = []string{"one", name}, []string{"one", name} + } else { + // For different directories, just use siblings. + names1, names2 = []string{"one", name}, []string{"three", name} + } - testSteps := []struct { - name string - fn func(*p9.Client) error - want error - }{ - { - name: "bad-attach", - want: sentinelErr, - fn: func(c *p9.Client) error { - a.File = nil - a.Err = sentinelErr - _, err := c.Attach("") - return err + t.Run(fmt.Sprintf("%s(%v)+%s(%v)", fn1.name, names1, fn2.name, names2), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to both files as given. + _, f1, err := root.Walk(names1) + if err != nil { + t.Fatalf("error walking, got %v, want nil", err) + } + defer f1.Close() + b1 := h.Pop(f1) + _, f2, err := root.Walk(names2) + if err != nil { + t.Fatalf("error walking, got %v, want nil", err) + } + defer f2.Close() + b2 := h.Pop(f2) + + // Are these a good match for the current test case? + if !fn1.match(b1.Attr.Mode) { + t.SkipNow() + } + if !fn2.match(b2.Attr.Mode) { + t.SkipNow() + } + + // Construct our "concurrency creator". + in1 := make(chan struct{}, 1) + in2 := make(chan struct{}, 1) + var top sync.WaitGroup + var fns sync.WaitGroup + defer top.Wait() + top.Add(2) // Accounting for below. 
+ defer fns.Done() + fns.Add(1) // See line above; released before top.Wait. + go func() { + defer top.Done() + fn1.op(h, b1, f1, func() { + in1 <- struct{}{} + fns.Wait() + }) + }() + go func() { + defer top.Done() + fn2.op(h, b2, f2, func() { + in2 <- struct{}{} + fns.Wait() + }) + }() + + // Compute a reasonable timeout. If we expect the operation to hang, + // give it 10 milliseconds before we assert that it's fine. After all, + // there will be a lot of these tests. If we don't expect it to hang, + // give it a full minute, since the machine could be slow. + timeout := 10 * time.Millisecond + if expectedOkay { + timeout = 1 * time.Minute + } + + // Read the first channel. + var second chan struct{} + select { + case <-in1: + second = in2 + case <-in2: + second = in1 + } + + // Catch concurrency. + select { + case <-second: + // We finished successful. Is this good? Depends on the + // expected result. + if !expectedOkay { + t.Errorf("%q and %q proceeded concurrently!", fn1.name, fn2.name) + } + case <-time.After(timeout): + // Great, things did not proceed concurrently. Is that what we + // expected? + if expectedOkay { + t.Errorf("%q and %q hung concurrently!", fn1.name, fn2.name) + } + } + }) +} + +func randomFileName() string { + return fmt.Sprintf("%x", rand.Int63()) +} + +func TestConcurrency(t *testing.T) { + readExclusive := []concurrentFn{ + { + // N.B. We can't explicitly check WalkGetAttr behavior, + // but we rely on the fact that the internal code paths + // are the same. + name: "walk", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + // See the documentation of WalkCallback. + // Because walk is actually implemented by the + // mock, we need a special place for this + // callback. + // + // Note that a clone actually locks the parent + // node. So we walk from this node to test + // concurrent operations appropriately. + backend.WalkCallback = func() error { + callback() + return nil + } + f.Walk([]string{randomFileName()}) // Won't exist. }, }, { - name: "attach", - fn: func(c *p9.Client) error { - a.Called = false - a.File = sf - a.Err = nil - // The attached root must have a valid mode. - sf.GetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory} - sf.GetAttrMock.Valid = p9.AttrMask{Mode: true} - var err error - sfFile, err = c.Attach("") - if !a.Called { - t.Errorf("Attach never Called?") - } - return err + name: "fsync", + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Open(gomock.Any()) + backend.EXPECT().FSync().Do(func() { + callback() + }) + f.Open(p9.ReadOnly) // Required. + f.FSync() }, }, { - name: "bad-walk", - want: sentinelErr, - fn: func(c *p9.Client) error { - // Walk only called when WalkGetAttr not available. - sf.WalkGetAttrMock.Err = syscall.ENOSYS - sf.WalkMock.File = d - sf.WalkMock.Err = sentinelErr - _, _, err := sfFile.Walk([]string{"foo", "bar"}) - return err + name: "readdir", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Open(gomock.Any()) + backend.EXPECT().Readdir(gomock.Any(), gomock.Any()).Do(func(uint64, uint32) { + callback() + }) + f.Open(p9.ReadOnly) // Required. + f.Readdir(0, 1) }, }, { - name: "walk-to-dir", - fn: func(c *p9.Client) error { - // Walk only called when WalkGetAttr not available. 
- sf.WalkGetAttrMock.Err = syscall.ENOSYS - sf.WalkMock.Called = false - sf.WalkMock.Names = nil - sf.WalkMock.File = d - sf.WalkMock.Err = nil - sf.WalkMock.QIDs = []p9.QID{{Type: 1}} - // All intermediate values must be directories. - d.WalkGetAttrMock.Err = syscall.ENOSYS - d.WalkMock.Called = false - d.WalkMock.Names = nil - d.WalkMock.File = d // Walk to self. - d.WalkMock.Err = nil - d.WalkMock.QIDs = []p9.QID{{Type: 1}} - d.GetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory} - d.GetAttrMock.Valid = p9.AttrMask{Mode: true} - var qids []p9.QID - var err error - qids, _, err = sfFile.Walk([]string{"foo", "bar"}) - if !sf.WalkMock.Called { - t.Errorf("Walk never Called?") - } - if !d.GetAttrMock.Called { - t.Errorf("GetAttr never Called?") - } - if !reflect.DeepEqual(sf.WalkMock.Names, []string{"foo"}) { - t.Errorf("got names %v wanted []{foo}", sf.WalkMock.Names) - } - if !reflect.DeepEqual(d.WalkMock.Names, []string{"bar"}) { - t.Errorf("got names %v wanted []{bar}", d.WalkMock.Names) - } - if len(qids) != 2 || qids[len(qids)-1].Type != 1 { - t.Errorf("got qids %v wanted []{..., {Type: 1}}", qids) - } - return err + name: "readlink", + match: func(mode p9.FileMode) bool { return mode.IsSymlink() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Readlink().Do(func() { + callback() + }) + f.Readlink() }, }, { - name: "walkgetattr-to-dir", - fn: func(c *p9.Client) error { - sf.WalkGetAttrMock.Called = false - sf.WalkGetAttrMock.Names = nil - sf.WalkGetAttrMock.File = d - sf.WalkGetAttrMock.Err = nil - sf.WalkGetAttrMock.QIDs = []p9.QID{{Type: 1}} - sf.WalkGetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory, UID: 1} - sf.WalkGetAttrMock.Valid = p9.AttrMask{Mode: true} - // See above. - d.WalkGetAttrMock.Called = false - d.WalkGetAttrMock.Names = nil - d.WalkGetAttrMock.File = d // Walk to self. - d.WalkGetAttrMock.Err = nil - d.WalkGetAttrMock.QIDs = []p9.QID{{Type: 1}} - d.WalkGetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory, UID: 1} - d.WalkGetAttrMock.Valid = p9.AttrMask{Mode: true} - var qids []p9.QID - var err error - var mask p9.AttrMask - var attr p9.Attr - qids, _, mask, attr, err = sfFile.WalkGetAttr([]string{"foo", "bar"}) - if !sf.WalkGetAttrMock.Called { - t.Errorf("Walk never Called?") - } - if !reflect.DeepEqual(sf.WalkGetAttrMock.Names, []string{"foo"}) { - t.Errorf("got names %v wanted []{foo}", sf.WalkGetAttrMock.Names) - } - if !reflect.DeepEqual(d.WalkGetAttrMock.Names, []string{"bar"}) { - t.Errorf("got names %v wanted []{bar}", d.WalkGetAttrMock.Names) - } - if len(qids) != 2 || qids[len(qids)-1].Type != 1 { - t.Errorf("got qids %v wanted []{..., {Type: 1}}", qids) - } - if !reflect.DeepEqual(attr, sf.WalkGetAttrMock.Attr) { - t.Errorf("got attrs %s wanted %s", attr, sf.WalkGetAttrMock.Attr) - } - if !reflect.DeepEqual(mask, sf.WalkGetAttrMock.Valid) { - t.Errorf("got mask %s wanted %s", mask, sf.WalkGetAttrMock.Valid) - } - return err + name: "connect", + match: func(mode p9.FileMode) bool { return mode.IsSocket() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Connect(gomock.Any()).Do(func(p9.ConnectFlags) { + callback() + }) + f.Connect(0) }, }, { - name: "walk-to-file", - fn: func(c *p9.Client) error { - // Basic sanity check is done in walk-to-dir. - // - // Here we just create basic file FIDs to use. 
- sf.WalkMock.File = f - sf.WalkMock.Err = nil - var err error - _, _, err = sfFile.Walk(nil) - return err + name: "open", + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Open(gomock.Any()).Do(func(p9.OpenFlags) { + callback() + }) + f.Open(p9.ReadOnly) }, }, { - name: "walk-to-symlink", - fn: func(c *p9.Client) error { - // See note in walk-to-file. - sf.WalkMock.File = s - sf.WalkMock.Err = nil - var err error - _, _, err = sfFile.Walk(nil) - return err + name: "flush", + match: func(mode p9.FileMode) bool { return true }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Flush().Do(func() { + callback() + }) + f.Flush() + }, + }, + } + writeExclusive := []concurrentFn{ + { + // N.B. We can't really check getattr. But this is an + // extremely low-risk function, it seems likely that + // this check is paranoid anyways. + name: "setattr", + match: func(mode p9.FileMode) bool { return true }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().SetAttr(gomock.Any(), gomock.Any()).Do(func(p9.SetAttrMask, p9.SetAttr) { + callback() + }) + f.SetAttr(p9.SetAttrMask{}, p9.SetAttr{}) }, }, { - name: "bad-statfs", - want: sentinelErr, - fn: func(c *p9.Client) error { - sf.StatFSMock.Err = sentinelErr - _, err := sfFile.StatFS() - return err + name: "unlinkAt", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Do(func(string, uint32) { + callback() + }) + f.UnlinkAt(randomFileName(), 0) }, }, { - name: "statfs", - fn: func(c *p9.Client) error { - sf.StatFSMock.Called = false - sf.StatFSMock.Stat = p9.FSStat{Type: 1} - sf.StatFSMock.Err = nil - stat, err := sfFile.StatFS() - if !sf.StatFSMock.Called { - t.Errorf("StatfS never Called?") - } - if stat.Type != 1 { - t.Errorf("got stat %v wanted {Type: 1}", stat) - } - return err + name: "mknod", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Mknod(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.FileMode, uint32, uint32, p9.UID, p9.GID) { + callback() + }) + f.Mknod(randomFileName(), 0, 0, 0, 0, 0) + }, + }, + { + name: "link", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Link(gomock.Any(), gomock.Any()).Do(func(p9.File, string) { + callback() + }) + f.Link(f, randomFileName()) + }, + }, + { + name: "symlink", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Symlink(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, string, p9.UID, p9.GID) { + callback() + }) + f.Symlink(randomFileName(), randomFileName(), 0, 0) + }, + }, + { + name: "mkdir", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Mkdir(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.FileMode, p9.UID, p9.GID) { + callback() + }) + f.Mkdir(randomFileName(), 0, 0, 0) + }, + }, + { + name: "create", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend 
*Mock, f p9.File, callback func()) { + // Return an error for the creation operation, as this is the simplest. + backend.EXPECT().Create(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil, p9.QID{}, uint32(0), syscall.EINVAL).Do(func(string, p9.OpenFlags, p9.FileMode, p9.UID, p9.GID) { + callback() + }) + f.Create(randomFileName(), p9.ReadOnly, 0, 0, 0) }, }, } + globalExclusive := []concurrentFn{ + { + name: "remove", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + // Remove operates on a locked parent. So we + // add a child, walk to it and call remove. + // Note that because this operation can operate + // concurrently with itself, we need to + // generate a random file name. + randomFile := randomFileName() + backend.AddChild(randomFile, h.NewFile()) + defer backend.RemoveChild(randomFile) + _, file, err := f.Walk([]string{randomFile}) + if err != nil { + h.t.Fatalf("walk got %v, want nil", err) + } - // First, create a new server and connection. - serverSocket, clientSocket, err := unet.SocketPair(false) - if err != nil { - t.Fatalf("socketpair got err %v wanted nil", err) - } - defer clientSocket.Close() - server := p9.NewServer(a) - go server.Handle(serverSocket) - client, err := p9.NewClient(clientSocket, 1024*1024 /* 1M message size */, p9.HighestVersionString()) - if err != nil { - t.Fatalf("new client got err %v, wanted nil", err) - } + // Remove is automatically translated to the parent. + backend.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Do(func(string, uint32) { + callback() + }) - // Now, run through each of the test steps. - for _, step := range testSteps { - err := step.fn(client) - if err != step.want { - // Don't fail, just note this one step failed. - t.Errorf("step %q got %v wanted %v", step.name, err, step.want) - } - } -} + // Remove is also a close. + file.(deprecatedRemover).Remove() + }, + }, + { + name: "rename", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + // Similarly to remove, because we need to + // operate on a child, we allow a walk. + randomFile := randomFileName() + backend.AddChild(randomFile, h.NewFile()) + defer backend.RemoveChild(randomFile) + _, file, err := f.Walk([]string{randomFile}) + if err != nil { + h.t.Fatalf("walk got %v, want nil", err) + } + defer file.Close() + fileBackend := h.Pop(file) -func BenchmarkClient(b *testing.B) { - // Backend mock. - a := &AttachMock{ - File: &FileMock{ - ReadAtMock: ReadAtMock{N: 1}, + // Rename is automatically translated to the parent. + backend.EXPECT().RenameAt(gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.File, string) { + callback() + }) + + // Attempt the rename. + fileBackend.EXPECT().Renamed(gomock.Any(), gomock.Any()) + file.Rename(f, randomFileName()) + }, }, - } + { + name: "renameAt", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().RenameAt(gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.File, string) { + callback() + }) - // First, create a new server and connection. 
- serverSocket, clientSocket, err := unet.SocketPair(false) - if err != nil { - b.Fatalf("socketpair got err %v wanted nil", err) - } - defer clientSocket.Close() - server := p9.NewServer(a) - go server.Handle(serverSocket) - client, err := p9.NewClient(clientSocket, 1024*1024 /* 1M message size */, p9.HighestVersionString()) - if err != nil { - b.Fatalf("new client got %v, expected nil", err) + // Attempt the rename. There are no active fids + // with this name, so we don't need to expect + // Renamed hooks on anything. + f.RenameAt(randomFileName(), f, randomFileName()) + }, + }, } - // Attach to the server. - f, err := client.Attach("") - if err != nil { - b.Fatalf("error during attach, got %v wanted nil", err) + for _, fn1 := range readExclusive { + for _, fn2 := range readExclusive { + for name := range newTypeMap(nil) { + // Everything should be able to proceed in parallel. + concurrentTest(t, name, fn1, fn2, true, true) + concurrentTest(t, name, fn1, fn2, false, true) + } + } } - // Open the file. - if _, _, _, err := f.Open(p9.ReadOnly); err != nil { - b.Fatalf("error during open, got %v wanted nil", err) + for _, fn1 := range append(readExclusive, writeExclusive...) { + for _, fn2 := range writeExclusive { + for name := range newTypeMap(nil) { + // Only cross-directory functions should proceed in parallel. + concurrentTest(t, name, fn1, fn2, true, false) + concurrentTest(t, name, fn1, fn2, false, true) + } + } } - // Reset the clock. - b.ResetTimer() - - // Do N reads. - var buf [1]byte - for i := 0; i < b.N; i++ { - _, err := f.ReadAt(buf[:], 0) - if err != nil { - b.Fatalf("error during read %d, got %v wanted nil", i, err) + for _, fn1 := range append(append(readExclusive, writeExclusive...), globalExclusive...) { + for _, fn2 := range globalExclusive { + for name := range newTypeMap(nil) { + // Nothing should be able to run in parallel. + concurrentTest(t, name, fn1, fn2, true, false) + concurrentTest(t, name, fn1, fn2, false, false) + } } } } diff --git a/pkg/p9/p9test/mocks.go b/pkg/p9/p9test/mocks.go deleted file mode 100644 index 9a8c14975..000000000 --- a/pkg/p9/p9test/mocks.go +++ /dev/null @@ -1,489 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package p9test - -import ( - "gvisor.googlesource.com/gvisor/pkg/fd" - "gvisor.googlesource.com/gvisor/pkg/p9" -) - -// StatFSMock mocks p9.File.StatFS. -type StatFSMock struct { - Called bool - - // Return. - Stat p9.FSStat - Err error -} - -// StatFS implements p9.File.StatFS. -func (f *StatFSMock) StatFS() (p9.FSStat, error) { - f.Called = true - return f.Stat, f.Err -} - -// GetAttrMock mocks p9.File.GetAttr. -type GetAttrMock struct { - Called bool - - // Args. - Req p9.AttrMask - - // Return. - QID p9.QID - Valid p9.AttrMask - Attr p9.Attr - Err error -} - -// GetAttr implements p9.File.GetAttr. 
-func (g *GetAttrMock) GetAttr(req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - g.Called, g.Req = true, req - return g.QID, g.Valid, g.Attr, g.Err -} - -// WalkGetAttrMock mocks p9.File.WalkGetAttr. -type WalkGetAttrMock struct { - Called bool - - // Args. - Names []string - - // Return. - QIDs []p9.QID - File p9.File - Valid p9.AttrMask - Attr p9.Attr - Err error -} - -// WalkGetAttr implements p9.File.WalkGetAttr. -func (w *WalkGetAttrMock) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { - w.Called = true - w.Names = append(w.Names, names...) - return w.QIDs, w.File, w.Valid, w.Attr, w.Err -} - -// SetAttrMock mocks p9.File.SetAttr. -type SetAttrMock struct { - Called bool - - // Args. - Valid p9.SetAttrMask - Attr p9.SetAttr - - // Return. - Err error -} - -// SetAttr implements p9.File.SetAttr. -func (s *SetAttrMock) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { - s.Called, s.Valid, s.Attr = true, valid, attr - return s.Err -} - -// RemoveMock mocks p9.File.Remove. -type RemoveMock struct { - Called bool - - // Return. - Err error -} - -// Remove implements p9.File.Remove. -func (r *RemoveMock) Remove() error { - r.Called = true - return r.Err -} - -// OpenMock mocks p9.File.Open. -type OpenMock struct { - Called bool - - // Args. - Flags p9.OpenFlags - - // Return. - File *fd.FD - QID p9.QID - IOUnit uint32 - Err error -} - -// Open implements p9.File.Open. -func (o *OpenMock) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - o.Called, o.Flags = true, flags - return o.File, o.QID, o.IOUnit, o.Err -} - -// ReadAtMock mocks p9.File.ReadAt. -type ReadAtMock struct { - Called bool - - // Args. - P []byte - Offset uint64 - - // Return. - N int - Err error -} - -// ReadAt implements p9.File.ReadAt. -func (r *ReadAtMock) ReadAt(p []byte, offset uint64) (int, error) { - r.Called, r.P, r.Offset = true, p, offset - return r.N, r.Err -} - -// WriteAtMock mocks p9.File.WriteAt. -type WriteAtMock struct { - Called bool - - // Args. - P []byte - Offset uint64 - - // Return. - N int - Err error -} - -// WriteAt implements p9.File.WriteAt. -func (w *WriteAtMock) WriteAt(p []byte, offset uint64) (int, error) { - w.Called, w.P, w.Offset = true, p, offset - return w.N, w.Err -} - -// FSyncMock mocks p9.File.FSync. -type FSyncMock struct { - Called bool - - // Return. - Err error -} - -// FSync implements p9.File.FSync. -func (f *FSyncMock) FSync() error { - f.Called = true - return f.Err -} - -// MkdirMock mocks p9.File.Mkdir. -type MkdirMock struct { - Called bool - - // Args. - Name string - Permissions p9.FileMode - UID p9.UID - GID p9.GID - - // Return. - QID p9.QID - Err error -} - -// Mkdir implements p9.File.Mkdir. -func (s *MkdirMock) Mkdir(name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { - s.Called, s.Name, s.Permissions, s.UID, s.GID = true, name, permissions, uid, gid - return s.QID, s.Err -} - -// SymlinkMock mocks p9.File.Symlink. -type SymlinkMock struct { - Called bool - - // Args. - Oldname string - Newname string - UID p9.UID - GID p9.GID - - // Return. - QID p9.QID - Err error -} - -// Symlink implements p9.File.Symlink. -func (s *SymlinkMock) Symlink(oldname string, newname string, uid p9.UID, gid p9.GID) (p9.QID, error) { - s.Called, s.Oldname, s.Newname, s.UID, s.GID = true, oldname, newname, uid, gid - return s.QID, s.Err -} - -// MknodMock mocks p9.File.Mknod. -type MknodMock struct { - Called bool - - // Args. 
- Name string - Permissions p9.FileMode - Major uint32 - Minor uint32 - UID p9.UID - GID p9.GID - - // Return. - QID p9.QID - Err error -} - -// Mknod implements p9.File.Mknod. -func (m *MknodMock) Mknod(name string, permissions p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { - m.Called, m.Name, m.Permissions, m.Major, m.Minor, m.UID, m.GID = true, name, permissions, major, minor, uid, gid - return m.QID, m.Err -} - -// UnlinkAtMock mocks p9.File.UnlinkAt. -type UnlinkAtMock struct { - Called bool - - // Args. - Name string - Flags uint32 - - // Return. - Err error -} - -// UnlinkAt implements p9.File.UnlinkAt. -func (u *UnlinkAtMock) UnlinkAt(name string, flags uint32) error { - u.Called, u.Name, u.Flags = true, name, flags - return u.Err -} - -// ReaddirMock mocks p9.File.Readdir. -type ReaddirMock struct { - Called bool - - // Args. - Offset uint64 - Count uint32 - - // Return. - Dirents []p9.Dirent - Err error -} - -// Readdir implements p9.File.Readdir. -func (r *ReaddirMock) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { - r.Called, r.Offset, r.Count = true, offset, count - return r.Dirents, r.Err -} - -// ReadlinkMock mocks p9.File.Readlink. -type ReadlinkMock struct { - Called bool - - // Return. - Target string - Err error -} - -// Readlink implements p9.File.Readlink. -func (r *ReadlinkMock) Readlink() (string, error) { - r.Called = true - return r.Target, r.Err -} - -// AttachMock mocks p9.Attacher.Attach. -type AttachMock struct { - Called bool - - // Return. - File p9.File - Err error -} - -// Attach implements p9.Attacher.Attach. -func (a *AttachMock) Attach() (p9.File, error) { - a.Called = true - return a.File, a.Err -} - -// WalkMock mocks p9.File.Walk. -type WalkMock struct { - Called bool - - // Args. - Names []string - - // Return. - QIDs []p9.QID - File p9.File - Err error -} - -// Walk implements p9.File.Walk. -func (w *WalkMock) Walk(names []string) ([]p9.QID, p9.File, error) { - w.Called = true - w.Names = append(w.Names, names...) - return w.QIDs, w.File, w.Err -} - -// RenameMock mocks p9.File.Rename. -type RenameMock struct { - Called bool - - // Args. - Directory p9.File - Name string - - // Return. - Err error -} - -// Rename implements p9.File.Rename. -func (r *RenameMock) Rename(directory p9.File, name string) error { - r.Called, r.Directory, r.Name = true, directory, name - return r.Err -} - -// CloseMock mocks p9.File.Close. -type CloseMock struct { - Called bool - - // Return. - Err error -} - -// Close implements p9.File.Close. -func (d *CloseMock) Close() error { - d.Called = true - return d.Err -} - -// CreateMock mocks p9.File.Create. -type CreateMock struct { - Called bool - - // Args. - Name string - Flags p9.OpenFlags - Permissions p9.FileMode - UID p9.UID - GID p9.GID - - // Return. - HostFile *fd.FD - File p9.File - QID p9.QID - IOUnit uint32 - Err error -} - -// Create implements p9.File.Create. -func (c *CreateMock) Create(name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { - c.Called, c.Name, c.Flags, c.Permissions, c.UID, c.GID = true, name, flags, permissions, uid, gid - return c.HostFile, c.File, c.QID, c.IOUnit, c.Err -} - -// LinkMock mocks p9.File.Link. -type LinkMock struct { - Called bool - - // Args. - Target p9.File - Newname string - - // Return. - Err error -} - -// Link implements p9.File.Link. 
-func (l *LinkMock) Link(target p9.File, newname string) error { - l.Called, l.Target, l.Newname = true, target, newname - return l.Err -} - -// RenameAtMock mocks p9.File.RenameAt. -type RenameAtMock struct { - Called bool - - // Args. - Oldname string - Newdir p9.File - Newname string - - // Return. - Err error -} - -// RenameAt implements p9.File.RenameAt. -func (r *RenameAtMock) RenameAt(oldname string, newdir p9.File, newname string) error { - r.Called, r.Oldname, r.Newdir, r.Newname = true, oldname, newdir, newname - return r.Err -} - -// FlushMock mocks p9.File.Flush. -type FlushMock struct { - Called bool - - // Return. - Err error -} - -// Flush implements p9.File.Flush. -func (f *FlushMock) Flush() error { - return f.Err -} - -// ConnectMock mocks p9.File.Connect. -type ConnectMock struct { - Called bool - - // Args. - Flags p9.ConnectFlags - - // Return. - File *fd.FD - Err error -} - -// Connect implements p9.File.Connect. -func (o *ConnectMock) Connect(flags p9.ConnectFlags) (*fd.FD, error) { - o.Called, o.Flags = true, flags - return o.File, o.Err -} - -// FileMock mocks p9.File. -type FileMock struct { - WalkMock - WalkGetAttrMock - StatFSMock - GetAttrMock - SetAttrMock - RemoveMock - RenameMock - CloseMock - OpenMock - ReadAtMock - WriteAtMock - FSyncMock - CreateMock - MkdirMock - SymlinkMock - LinkMock - MknodMock - RenameAtMock - UnlinkAtMock - ReaddirMock - ReadlinkMock - FlushMock - ConnectMock -} - -var ( - _ p9.File = &FileMock{} -) diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go new file mode 100644 index 000000000..417b55950 --- /dev/null +++ b/pkg/p9/p9test/p9test.go @@ -0,0 +1,329 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package p9test provides standard mocks for p9. +package p9test + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + "testing" + + "github.com/golang/mock/gomock" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// Harness is an attacher mock. +type Harness struct { + t *testing.T + mockCtrl *gomock.Controller + Attacher *MockAttacher + wg sync.WaitGroup + clientSocket *unet.Socket + mu sync.Mutex + created []*Mock +} + +// globalPath is a QID.Path Generator. +var globalPath uint64 + +// MakePath returns a globally unique path. +func MakePath() uint64 { + return atomic.AddUint64(&globalPath, 1) +} + +// Generator is a function that generates a new file. +type Generator func(parent *Mock) *Mock + +// Mock is a common mock element. +type Mock struct { + p9.DefaultWalkGetAttr + *MockFile + parent *Mock + closed bool + harness *Harness + QID p9.QID + Attr p9.Attr + children map[string]Generator + + // WalkCallback is a special function that will be called from within + // the walk context. This is needed for the concurrent tests within + // this package. + WalkCallback func() error +} + +// globalMu protects the children maps in all mocks. 
Note that this is not a +// particularly elegant solution, but because the test has walks from the root +// through to final nodes, we must share maps below, and it's easiest to simply +// protect against concurrent access globally. +var globalMu sync.RWMutex + +// AddChild adds a new child to the Mock. +func (m *Mock) AddChild(name string, generator Generator) { + globalMu.Lock() + defer globalMu.Unlock() + m.children[name] = generator +} + +// RemoveChild removes the child with the given name. +func (m *Mock) RemoveChild(name string) { + globalMu.Lock() + defer globalMu.Unlock() + delete(m.children, name) +} + +// Matches implements gomock.Matcher.Matches. +func (m *Mock) Matches(x interface{}) bool { + if om, ok := x.(*Mock); ok { + return m.QID.Path == om.QID.Path + } + return false +} + +// String implements gomock.Matcher.String. +func (m *Mock) String() string { + return fmt.Sprintf("Mock{Mode: 0x%x, QID.Path: %d}", m.Attr.Mode, m.QID.Path) +} + +// GetAttr returns the current attributes. +func (m *Mock) GetAttr(mask p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + return m.QID, p9.AttrMaskAll(), m.Attr, nil +} + +// Walk supports clone and walking in directories. +func (m *Mock) Walk(names []string) ([]p9.QID, p9.File, error) { + if m.WalkCallback != nil { + if err := m.WalkCallback(); err != nil { + return nil, nil, err + } + } + if len(names) == 0 { + // Clone the file appropriately. + nm := m.harness.NewMock(m.parent, m.QID.Path, m.Attr) + nm.children = m.children // Inherit children. + return []p9.QID{nm.QID}, nm, nil + } else if len(names) != 1 { + m.harness.t.Fail() // Should not happen. + return nil, nil, syscall.EINVAL + } + + if m.Attr.Mode.IsDir() { + globalMu.RLock() + defer globalMu.RUnlock() + if fn, ok := m.children[names[0]]; ok { + // Generate the child. + nm := fn(m) + return []p9.QID{nm.QID}, nm, nil + } + // No child found. + return nil, nil, syscall.ENOENT + } + + // Call the underlying mock. + return m.MockFile.Walk(names) +} + +// WalkGetAttr calls the default implementation; this is a client-side optimization. +func (m *Mock) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { + return m.DefaultWalkGetAttr.WalkGetAttr(names) +} + +// Pop pops off the most recently created Mock and asserts that this mock +// represents the same file passed in. If nil is passed in, no check is +// performed. +// +// Precondition: there must be at least one Mock or this will panic. +func (h *Harness) Pop(clientFile p9.File) *Mock { + h.mu.Lock() + defer h.mu.Unlock() + + if clientFile == nil { + // If no clientFile is provided, then we always return the last + // created file. The caller can safely use this as long as + // there is no concurrency. + m := h.created[len(h.created)-1] + h.created = h.created[:len(h.created)-1] + return m + } + + qid, _, _, err := clientFile.GetAttr(p9.AttrMaskAll()) + if err != nil { + // We do not expect this to happen. + panic(fmt.Sprintf("err during Pop: %v", err)) + } + + // Find the relevant file in our created list. We must scan the list + // from back to front to ensure that we favor the most recently + // generated file. + for i := len(h.created) - 1; i >= 0; i-- { + m := h.created[i] + if qid.Path == m.QID.Path { + // Copy and truncate. + copy(h.created[i:], h.created[i+1:]) + h.created = h.created[:len(h.created)-1] + return m + } + } + + // Unable to find relevant file. + panic(fmt.Sprintf("unable to locate file with QID %+v", qid.Path)) +} + +// NewMock returns a new base file.
+func (h *Harness) NewMock(parent *Mock, path uint64, attr p9.Attr) *Mock { + m := &Mock{ + MockFile: NewMockFile(h.mockCtrl), + parent: parent, + harness: h, + QID: p9.QID{ + Type: p9.QIDType((attr.Mode & p9.FileModeMask) >> 12), + Path: path, + }, + Attr: attr, + } + + // Always ensure Close is after the parent's close. Note that this + // can't be done via a straightforward After call, because the parent + // might change after initial creation. We ensure that this is true at + // close time. + m.EXPECT().Close().Return(nil).Times(1).Do(func() { + if m.parent != nil && m.parent.closed { + h.t.FailNow() + } + // Note that this should not be racy, as this operation should + // be protected by the Times(1) above first. + m.closed = true + }) + + // Remember what was created. + h.mu.Lock() + defer h.mu.Unlock() + h.created = append(h.created, m) + + return m +} + +// NewFile returns a new file mock. +// +// Note that ReadAt and WriteAt must be mocked separately. +func (h *Harness) NewFile() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeRegular}) + } +} + +// NewDirectory returns a new mock directory. +// +// Note that Mkdir, Link, Mknod, RenameAt, UnlinkAt and Readdir must be mocked +// separately. Walk is provided and children may be manipulated via AddChild +// and RemoveChild. After calling Walk remotely, one can use Pop to find the +// corresponding backend mock on the server side. +func (h *Harness) NewDirectory(contents map[string]Generator) Generator { + return func(parent *Mock) *Mock { + m := h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeDirectory}) + m.children = contents // Save contents. + return m + } +} + +// NewSymlink returns a new mock symlink. +// +// Note that Readlink must be mocked separately. +func (h *Harness) NewSymlink() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeSymlink}) + } +} + +// NewBlockDevice returns a new mock block device. +func (h *Harness) NewBlockDevice() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeBlockDevice}) + } +} + +// NewCharacterDevice returns a new mock character device. +func (h *Harness) NewCharacterDevice() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeCharacterDevice}) + } +} + +// NewNamedPipe returns a new mock named pipe. +func (h *Harness) NewNamedPipe() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeNamedPipe}) + } +} + +// NewSocket returns a new mock socket. +func (h *Harness) NewSocket() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeSocket}) + } +} + +// Finish completes all checks and shuts down the server. +func (h *Harness) Finish() { + h.clientSocket.Close() + h.wg.Wait() + h.mockCtrl.Finish() +} + +// NewHarness creates and returns a new test server. +// +// It should always be used as: +// +// h, c := NewHarness(t) +// defer h.Finish() +// +func NewHarness(t *testing.T) (*Harness, *p9.Client) { + // Create the mock. + mockCtrl := gomock.NewController(t) + h := &Harness{ + t: t, + mockCtrl: mockCtrl, + Attacher: NewMockAttacher(mockCtrl), + } + + // Make socket pair. + serverSocket, clientSocket, err := unet.SocketPair(false) + if err != nil { + t.Fatalf("socketpair got err %v wanted nil", err) + } + + // Start the server, synchronized on exit.
+ server := p9.NewServer(h.Attacher) + h.wg.Add(1) + go func() { + defer h.wg.Done() + server.Handle(serverSocket) + }() + + // Create the client. + client, err := p9.NewClient(clientSocket, 1024, p9.HighestVersionString()) + if err != nil { + serverSocket.Close() + clientSocket.Close() + t.Fatalf("new client got %v, expected nil", err) + return nil, nil // Never hit. + } + + // Capture the client socket. + h.clientSocket = clientSocket + return h, client +} diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go new file mode 100644 index 000000000..97f90bcd5 --- /dev/null +++ b/pkg/p9/path_tree.go @@ -0,0 +1,109 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package p9 + +import ( + "fmt" + "sync" +) + +// pathNode is a single node in a path traversal. +// +// These are shared by all fidRefs that point to the same path. +// +// These are not synchronized because we allow certain operations (file walk) +// to proceed without having to acquire a write lock. The lock in this +// structure exists to synchronize high-level, semantic operations, such as the +// simultaneous creation and deletion of a file. +// +// (+) below is the path component string. +type pathNode struct { + mu sync.RWMutex // See above. + fidRefs sync.Map // => map[*fidRef]string(+) + children sync.Map // => map[string(+)]*pathNode + count int64 +} + +// pathNodeFor returns the path node for the given name, or a new one. +// +// Precondition: mu must be held in a readable fashion. +func (p *pathNode) pathNodeFor(name string) *pathNode { + // Load the existing path node. + if pn, ok := p.children.Load(name); ok { + return pn.(*pathNode) + } + + // Create a new pathNode for shared use. + pn, _ := p.children.LoadOrStore(name, new(pathNode)) + return pn.(*pathNode) +} + +// nameFor returns the name for the given fidRef. +// +// Precondition: mu must be held in a readable fashion. +func (p *pathNode) nameFor(ref *fidRef) string { + if s, ok := p.fidRefs.Load(ref); ok { + return s.(string) + } + + // This should not happen, don't proceed. + panic(fmt.Sprintf("expected name for %+v, none found", ref)) +} + +// addChild adds a child to the given pathNode. +// +// This applies only to an individual fidRef. +// +// Precondition: mu must be held in a writable fashion. +func (p *pathNode) addChild(ref *fidRef, name string) { + if s, ok := p.fidRefs.Load(ref); ok { + // This should not happen, don't proceed. + panic(fmt.Sprintf("unexpected fidRef %+v with path %q, wanted %q", ref, s, name)) + } + + p.fidRefs.Store(ref, name) +} + +// removeChild removes the given child. +// +// This applies only to an individual fidRef. +// +// Precondition: mu must be held in a writable fashion. +func (p *pathNode) removeChild(ref *fidRef) { + p.fidRefs.Delete(ref) +} + +// removeWithName removes all references with the given name. +// +// The original pathNode is returned by this function, and removed from this +// pathNode. Any operations on the removed tree must use this value. 
+// +// The provided function is executed after removal. +// +// Precondition: mu must be held in a writable fashion. +func (p *pathNode) removeWithName(name string, fn func(ref *fidRef)) *pathNode { + p.fidRefs.Range(func(key, value interface{}) bool { + if value.(string) == name { + p.fidRefs.Delete(key) + fn(key.(*fidRef)) + } + return true + }) + + // Return the original path node. + origPathNode := p.pathNodeFor(name) + p.children.Delete(name) + return origPathNode +} diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 5c7cb18c8..3ef151595 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -15,6 +15,8 @@ package p9 import ( + "io" + "runtime/debug" "sync" "sync/atomic" "syscall" @@ -27,6 +29,19 @@ import ( type Server struct { // attacher provides the attach function. attacher Attacher + + // pathTree is the full set of paths opened on this server. + // + // These may be across different connections, but rename operations + // must be serialized globally for safety. There is a single pathTree + // for the entire server, and not per connection. + pathTree pathNode + + // renameMu is a global lock protecting rename operations. With this + // lock, we can be certain that any given rename operation can safely + // acquire two path nodes in any order, as all other concurrent + // operations acquire at most a single node. + renameMu sync.RWMutex } // NewServer returns a new server. @@ -81,6 +96,9 @@ type connState struct { // fidRef wraps a node and tracks references. type fidRef struct { + // server is the associated server. + server *Server + // file is the associated File. file File @@ -97,13 +115,39 @@ type fidRef struct { // This is updated in handlers.go. opened bool - // walkable indicates this fidRef may be walked. - walkable bool + // mode is the fidRef's mode from the walk. Only the type bits are + // valid; the permissions may change. This is used to sanity check + // operations on this element, and prevent walks across + // non-directories. + mode FileMode // openFlags is the mode used in the open. // // This is updated in handlers.go. openFlags OpenFlags + + // pathNode is the current pathNode for this FID. + pathNode *pathNode + + // parent is the parent fidRef. We hold on to a parent reference to + // ensure that hooks, such as Renamed, can be executed safely by the + // server code. + // + // Note that parent cannot be changed without holding both the global + // rename lock and a writable lock on the associated pathNode for this + // fidRef. Holding either of these locks is sufficient to examine + // parent safely. + // + // The parent will be nil for root fidRefs, and non-nil otherwise. The + // method maybeParent can be used to return a cyclical reference, and + // isRoot should be used to check for root rather than looking at parent + // directly. + parent *fidRef + + // deleted indicates that the backing file has been deleted. We stop + // many operations at the API level if they are incompatible with a + // file that has already been unlinked. + deleted uint32 } // OpenFlags returns the flags the file was opened with and true iff the fid was opened previously. @@ -113,13 +157,146 @@ func (f *fidRef) OpenFlags() (OpenFlags, bool) { return f.openFlags, f.opened } +// IncRef increases the references on a fid. +func (f *fidRef) IncRef() { + atomic.AddInt64(&f.refs, 1) +} + // DecRef should be called when you're finished with a fid. func (f *fidRef) DecRef() { if atomic.AddInt64(&f.refs, -1) == 0 { f.file.Close() + + // Drop the parent reference.
+ // + // Since this fidRef is guaranteed to be non-discoverable when + // the references reach zero, we don't need to worry about + // clearing the parent. + if f.parent != nil { + // If we've been previously deleted, this removing this + // ref is a no-op. That's expected. + f.parent.pathNode.removeChild(f) + f.parent.DecRef() + } } } +// isDeleted returns true if this fidRef has been deleted. +func (f *fidRef) isDeleted() bool { + return atomic.LoadUint32(&f.deleted) != 0 +} + +// isRoot indicates whether this is a root fid. +func (f *fidRef) isRoot() bool { + return f.parent == nil +} + +// maybeParent returns a cyclic reference for roots, and the parent otherwise. +func (f *fidRef) maybeParent() *fidRef { + if f.parent != nil { + return f.parent + } + return f // Root has itself. +} + +// notifyDelete marks all fidRefs as deleted. +// +// Precondition: the write lock must be held on the given pathNode. +func notifyDelete(pn *pathNode) { + // Call on all local references. + pn.fidRefs.Range(func(key, _ interface{}) bool { + ref := key.(*fidRef) + atomic.StoreUint32(&ref.deleted, 1) + return true + }) + + // Call on all subtrees. + pn.children.Range(func(_, value interface{}) bool { + notifyDelete(value.(*pathNode)) + return true + }) +} + +// markChildDeleted marks all children below the given name as deleted. +// +// Precondition: this must be called via safelyWrite or safelyGlobal. +func (f *fidRef) markChildDeleted(name string) { + origPathNode := f.pathNode.removeWithName(name, func(ref *fidRef) { + atomic.StoreUint32(&ref.deleted, 1) + }) + + // Mark everything below as deleted. + notifyDelete(origPathNode) +} + +// notifyNameChange calls the relevant Renamed method on all nodes in the path, +// recursively. Note that this applies only for subtrees, as these +// notifications do not apply to the actual file whose name has changed. +// +// Precondition: the write lock must be held on the given pathNode. +func notifyNameChange(pn *pathNode) { + // Call on all local references. + pn.fidRefs.Range(func(key, value interface{}) bool { + ref := key.(*fidRef) + name := value.(string) + ref.file.Renamed(ref.parent.file, name) + return true + }) + + // Call on all subtrees. + pn.children.Range(func(_, value interface{}) bool { + notifyNameChange(value.(*pathNode)) + return true + }) +} + +// renameChildTo renames the given child to the target. +// +// Precondition: this must be called via safelyGlobal. +func (f *fidRef) renameChildTo(oldName string, target *fidRef, newName string) { + target.markChildDeleted(newName) + origPathNode := f.pathNode.removeWithName(oldName, func(ref *fidRef) { + ref.parent.DecRef() // Drop original reference. + ref.parent = target // Change parent. + ref.parent.IncRef() // Acquire new one. + target.pathNode.addChild(ref, newName) + ref.file.Renamed(target.file, newName) + }) + + // Replace the previous (now deleted) path node. + f.pathNode.children.Store(newName, origPathNode) + + // Call Renamed on everything above. + notifyNameChange(origPathNode) +} + +// safelyRead executes the given operation with the local path node locked. +// This implies that paths will not change during the operation. +func (f *fidRef) safelyRead(fn func() error) (err error) { + f.server.renameMu.RLock() + defer f.server.renameMu.RUnlock() + f.pathNode.mu.RLock() + defer f.pathNode.mu.RUnlock() + return fn() +} + +// safelyWrite executes the given operation with the local path node locked in +// a writable fashion. This implies some paths may change. 
+func (f *fidRef) safelyWrite(fn func() error) (err error) { + f.server.renameMu.RLock() + defer f.server.renameMu.RUnlock() + f.pathNode.mu.Lock() + defer f.pathNode.mu.Unlock() + return fn() +} + +// safelyGlobal executes the given operation with the global path lock held. +func (f *fidRef) safelyGlobal(fn func() error) (err error) { + f.server.renameMu.Lock() + defer f.server.renameMu.Unlock() + return fn() +} + // LookupFID finds the given FID. // // You should call fid.DecRef when you are finished using the fid. @@ -128,7 +305,7 @@ func (cs *connState) LookupFID(fid FID) (*fidRef, bool) { defer cs.fidMu.Unlock() fidRef, ok := cs.fids[fid] if ok { - atomic.AddInt64(&fidRef.refs, 1) + fidRef.IncRef() return fidRef, true } return nil, false @@ -145,7 +322,7 @@ func (cs *connState) InsertFID(fid FID, newRef *fidRef) { if ok { defer origRef.DecRef() } - atomic.AddInt64(&newRef.refs, 1) + newRef.IncRef() cs.fids[fid] = newRef } @@ -229,10 +406,9 @@ func (cs *connState) handleRequest() { cs.recvDone <- nil // Deal with other errors. - if err != nil { + if err != nil && err != io.EOF { // If it's not a connection error, but some other protocol error, // we can send a response immediately. - log.Debugf("err [%05d] %v", tag, err) cs.sendMu.Lock() err := send(cs.conn, tag, newErr(err)) cs.sendMu.Unlock() @@ -243,12 +419,38 @@ func (cs *connState) handleRequest() { // Try to start the tag. if !cs.StartTag(tag) { // Nothing we can do at this point; client is bogus. + log.Debugf("no valid tag [%05d]", tag) cs.sendDone <- ErrNoValidMessage return } // Handle the message. - var r message + var r message // r is the response. + defer func() { + if r == nil { + // Don't allow a panic to propagate. + recover() + + // Include a useful log message. + log.Warningf("panic in handler: %s", debug.Stack()) + + // Wrap in an EFAULT error; we don't really have a + // better way to describe this kind of error. It will + // usually manifest as a result of the test framework. + r = newErr(syscall.EFAULT) + } + + // Clear the tag before sending. That's because as soon as this + // hits the wire, the client can legally send another message + // with the same tag. + cs.ClearTag(tag) + + // Send back the result. + cs.sendMu.Lock() + err = send(cs.conn, tag, r) + cs.sendMu.Unlock() + cs.sendDone <- err + }() if handler, ok := m.(handler); ok { // Call the message handler. r = handler.handle(cs) @@ -256,18 +458,6 @@ func (cs *connState) handleRequest() { // Produce an ENOSYS error. r = newErr(syscall.ENOSYS) } - - // Clear the tag before sending. That's because as soon - // as this hits the wire, the client can legally send - // another message with the same tag. - cs.ClearTag(tag) - - // Send back the result. - cs.sendMu.Lock() - err = send(cs.conn, tag, r) - cs.sendMu.Unlock() - cs.sendDone <- err - return } func (cs *connState) handleRequests() { diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index 97396806c..bafb377de 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -167,7 +167,7 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message, r.EnableFDs(1) n, err := r.ReadVec([][]byte{hdr[:]}) - if err != nil { + if err != nil && (n == 0 || err != io.EOF) { r.CloseFDs() return NoTag, nil, ErrSocket{err} } @@ -189,10 +189,8 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message, // Continuing reading for a short header. 
for n < int(headerLength) { cur, err := r.ReadVec([][]byte{hdr[n:]}) - if err != nil { + if err != nil && (cur == 0 || err != io.EOF) { return NoTag, nil, ErrSocket{err} - } else if cur == 0 { - return NoTag, nil, ErrSocket{io.EOF} } n += cur } @@ -296,10 +294,8 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message, r := s.Reader(true) for n := 0; n < int(remaining); { cur, err := r.ReadVec(vecs) - if err != nil { + if err != nil && (cur == 0 || err != io.EOF) { return NoTag, nil, ErrSocket{err} - } else if cur == 0 { - return NoTag, nil, ErrSocket{io.EOF} } n += cur diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index 97b9ba3ff..0c9efc709 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "rand", srcs = [ diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index 1975d17a6..657f923ed 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "victim", testonly = 1, diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD index 0ed38c64a..29f751725 100644 --- a/pkg/secio/BUILD +++ b/pkg/secio/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "secio", srcs = [ diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 314b3e962..9bf04360a 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") load("//tools/go_stateify:defs.bzl", "go_library") go_library( diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD index 2a7a6df23..02d24defd 100644 --- a/pkg/sentry/context/BUILD +++ b/pkg/sentry/context/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "context", srcs = ["context.go"], diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index fbdde0721..c3b682d6f 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "control", srcs = [ diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 69c99b0b3..bebdb2939 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "device", srcs = ["device.go"], diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index ff4ab850a..4bd912e95 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "anon", srcs = [ 
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index cef01829a..c9e531e40 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -56,14 +56,10 @@ go_test( srcs = ["gofer_test.go"], embed = [":gofer"], deps = [ - "//pkg/log", "//pkg/p9", "//pkg/p9/p9test", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/kernel/time", - "//pkg/sentry/usermem", - "//pkg/unet", ], ) diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index a0265c2aa..455953237 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -58,13 +58,6 @@ func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9 return c.file.SetAttr(valid, attr) } -func (c *contextFile) remove(ctx context.Context) error { - ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Remove() -} - func (c *contextFile) rename(ctx context.Context, directory contextFile, name string) error { ctx.UninterruptibleSleepStart(false) defer ctx.UninterruptibleSleepFinish(false) diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 3190d1e18..b450778ca 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -16,110 +16,102 @@ package gofer import ( "fmt" - "io" "syscall" "testing" "time" - "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/p9/p9test" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/unet" ) -// goodMockFile returns a file that can be Walk'ed to and created. -func goodMockFile(mode p9.FileMode, size uint64) *p9test.FileMock { - return &p9test.FileMock{ - GetAttrMock: p9test.GetAttrMock{ - Attr: p9.Attr{Mode: mode, Size: size, RDev: 0}, - Valid: p9.AttrMaskAll(), - }, - } -} - -func newClosedSocket() (*unet.Socket, error) { - fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) - if err != nil { - return nil, err - } - - s, err := unet.NewSocket(fd) - if err != nil { - syscall.Close(fd) - return nil, err - } - - return s, s.Close() -} - -// root returns a p9 file mock and an fs.InodeOperations created from that file. Any -// functions performed on fs.InodeOperations will use the p9 file mock. -func root(ctx context.Context, cp cachePolicy, mode p9.FileMode, size uint64) (*p9test.FileMock, *fs.Inode, error) { - sock, err := newClosedSocket() - if err != nil { - return nil, nil, err - } - - // Construct a dummy session that we can destruct. - s := &session{ - conn: sock, - mounter: fs.RootOwner, - cachePolicy: cp, - client: &p9.Client{}, - } - - rootFile := goodMockFile(mode, size) - sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr, false /* socket */) - m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) - return rootFile, fs.NewInode(rootInodeOperations, m, sattr), nil +// rootTest runs a test with a p9 mock and an fs.InodeOperations created from +// the attached root directory. The root file will be closed and client +// disconnected, but additional files must be closed manually. 
+func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context, *p9test.Harness, *p9test.Mock, *fs.Inode)) { + t.Run(name, func(t *testing.T) { + h, c := p9test.NewHarness(t) + defer h.Finish() + + // Create a new root. Note that we pass an empty, but non-nil + // map here. This allows tests to extend the root children + // dynamically. + root := h.NewDirectory(map[string]p9test.Generator{})(nil) + + // Return this as the root. + h.Attacher.EXPECT().Attach().Return(root, nil).Times(1) + + // ... and open via the client. + rootFile, err := c.Attach("/") + if err != nil { + t.Fatalf("unable to attach: %v", err) + } + defer rootFile.Close() + + // Wrap in a session. + s := &session{ + mounter: fs.RootOwner, + cachePolicy: cp, + client: c, + } + + // ... and an Inode, with only the mode being explicitly valid for now. + ctx := contexttest.Context(t) + sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{ + file: rootFile, + }, root.QID, p9.AttrMaskAll(), root.Attr, false /* socket */) + m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) + rootInode := fs.NewInode(rootInodeOperations, m, sattr) + + // Ensure that the cache is fully invalidated, so that any + // close actions actually take place before the full harness is + // torn down. + defer m.FlushDirentRefs() + + // Execute the test. + fn(ctx, h, root, rootInode) + }) } func TestLookup(t *testing.T) { - // Test parameters. type lookupTest struct { // Name of the test. name string - // Function input parameters. - fileName string - // Expected return value. want error } tests := []lookupTest{ { - name: "mock Walk passes (function succeeds)", - fileName: "ppp", - want: nil, + name: "mock Walk passes (function succeeds)", + want: nil, }, { - name: "mock Walk fails (function fails)", - fileName: "ppp", - want: syscall.ENOENT, + name: "mock Walk fails (function fails)", + want: syscall.ENOENT, }, } - ctx := contexttest.Context(t) + const file = "file" // The walked target file. + for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) + rootTest(t, test.name, cacheNone, func(ctx context.Context, h *p9test.Harness, rootFile *p9test.Mock, rootInode *fs.Inode) { + // Set up the appropriate result. + rootFile.WalkCallback = func() error { + return test.want + } + if test.want == nil { + // Set the contents of the root. We expect a + // normal file generator for the walked file above. This is + // overridden by the WalkCallback set on the mock. + rootFile.AddChild(file, h.NewFile()) } - - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.Err = test.want - rootFile.WalkGetAttrMock.File = goodMockFile(p9.PermissionsMask, 0) // Call function. - dirent, err := rootInode.Lookup(ctx, test.fileName) + dirent, err := rootInode.Lookup(ctx, file) // Unwrap the InodeOperations. var newInodeOperations fs.InodeOperations @@ -138,19 +130,12 @@ func TestLookup(t *testing.T) { if err == nil && newInodeOperations == nil { t.Errorf("Lookup got non-nil err and non-nil node, wanted at least one non-nil") } - - // Check mock parameters.
- if !rootFile.WalkGetAttrMock.Called { - t.Errorf("GetAttr not called; error: %v", err) - } else if rootFile.WalkGetAttrMock.Names[0] != test.fileName { - t.Errorf("file name not set") - } }) } } func TestRevalidation(t *testing.T) { - tests := []struct { + type revalidationTest struct { cachePolicy cachePolicy // Whether dirent should be reloaded before any modifications. @@ -167,7 +152,9 @@ func TestRevalidation(t *testing.T) { // Whether dirent should be reloaded after the remote has // removed the file. postRemovalWantReload bool - }{ + } + + tests := []revalidationTest{ { // Policy cacheNone causes Revalidate to always return // true. @@ -208,67 +195,83 @@ func TestRevalidation(t *testing.T) { }, } - ctx := contexttest.Context(t) + const file = "file" // The file walked below. + for _, test := range tests { name := fmt.Sprintf("cachepolicy=%s", test.cachePolicy) - t.Run(name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, test.cachePolicy, p9.ModeDirectory|p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - + rootTest(t, name, test.cachePolicy, func(ctx context.Context, h *p9test.Harness, rootFile *p9test.Mock, rootInode *fs.Inode) { + // Wrap in a dirent object. rootDir := fs.NewDirent(rootInode, "root") - // Create a mock file that we will walk to from the root. - const ( - name = "foo" - mode = p9.PermissionsMask - ) - file := goodMockFile(mode, 0) - file.GetAttrMock.Valid = p9.AttrMaskAll() - - // Tell the root mock how to walk to this file. - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.File = file - rootFile.WalkGetAttrMock.Attr = file.GetAttrMock.Attr - rootFile.WalkGetAttrMock.Valid = file.GetAttrMock.Valid + // Create a mock file as a child of the root. We save when + // this is generated, so that when the time is changed, we + // can update the original entry. + var origMocks []*p9test.Mock + rootFile.AddChild(file, func(parent *p9test.Mock) *p9test.Mock { + // Return a regular file that has a consistent + // path number. This might be used by + // validation so we don't change it. + m := h.NewMock(parent, 0, p9.Attr{ + Mode: p9.ModeRegular, + }) + origMocks = append(origMocks, m) + return m + }) // Do the walk. - dirent, err := rootDir.Walk(ctx, rootDir, name) + dirent, err := rootDir.Walk(ctx, rootDir, file) if err != nil { - t.Fatalf("Lookup(%q) failed: %v", name, err) + t.Fatalf("Lookup failed: %v", err) } - // Walk again. Depending on the cache policy, we may get a new - // dirent. - newDirent, err := rootDir.Walk(ctx, rootDir, name) + // We must release the dirent, or the test will fail + // with a reference leak. This is tracked by p9test. + defer dirent.DecRef() + + // Walk again. Depending on the cache policy, we may + // get a new dirent.
+ newDirent, err := rootDir.Walk(ctx, rootDir, file) if err != nil { - t.Fatalf("Lookup(%q) failed: %v", name, err) + t.Fatalf("Lookup failed: %v", err) } if test.preModificationWantReload && dirent == newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) + t.Errorf("Lookup with cachePolicy=%s got old dirent %+v, wanted a new dirent", test.cachePolicy, dirent) } if !test.preModificationWantReload && dirent != newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) + t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent) } + newDirent.DecRef() // See above. - // Modify the underlying mocked file's modification time. + // Modify the underlying mocked file's modification + // time for the next walk that occurs. nowSeconds := time.Now().Unix() - rootFile.WalkGetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) - file.GetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) + rootFile.AddChild(file, func(parent *p9test.Mock) *p9test.Mock { + // Ensure that the path is the same as above, + // but we change only the modification time of + // the file. + return h.NewMock(parent, 0, p9.Attr{ + Mode: p9.ModeRegular, + MTimeSeconds: uint64(nowSeconds), + }) + }) + + // We also modify the original time, so that GetAttr + // behaves as expected for the caching case. + for _, m := range origMocks { + m.Attr.MTimeSeconds = uint64(nowSeconds) + } - // Walk again. Depending on the cache policy, we may get a new - // dirent. - newDirent, err = rootDir.Walk(ctx, rootDir, name) + // Walk again. Depending on the cache policy, we may + // get a new dirent. + newDirent, err = rootDir.Walk(ctx, rootDir, file) if err != nil { - t.Fatalf("Lookup(%q) failed: %v", name, err) + t.Fatalf("Lookup failed: %v", err) } if test.postModificationWantReload && dirent == newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) + t.Errorf("Lookup with cachePolicy=%s got old dirent, wanted a new dirent", test.cachePolicy) } if !test.postModificationWantReload && dirent != newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) + t.Errorf("Lookup with cachePolicy=%s got new dirent, wanted old dirent", test.cachePolicy) } uattrs, err := newDirent.Inode.UnstableAttr(ctx) if err != nil { @@ -276,660 +279,25 @@ func TestRevalidation(t *testing.T) { } gotModTimeSeconds := uattrs.ModificationTime.Seconds() if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { - t.Fatalf("Lookup(%q) with cachePolicy=%s got new modification time %v, wanted %v", name, test.cachePolicy, gotModTimeSeconds, nowSeconds) + t.Fatalf("Lookup with cachePolicy=%s got new modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds) } + newDirent.DecRef() // See above. - // Make WalkGetAttr return ENOENT. This simulates - // removing the file from the remote fs. - rootFile.WalkGetAttrMock = p9test.WalkGetAttrMock{ - Err: syscall.ENOENT, - } + // Remove the file from the remote fs, subsequent walks + // should now fail to find anything. + rootFile.RemoveChild(file) // Walk again. Depending on the cache policy, we may // get ENOENT. 
- newDirent, err = rootDir.Walk(ctx, rootDir, name) + newDirent, err = rootDir.Walk(ctx, rootDir, file) if test.postRemovalWantReload && err == nil { - t.Errorf("Lookup(%q) with cachePolicy=%s got nil error, wanted ENOENT", name, test.cachePolicy) + t.Errorf("Lookup with cachePolicy=%s got nil error, wanted ENOENT", test.cachePolicy) } if !test.postRemovalWantReload && (err != nil || dirent != newDirent) { - t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v and error %v, wanted old dirent %v and nil error", name, test.cachePolicy, newDirent, err, dirent) - } - }) - } -} - -func TestSetTimestamps(t *testing.T) { - // Test parameters. - type setTimestampsTest struct { - // Name of the test. - name string - - // Function input parameters. - ts fs.TimeSpec - } - - ctx := contexttest.Context(t) - now := ktime.NowFromContext(ctx) - tests := []setTimestampsTest{ - { - name: "mock SetAttr passes (function succeeds)", - ts: fs.TimeSpec{ - ATime: now, - MTime: now, - }, - }, - { - name: "mock SetAttr passes, times are 0 (function succeeds)", - ts: fs.TimeSpec{}, - }, - { - name: "mock SetAttr passes, times are 0 and not system time (function succeeds)", - ts: fs.TimeSpec{ - ATimeSetSystemTime: false, - MTimeSetSystemTime: false, - }, - }, - { - name: "mock SetAttr passes, times are set to system time (function succeeds)", - ts: fs.TimeSpec{ - ATimeSetSystemTime: true, - MTimeSetSystemTime: true, - }, - }, - { - name: "mock SetAttr passes, times are omitted (function succeeds)", - ts: fs.TimeSpec{ - ATimeOmit: true, - MTimeOmit: true, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - // Call function. - err = rootInode.SetTimestamps(ctx, nil /* Dirent */, test.ts) - - // Check return values. - if err != nil { - t.Errorf("SetTimestamps failed: got error %v, want nil", err) - } - - // Check mock parameters. - if !(test.ts.ATimeOmit && test.ts.MTimeOmit) && !rootFile.SetAttrMock.Called { - t.Errorf("TestSetTimestamps failed: SetAttr not called") - return - } - - // Check what was passed to the mock function. - attr := rootFile.SetAttrMock.Attr - atimeGiven := ktime.FromUnix(int64(attr.ATimeSeconds), int64(attr.ATimeNanoSeconds)) - if test.ts.ATimeOmit { - if rootFile.SetAttrMock.Valid.ATime { - t.Errorf("ATime got set true in mask, wanted false") - } - } else { - if got, want := rootFile.SetAttrMock.Valid.ATimeNotSystemTime, !test.ts.ATimeSetSystemTime; got != want { - t.Errorf("got ATimeNotSystemTime %v, want %v", got, want) - } - if !test.ts.ATimeSetSystemTime && !test.ts.ATime.Equal(atimeGiven) { - t.Errorf("ATime got %v, want %v", atimeGiven, test.ts.ATime) - } - } - - mtimeGiven := ktime.FromUnix(int64(attr.MTimeSeconds), int64(attr.MTimeNanoSeconds)) - if test.ts.MTimeOmit { - if rootFile.SetAttrMock.Valid.MTime { - t.Errorf("MTime got set true in mask, wanted false") - } - } else { - if got, want := rootFile.SetAttrMock.Valid.MTimeNotSystemTime, !test.ts.MTimeSetSystemTime; got != want { - t.Errorf("got MTimeNotSystemTime %v, want %v", got, want) - } - if !test.ts.MTimeSetSystemTime && !test.ts.MTime.Equal(mtimeGiven) { - t.Errorf("MTime got %v, want %v", mtimeGiven, test.ts.MTime) - } - } - }) - } -} - -func TestSetPermissions(t *testing.T) { - // Test parameters. - type setPermissionsTest struct { - // Name of the test. - name string - - // SetPermissions input parameters. 
- perms fs.FilePermissions - - // Error that SetAttr mock should return. - setAttrErr error - - // Expected return value. - want bool - } - - tests := []setPermissionsTest{ - { - name: "SetAttr mock succeeds (function succeeds)", - perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true, Execute: true}}, - want: true, - setAttrErr: nil, - }, - { - name: "SetAttr mock fails (function fails)", - perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}, - want: false, - setAttrErr: syscall.ENOENT, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, 0, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - rootFile.SetAttrMock.Err = test.setAttrErr - - ok := rootInode.SetPermissions(ctx, nil /* Dirent */, test.perms) - - // Check return value. - if ok != test.want { - t.Errorf("SetPermissions got %v, want %v", ok, test.want) - } - - // Check mock parameters. - pattr := rootFile.SetAttrMock.Attr - if !rootFile.SetAttrMock.Called { - t.Errorf("SetAttr not called") - return - } - if !rootFile.SetAttrMock.Valid.Permissions { - t.Errorf("SetAttr did not get right request (got false, expected SetAttrMask.Permissions true)") - } - if got := fs.FilePermsFromP9(pattr.Permissions); got != test.perms { - t.Errorf("SetAttr did not get right permissions -- got %v, want %v", got, test.perms) - } - }) - } -} - -func TestClose(t *testing.T) { - ctx := contexttest.Context(t) - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - // Call function. - rootInode.InodeOperations.Release(ctx) - - // Check mock parameters. - if !rootFile.CloseMock.Called { - t.Errorf("TestClose failed: Close not called") - } -} - -func TestRename(t *testing.T) { - // Test parameters. - type renameTest struct { - // Name of the test. - name string - - // Input parameters. - newParent *fs.Inode - newName string - - // Rename mock parameters. - renameErr error - renameCalled bool - - // Error want to return given the parameters. (Same as what - // we expect and tell rename to return.) - want error - } - ctx := contexttest.Context(t) - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - tests := []renameTest{ - { - name: "mock Rename succeeds (function succeeds)", - newParent: rootInode, - newName: "foo2", - want: nil, - renameErr: nil, - renameCalled: true, - }, - { - name: "mock Rename fails (function fails)", - newParent: rootInode, - newName: "foo2", - want: syscall.ENOENT, - renameErr: syscall.ENOENT, - renameCalled: true, - }, - { - name: "newParent is not inodeOperations but should be (function fails)", - newParent: fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}), - newName: "foo2", - want: syscall.EXDEV, - renameErr: nil, - renameCalled: false, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - mockFile := goodMockFile(p9.PermissionsMask, 0) - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.File = mockFile - - dirent, err := rootInode.Lookup(ctx, "foo") - if err != nil { - t.Fatalf("root.Walk failed: %v", err) - } - mockFile.RenameMock.Err = test.renameErr - mockFile.RenameMock.Called = false - - // Use a dummy oldParent to acquire write access to that directory. 
- oldParent := &inodeOperations{ - readdirCache: fs.NewSortedDentryMap(nil), - } - oldInode := fs.NewInode(oldParent, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}) - - // Call function. - err = dirent.Inode.InodeOperations.Rename(ctx, oldInode, "", test.newParent, test.newName) - - // Check return value. - if err != test.want { - t.Errorf("Rename got %v, want %v", err, test.want) - } - - // Check mock parameters. - if got, want := mockFile.RenameMock.Called, test.renameCalled; got != want { - t.Errorf("renameCalled got %v want %v", got, want) - } - }) - } -} - -// This file is read from in TestPreadv. -type readAtFileFake struct { - p9test.FileMock - - // Parameters for faking ReadAt. - FileLength int - Err error - ChunkSize int - Called bool - LengthRead int -} - -func (r *readAtFileFake) ReadAt(p []byte, offset uint64) (int, error) { - r.Called = true - log.Warningf("ReadAt fake: length read so far = %d, len(p) = %d, offset = %d", r.LengthRead, len(p), offset) - if int(offset) != r.LengthRead { - return 0, fmt.Errorf("offset got %d; expected %d", offset, r.LengthRead) - } - - if r.Err != nil { - return 0, r.Err - } - - if r.LengthRead >= r.FileLength { - return 0, io.EOF - } - - // Read at most ChunkSize and read at most what's left in the file. - toBeRead := len(p) - if r.LengthRead+toBeRead >= r.FileLength { - toBeRead = r.FileLength - int(offset) - } - if toBeRead > r.ChunkSize { - toBeRead = r.ChunkSize - } - - r.LengthRead += toBeRead - if r.LengthRead == r.FileLength { - return toBeRead, io.EOF - } - return toBeRead, nil -} - -func TestPreadv(t *testing.T) { - // Test parameters. - type preadvTest struct { - // Name of the test. - name string - - // Mock parameters - mode p9.FileMode - - // Buffer to read into. - buffer [512]byte - sliceSize int - - // How much readAt returns at a time. - chunkSize int - - // Whether or not we expect ReadAt to be called. - readAtCalled bool - readAtErr error - - // Expected return values. - want error - } - - tests := []preadvTest{ - { - name: "fake ReadAt succeeds, 512 bytes requested, 512 byte chunks (function succeeds)", - want: nil, - readAtErr: nil, - mode: p9.PermissionsMask, - readAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - { - name: "fake ReadAt succeeds, 512 bytes requested, 200 byte chunks (function succeeds)", - want: nil, - readAtErr: nil, - mode: p9.PermissionsMask, - readAtCalled: true, - sliceSize: 512, - chunkSize: 200, - }, - { - name: "fake ReadAt succeeds, 0 bytes requested (function succeeds)", - want: nil, - readAtErr: nil, - mode: p9.PermissionsMask, - readAtCalled: false, - sliceSize: 0, - chunkSize: 100, - }, - { - name: "fake ReadAt returns 0 bytes and EOF (function fails)", - want: io.EOF, - readAtErr: io.EOF, - mode: p9.PermissionsMask, - readAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, test.mode, 1024) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - // Set up the read buffer. - dst := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) - - // This file will be read from. 
- openFile := &readAtFileFake{ - Err: test.readAtErr, - FileLength: test.sliceSize, - ChunkSize: test.chunkSize, - } - rootFile.WalkGetAttrMock.File = openFile - rootFile.WalkGetAttrMock.Attr.Mode = test.mode - rootFile.WalkGetAttrMock.Valid.Mode = true - - f := NewFile( - ctx, - fs.NewDirent(rootInode, ""), - "", - fs.FileFlags{Read: true}, - rootInode.InodeOperations.(*inodeOperations), - &handles{File: contextFile{file: openFile}}, - ) - - // Call function. - _, err = f.Preadv(ctx, dst, 0) - - // Check return value. - if err != test.want { - t.Errorf("Preadv got %v, want %v", err, test.want) - } - - // Check mock parameters. - if test.readAtCalled != openFile.Called { - t.Errorf("ReadAt called: %v, but expected opposite", openFile.Called) - } - }) - } -} - -func TestReadlink(t *testing.T) { - // Test parameters. - type readlinkTest struct { - // Name of the test. - name string - - // Mock parameters - mode p9.FileMode - - // Whether or not we expect ReadAt to be called and what error - // it shall return. - readlinkCalled bool - readlinkErr error - - // Expected return values. - want error - } - - tests := []readlinkTest{ - { - name: "file is not symlink (function fails)", - want: syscall.ENOLINK, - mode: p9.PermissionsMask, - readlinkCalled: false, - readlinkErr: nil, - }, - { - name: "mock Readlink succeeds (function succeeds)", - want: nil, - mode: p9.PermissionsMask | p9.ModeSymlink, - readlinkCalled: true, - readlinkErr: nil, - }, - { - name: "mock Readlink fails (function fails)", - want: syscall.ENOENT, - mode: p9.PermissionsMask | p9.ModeSymlink, - readlinkCalled: true, - readlinkErr: syscall.ENOENT, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, test.mode, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - openFile := goodMockFile(test.mode, 0) - rootFile.WalkMock.File = openFile - rootFile.ReadlinkMock.Err = test.readlinkErr - - // Call function. - _, err = rootInode.Readlink(ctx) - - // Check return value. - if err != test.want { - t.Errorf("Readlink got %v, want %v", err, test.want) - } - - // Check mock parameters. - if test.readlinkCalled && !rootFile.ReadlinkMock.Called { - t.Errorf("Readlink not called") - } - }) - } -} - -// This file is write from in TestPwritev. -type writeAtFileFake struct { - p9test.FileMock - - // Parameters for faking WriteAt. - Err error - ChunkSize int - Called bool - LengthWritten int -} - -func (r *writeAtFileFake) WriteAt(p []byte, offset uint64) (int, error) { - r.Called = true - log.Warningf("WriteAt fake: length written so far = %d, len(p) = %d, offset = %d", r.LengthWritten, len(p), offset) - if int(offset) != r.LengthWritten { - return 0, fmt.Errorf("offset got %d; want %d", offset, r.LengthWritten) - } - - if r.Err != nil { - return 0, r.Err - } - - // Write at most ChunkSize. - toBeWritten := len(p) - if toBeWritten > r.ChunkSize { - toBeWritten = r.ChunkSize - } - r.LengthWritten += toBeWritten - return toBeWritten, nil -} - -func TestPwritev(t *testing.T) { - // Test parameters. - type pwritevTest struct { - // Name of the test. - name string - - // Mock parameters - mode p9.FileMode - - allowWrite bool - - // Buffer to write into. - buffer [512]byte - sliceSize int - chunkSize int - - // Whether or not we expect writeAt to be called. - writeAtCalled bool - writeAtErr error - - // Expected return values. 
- want error - } - - tests := []pwritevTest{ - { - name: "fake writeAt succeeds, one chunk (function succeeds)", - want: nil, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: true, - writeAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - { - name: "fake writeAt fails, short write (function fails)", - want: io.ErrShortWrite, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: true, - writeAtCalled: true, - sliceSize: 512, - chunkSize: 200, - }, - { - name: "fake writeAt succeeds, len 0 (function succeeds)", - want: nil, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: true, - writeAtCalled: false, - sliceSize: 0, - chunkSize: 0, - }, - { - name: "writeAt can still write despite file permissions read only (function succeeds)", - want: nil, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: false, - writeAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - _, rootInode, err := root(ctx, cacheNone, test.mode, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - src := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) - - // This is the file that will be used for writing. - openFile := &writeAtFileFake{ - Err: test.writeAtErr, - ChunkSize: test.chunkSize, - } - - f := NewFile( - ctx, - fs.NewDirent(rootInode, ""), - "", - fs.FileFlags{Write: true}, - rootInode.InodeOperations.(*inodeOperations), - &handles{File: contextFile{file: openFile}}, - ) - - // Call function. - _, err = f.Pwritev(ctx, src, 0) - - // Check return value. - if err != test.want { - t.Errorf("Pwritev got %v, want %v", err, test.want) - } - - // Check mock parameters. - if test.writeAtCalled != openFile.Called { - t.Errorf("WriteAt called: %v, but expected opposite", openFile.Called) - return + t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err) } - if openFile.Called && test.writeAtErr != nil && openFile.LengthWritten != test.sliceSize { - t.Errorf("wrote %d bytes, expected %d bytes written", openFile.LengthWritten, test.sliceSize) + if err == nil { + newDirent.DecRef() // See above. } }) } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 7552216f3..f76a83cd9 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -91,10 +91,6 @@ func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint { type session struct { refs.AtomicRefCount - // conn is a unet.Socket that wraps the readFD/writeFD mount option, - // see fs/gofer/fs.go. - conn *unet.Socket `state:"nosave"` - // msize is the value of the msize mount option, see fs/gofer/fs.go. msize uint32 `state:"wait"` @@ -142,7 +138,7 @@ type session struct { // Destroy tears down the session. func (s *session) Destroy() { - s.conn.Close() + s.client.Close() } // Revalidate implements MountSource.Revalidate. @@ -235,7 +231,6 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF // Construct the session. s := &session{ connID: dev, - conn: conn, msize: o.msize, version: o.version, cachePolicy: o.policy, @@ -252,7 +247,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF m := fs.NewMountSource(s, filesystem, superBlockFlags) // Send the Tversion request. 
- s.client, err = p9.NewClient(s.conn, s.msize, s.version) + s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { // Drop our reference on the session, it needs to be torn down. s.DecRef() diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index f657135fc..d9fd7a221 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -84,13 +84,13 @@ func (s *session) afterLoad() { } // Manually restore the connection. - s.conn, err = unet.NewSocket(opts.fd) + conn, err := unet.NewSocket(opts.fd) if err != nil { panic(fmt.Sprintf("failed to create Socket for FD %d: %v", opts.fd, err)) } // Manually restore the client. - s.client, err = p9.NewClient(s.conn, s.msize, s.version) + s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { panic(fmt.Sprintf("failed to connect client to server: %v", err)) } diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD index 34582f275..ff7dacf07 100644 --- a/pkg/sentry/fs/proc/device/BUILD +++ b/pkg/sentry/fs/proc/device/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "device", srcs = ["device.go"], diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index f362d15c8..33197cf14 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "hostcpu", srcs = [ diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD index fe6fa2260..3f8fa206c 100644 --- a/pkg/sentry/kernel/kdefs/BUILD +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "kdefs", srcs = ["kdefs.go"], diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index 66899910c..e903badd3 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "memevent", srcs = ["memory_events.go"], diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 125792f39..52e226a39 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sched", srcs = [ diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 0beb4561b..83cad186a 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") load("//tools/go_stateify:defs.bzl", "go_library") go_embed_data( diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD index 341b30b98..88738d65d 100644 --- a/pkg/sentry/memutil/BUILD +++ b/pkg/sentry/memutil/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # 
Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "memutil", srcs = [ diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index 35121321a..dbafa3204 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "interrupt", srcs = [ diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 4ef9e20d7..1b71e629f 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index e779e3893..1dffe94a4 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "testutil", testonly = 1, diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD index ba68d48f4..d3398d1e8 100644 --- a/pkg/sentry/platform/procid/BUILD +++ b/pkg/sentry/platform/procid/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "procid", srcs = [ diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index debae058b..2eb354ad4 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ptrace", srcs = [ diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 2485eb2eb..c35d49f2d 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_template( diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 3bce56985..b76d7974e 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 7a86e2234..de1b920af 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_template( diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 7dcf6e561..614d9e21e 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "safecopy", srcs = [ diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index e96509ce1..87a9bff12 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "safemem", srcs = [ diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index 751176747..41313d334 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sighandling", srcs = [ diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 38fa54283..06e121946 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "rpcinet", srcs = [ diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD index c51ca14b1..a16977f29 100644 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # BSD - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # BSD + go_library( name = "conn", srcs = ["conn.go"], diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index 2ae902b3f..2bab01774 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # BSD - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # BSD + go_library( name = "notifier", srcs = ["notifier.go"], diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index a57a8298e..f1f6fdb7d 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "state", srcs = [ diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 674554081..52c7f325c 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "strace", srcs = [ diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 2a9f0915e..35192ff49 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,7 +1,7 @@ 
-package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "syscalls", srcs = [ diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 9452787fb..5dadb8a2d 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index 63da5e81f..42e24ace5 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + proto_library( name = "unimplemented_syscall_proto", srcs = ["unimplemented_syscall.proto"], diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index 68b82af47..0929497c3 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "uniqueid", srcs = ["context.go"], diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index 13bc33eb1..b2c687b20 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "watchdog", srcs = ["watchdog.go"], diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index 05e4ca540..338fd9336 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sleep", srcs = [ diff --git a/pkg/state/BUILD b/pkg/state/BUILD index 6a5b2d4ff..dd0f250fa 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + +package(licenses = ["notice"]) # Apache 2.0 + load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index 6be78dc9b..66c8f3807 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "statefile", srcs = ["statefile.go"], diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD index 4fa959df0..9cb7f66fe 100644 --- a/pkg/sync/atomicptrtest/BUILD +++ b/pkg/sync/atomicptrtest/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD index 
07b4f85ab..54f8e59b1 100644 --- a/pkg/sync/seqatomictest/BUILD +++ b/pkg/sync/seqatomictest/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 5dd2e90bb..30ae20772 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "syserr", srcs = [ diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD index e050c2043..d4c6da97a 100644 --- a/pkg/syserror/BUILD +++ b/pkg/syserror/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "syserror", srcs = ["syserror.go"], diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index bf618831a..723ad668f 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "gonet", srcs = ["gonet.go"], diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index e8a524918..a1de808b9 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "checker", testonly = 1, diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index 9a6f49c45..25f6c1457 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "channel", srcs = ["channel.go"], diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 6e75e9f47..94391433c 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fdbased", srcs = ["endpoint.go"], diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index cc4247ffd..a46ba7f11 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "loopback", srcs = ["loopback.go"], diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index 10b35a37e..829ea7c42 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "rawfile", srcs = [ diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index 5390257c5..d7f1e66ef 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ 
b/pkg/tcpip/link/sharedmem/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sharedmem", srcs = [ diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index ff798ae6f..12e813509 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "pipe", srcs = [ diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD index c4a7879c4..661037bb2 100644 --- a/pkg/tcpip/link/sharedmem/queue/BUILD +++ b/pkg/tcpip/link/sharedmem/queue/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "queue", srcs = [ diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 7155aea66..52e237c25 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sniffer", srcs = [ diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index a8bb03661..5ec01cec9 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "tun", srcs = ["tun_unsafe.go"], diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index 7582df32e..ba495c437 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "waitable", srcs = [ diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index 25a3c98b6..a2a07f533 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_test( name = "ip_test", size = "small", diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index 44f2b66e5..f6fb7daf7 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "arp", srcs = ["arp.go"], diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD index 1c22c52fc..401dce646 100644 --- a/pkg/tcpip/network/hash/BUILD +++ b/pkg/tcpip/network/hash/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "hash", srcs = ["hash.go"], diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 90d65d531..e72317e9f 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,7 +1,7 @@ 
-package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ipv4", srcs = [ diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index 2f19a659e..808c37df3 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ipv6", srcs = [ diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index 3c3374275..c69fc0744 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ports", srcs = ["ports.go"], diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index 21d32245d..32baf2115 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "tun_tcp_connect", srcs = ["main.go"], diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD index d7402aaa2..760445843 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/BUILD +++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "tun_tcp_echo", srcs = ["main.go"], diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD index 7a95594ef..814e5c1ea 100644 --- a/pkg/tcpip/transport/tcp/testing/context/BUILD +++ b/pkg/tcpip/transport/tcp/testing/context/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "context", testonly = 1, diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD index 46da3e6f1..ac1a94d4d 100644 --- a/pkg/tcpip/transport/tcpconntrack/BUILD +++ b/pkg/tcpip/transport/tcpconntrack/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "tcpconntrack", srcs = ["tcp_conntrack.go"], diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index d18338fff..c20df7005 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "tmutex", srcs = ["tmutex.go"], diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index acdfd7cb6..f90e43c89 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "unet", srcs = [ diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index d32c57d1a..21008cf6c 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -1,7 +1,7 @@ -package(licenses = 
["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "urpc", srcs = ["urpc.go"], diff --git a/pkg/waiter/fdnotifier/BUILD b/pkg/waiter/fdnotifier/BUILD index 4e582755d..af6baa303 100644 --- a/pkg/waiter/fdnotifier/BUILD +++ b/pkg/waiter/fdnotifier/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fdnotifier", srcs = [ diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 04cc0e854..07afce807 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "boot", srcs = [ diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 48f2c8024..004222242 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "filter", srcs = [ diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index 10a8e5feb..bf2f373a9 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "cgroup", srcs = ["cgroup.go"], diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 7040eb4ec..394bb0e1f 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "cmd", srcs = [ diff --git a/runsc/console/BUILD b/runsc/console/BUILD index fa1a7d430..ff4ccff69 100644 --- a/runsc/console/BUILD +++ b/runsc/console/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "console", srcs = ["console.go"], diff --git a/runsc/container/BUILD b/runsc/container/BUILD index f4c6f1525..bdd93aaba 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "container", srcs = [ diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 24e172f48..f28e4fa77 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fsgofer", srcs = [ diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index 40f4f2205..c7848d10c 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "filter", srcs = [ diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index e03bb7752..fd913831a 100644 --- a/runsc/fsgofer/fsgofer.go +++ 
b/runsc/fsgofer/fsgofer.go @@ -26,7 +26,6 @@ import ( "math" "os" "path" - "strings" "sync" "syscall" @@ -181,18 +180,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { } } -func isNameValid(name string) bool { - if name == "" || name == "." || name == ".." { - log.Warningf("Invalid name: %s", name) - return false - } - if strings.IndexByte(name, '/') >= 0 { - log.Warningf("Invalid name: %s", name) - return false - } - return true -} - // localFile implements p9.File wrapping a local file. The underlying file // is opened during Walk() and stored in 'controlFile' to be used with other // operations. The mode in which the file is opened varies depending on the @@ -228,11 +215,7 @@ type localFile struct { // attachPoint is the attachPoint that serves this localFile. attachPoint *attachPoint - // mu protects 'hostPath' when file is renamed. - mu sync.Mutex - - // TODO: hostPath is not safe to use as path needs to be walked - // everytime (and can change underneath us). Remove all usages. + // hostPath will be safely updated by the Renamed hook. hostPath string // controlFile is opened when localFile is created and it's never nil. @@ -246,6 +229,7 @@ type localFile struct { // if localFile isn't opened. mode p9.OpenFlags + // ft is the fileType for this file. ft fileType // readDirMu protects against concurrent Readdir calls. @@ -296,10 +280,7 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { return nil, "", extractErrno(err) } - parent.mu.Lock() - defer parent.mu.Unlock() newPath := path.Join(parent.hostPath, name) - return os.NewFile(uintptr(fd), newPath), newPath, nil } @@ -382,13 +363,10 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) var err error - l.mu.Lock() newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) if err != nil { - l.mu.Unlock() return nil, p9.QID{}, 0, extractErrno(err) } - l.mu.Unlock() } stat, err := stat(int(newFile.Fd())) @@ -418,9 +396,6 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid } return nil, nil, p9.QID{}, 0, syscall.EBADF } - if !isNameValid(name) { - return nil, nil, p9.QID{}, 0, syscall.EINVAL - } // Use a single file for both 'controlFile' and 'openedFile'. Mode must include read for control // and whichever else was requested by caller. 
Note that resulting file might have a wider mode @@ -452,9 +427,6 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, extractErrno(err) } - l.mu.Lock() - defer l.mu.Unlock() - cPath := path.Join(l.hostPath, name) f := os.NewFile(uintptr(fd), cPath) c := &localFile{ @@ -477,10 +449,6 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) return p9.QID{}, syscall.EBADF } - if !isNameValid(name) { - return p9.QID{}, syscall.EINVAL - } - if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } @@ -517,9 +485,6 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { return nil, nil, extractErrno(err) } - l.mu.Lock() - defer l.mu.Unlock() - c := &localFile{ attachPoint: l.attachPoint, hostPath: l.hostPath, @@ -532,10 +497,6 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { var qids []p9.QID last := l for _, name := range names { - if !isNameValid(name) { - return nil, nil, syscall.EINVAL - } - f, path, err := openAnyFile(last, name) if err != nil { return nil, nil, extractErrno(err) @@ -761,15 +722,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { return err } -// Remove implements p9.File. -// -// This is deprecated in favor of UnlinkAt. -func (*localFile) Remove() error { - return syscall.ENOSYS +// Rename implements p9.File; this should never be called. +func (l *localFile) Rename(p9.File, string) error { + panic("rename called directly") } -// Rename implements p9.File. -func (l *localFile) Rename(directory p9.File, name string) error { +// RenameAt implements p9.File.RenameAt. +// +// TODO: change to renameat(2). +func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error { conf := l.attachPoint.conf if conf.ROMount { if conf.PanicOnWrite { @@ -777,34 +738,16 @@ func (l *localFile) Rename(directory p9.File, name string) error { } return syscall.EBADF } - if !isNameValid(name) { - return syscall.EINVAL - } - - l.mu.Lock() - defer l.mu.Unlock() - // TODO: change to renameat(2) - parent := directory.(*localFile) - newPath := path.Join(parent.hostPath, name) - if err := syscall.Rename(l.hostPath, newPath); err != nil { + newParent := directory.(*localFile) + oldPath := path.Join(l.hostPath, oldName) + newPath := path.Join(newParent.hostPath, newName) + if err := syscall.Rename(oldPath, newPath); err != nil { return extractErrno(err) } - - // Update path on success. - // TODO: this doesn't cover cases where any of the - // parents have been renamed. - l.hostPath = newPath return nil } -// RenameAt implements p9.File.RenameAt. -// -// Code still uses [deprecated] Rename(). -func (*localFile) RenameAt(_ string, _ p9.File, _ string) error { - return syscall.ENOSYS -} - // ReadAt implements p9.File. func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { @@ -848,9 +791,6 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. 
} return p9.QID{}, syscall.EBADF } - if !isNameValid(newName) { - return p9.QID{}, syscall.EINVAL - } if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { return p9.QID{}, extractErrno(err) @@ -882,9 +822,6 @@ func (l *localFile) Link(target p9.File, newName string) error { } return syscall.EBADF } - if !isNameValid(newName) { - return syscall.EINVAL - } targetFile := target.(*localFile) if err := unix.Linkat(targetFile.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH); err != nil { @@ -909,9 +846,7 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error { } return syscall.EBADF } - if !isNameValid(name) { - return syscall.EINVAL - } + if err := unix.Unlinkat(l.controlFD(), name, int(flags)); err != nil { return extractErrno(err) } @@ -1000,6 +935,11 @@ func (l *localFile) Close() error { return err } +// Renamed implements p9.Renamed. +func (l *localFile) Renamed(newDir p9.File, newName string) { + l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) +} + // extractErrno tries to determine the errno. func extractErrno(err error) syscall.Errno { if err == nil { diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 48860f952..34033245b 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -415,22 +415,22 @@ func TestLink(t *testing.T) { func TestROMountChecks(t *testing.T) { runCustom(t, allTypes, roConfs, func(t *testing.T, s state) { - if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if err := s.file.Rename(s.file, ".."); err != syscall.EBADF { + if err := s.file.RenameAt("some_file", s.file, "other_file"); err != syscall.EBADF { t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if err := s.file.UnlinkAt("..", 0); err != syscall.EBADF { + if err := s.file.UnlinkAt("some_file", 0); err != syscall.EBADF { t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if err := s.file.Link(s.file, ".."); err != syscall.EBADF { + if err := s.file.Link(s.file, "some_link"); err != syscall.EBADF { t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err) } @@ -445,12 +445,12 @@ func TestROMountChecks(t *testing.T) { func TestROMountPanics(t *testing.T) { conf := Config{ROMount: true, PanicOnWrite: true} runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) { - assertPanic(t, func() { s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - 
assertPanic(t, func() { s.file.Rename(s.file, "..") }) - assertPanic(t, func() { s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.UnlinkAt("..", 0) }) - assertPanic(t, func() { s.file.Link(s.file, "..") }) + assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") }) + assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) }) + assertPanic(t, func() { s.file.Link(s.file, "some_link") }) valid := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: 0} @@ -458,60 +458,6 @@ func TestROMountPanics(t *testing.T) { }) } -func TestInvalidName(t *testing.T) { - runCustom(t, []fileType{regular}, rwConfs, func(t *testing.T, s state) { - if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { - t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if _, _, err := s.file.Walk([]string{".."}); err != syscall.EINVAL { - t.Errorf("%v: Walk() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { - t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if err := s.file.Rename(s.file, ".."); err != syscall.EINVAL { - t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { - t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if err := s.file.UnlinkAt("..", 0); err != syscall.EINVAL { - t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if err := s.file.Link(s.file, ".."); err != syscall.EINVAL { - t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - }) -} - -func TestIsNameValid(t *testing.T) { - valid := []string{ - "name", - "123", - "!@#$%^&*()", - ".name", - "..name", - "...", - } - for _, s := range valid { - if got := isNameValid(s); !got { - t.Errorf("isNameValid(%s) failed, got: %v, expected: true", s, got) - } - } - invalid := []string{ - ".", - "..", - "name/name", - "/name", - "name/", - } - for _, s := range invalid { - if got := isNameValid(s); got { - t.Errorf("isNameValid(%s) failed, got: %v, expected: false", s, got) - } - } -} - func TestWalkNotFound(t *testing.T) { runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index eb9c4cd76..d6043bcf7 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sandbox", srcs = [ diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index e73b2293f..a1e5da3f5 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -1,7 +1,7 @@ -package(licenses = 
["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "specutils", srcs = [ diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index c41161d50..22b3ebd2a 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_test( name = "image_test", size = "large", diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 726ebf49e..e7204dc66 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_test( name = "integration_test", size = "large", diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD index c69249b52..c2567ef23 100644 --- a/runsc/test/root/BUILD +++ b/runsc/test/root/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "root", srcs = ["root.go"], diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index da2535bfa..128bd80fb 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "testutil", srcs = [ diff --git a/runsc/tools/dockercfg/BUILD b/runsc/tools/dockercfg/BUILD index 5abb0c90a..a80b3abab 100644 --- a/runsc/tools/dockercfg/BUILD +++ b/runsc/tools/dockercfg/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "dockercfg", srcs = ["dockercfg.go"], diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 1afc58625..22c2e62c3 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "go_generics", srcs = [ diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD index a238becab..c26ac56d2 100644 --- a/tools/go_generics/globals/BUILD +++ b/tools/go_generics/globals/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "globals", srcs = [ diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD index 2d9a6fa9d..23b2d656d 100644 --- a/tools/go_generics/rules_tests/BUILD +++ b/tools/go_generics/rules_tests/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_template_instance( diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index edbeb4e2d..68d37f5d7 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - 
load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "stateify", srcs = ["main.go"], -- cgit v1.2.3 From a5fe397cf806cbc488757dbd73a3d83eb5da11cd Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 24 Oct 2018 23:06:51 -0700 Subject: Re-enable TestPythonHello now that ptrace seccomp issue is resolved. PiperOrigin-RevId: 218636184 Change-Id: I44deac3f32276d06955c5fb1e28c5970bb08f5fd --- runsc/test/image/image_test.go | 6 ------ runsc/test/testutil/docker.go | 9 --------- 2 files changed, 15 deletions(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 763152b47..f7e750d71 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -192,12 +192,6 @@ func TestMysql(t *testing.T) { } func TestPythonHello(t *testing.T) { - // TODO: This test occasionally hangs on the ptrace - // platform. Re-enable it once we have this issue fixed. - if testutil.IsPtracePlatform() { - t.Skipf("Skipping PythonHello test on ptrace platform") - } - if err := testutil.Pull("google/python-hello"); err != nil { t.Fatalf("docker pull failed: %v", err) } diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 3f74e0770..9a76397be 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -49,15 +49,6 @@ func IsPauseResumeSupported() bool { return !strings.Contains(getRuntime(), "hostnet") } -// IsPtracePlatform returns true if the runtime is using ptrace platform. -// -// TODO: Tests should not depend on the platform, but -// TestPythonHello sometimes hangs on ptrace. Once that is debugged, this -// method should go away. -func IsPtracePlatform() bool { - return !strings.Contains(getRuntime(), "kvm") -} - // EnsureSupportedDockerVersion checks if correct docker is installed. func EnsureSupportedDockerVersion() { cmd := exec.Command("docker", "version") -- cgit v1.2.3 From 479cd52a6075066e93ce0c1bd0f183bb5df4fcc7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 25 Oct 2018 11:45:37 -0700 Subject: Uninstall() should not fail if a cgroup directory doesn't exist It can be occurred if two controllers are mounted together or if Uninstall() is called on a error path. 
PiperOrigin-RevId: 218723886 Change-Id: I69d7a3c0685a7da38527ea8b7b301dbe96268285 --- runsc/cgroup/cgroup.go | 6 +++++- runsc/cgroup/cgroup_test.go | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 0ceeb3f28..15071387b 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -229,7 +229,11 @@ func (c *Cgroup) Uninstall() error { defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) if err := backoff.Retry(func() error { - return syscall.Rmdir(path) + err := syscall.Rmdir(path) + if os.IsNotExist(err) { + return nil + } + return err }, b); err != nil { return fmt.Errorf("error removing cgroup path %q: %v", path, err) } diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 4a4713d4f..ecc184f74 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -18,6 +18,17 @@ import ( "testing" ) +func TestUninstallEnoent(t *testing.T) { + c := Cgroup{ + // set a non-existent name + Name: "runsc-test-uninstall-656e6f656e740a", + Own: true, + } + if err := c.Uninstall(); err != nil { + t.Errorf("Uninstall() failed: %v", err) + } +} + func TestCountCpuset(t *testing.T) { for _, tc := range []struct { str string -- cgit v1.2.3 From 624cc329d89bff5f2b0e787d255e718514ec585b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 26 Oct 2018 12:17:51 -0700 Subject: Order feature strings by block PiperOrigin-RevId: 218894181 Change-Id: I97d0c74175f4aa528363f768a0a85d6953ea0bfd --- pkg/cpuid/cpuid.go | 172 ++++++++++++++++++++++++++++++----------------------- runsc/cgroup/BUILD | 1 + 2 files changed, 99 insertions(+), 74 deletions(-) (limited to 'runsc') diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index 5b083a5fb..9eec45717 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -228,64 +228,69 @@ var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4} // names of the basic features in Linux defined in // arch/x86/kernel/cpu/capflags.c. 
var x86FeatureStrings = map[Feature]string{ - X86FeatureFPU: "fpu", - X86FeatureVME: "vme", - X86FeatureDE: "de", - X86FeaturePSE: "pse", - X86FeatureTSC: "tsc", - X86FeatureMSR: "msr", - X86FeaturePAE: "pae", - X86FeatureMCE: "mce", - X86FeatureCX8: "cx8", - X86FeatureAPIC: "apic", - X86FeatureSEP: "sep", - X86FeatureMTRR: "mtrr", - X86FeaturePGE: "pge", - X86FeatureMCA: "mca", - X86FeatureCMOV: "cmov", - X86FeaturePAT: "pat", - X86FeaturePSE36: "pse36", - X86FeaturePSN: "pn", - X86FeatureCLFSH: "clflush", - X86FeatureDS: "dts", - X86FeatureACPI: "acpi", - X86FeatureMMX: "mmx", - X86FeatureFXSR: "fxsr", - X86FeatureSSE: "sse", - X86FeatureSSE2: "sse2", - X86FeatureSS: "ss", - X86FeatureHTT: "ht", - X86FeatureTM: "tm", - X86FeatureIA64: "ia64", - X86FeaturePBE: "pbe", - X86FeatureSSE3: "pni", - X86FeaturePCLMULDQ: "pclmulqdq", - X86FeatureDTES64: "dtes64", - X86FeatureMONITOR: "monitor", - X86FeatureDSCPL: "ds_cpl", - X86FeatureVMX: "vmx", - X86FeatureSMX: "smx", - X86FeatureEST: "est", - X86FeatureTM2: "tm2", - X86FeatureSSSE3: "ssse3", - X86FeatureCNXTID: "cid", - X86FeatureFMA: "fma", - X86FeatureCX16: "cx16", - X86FeatureXTPR: "xtpr", - X86FeaturePDCM: "pdcm", - X86FeaturePCID: "pcid", - X86FeatureDCA: "dca", - X86FeatureSSE4_1: "sse4_1", - X86FeatureSSE4_2: "sse4_2", - X86FeatureX2APIC: "x2apic", - X86FeatureMOVBE: "movbe", - X86FeaturePOPCNT: "popcnt", - X86FeatureTSCD: "tsc_deadline_timer", - X86FeatureAES: "aes", - X86FeatureXSAVE: "xsave", - X86FeatureAVX: "avx", - X86FeatureF16C: "f16c", - X86FeatureRDRAND: "rdrand", + // Block 0. + X86FeatureSSE3: "pni", + X86FeaturePCLMULDQ: "pclmulqdq", + X86FeatureDTES64: "dtes64", + X86FeatureMONITOR: "monitor", + X86FeatureDSCPL: "ds_cpl", + X86FeatureVMX: "vmx", + X86FeatureSMX: "smx", + X86FeatureEST: "est", + X86FeatureTM2: "tm2", + X86FeatureSSSE3: "ssse3", + X86FeatureCNXTID: "cid", + X86FeatureFMA: "fma", + X86FeatureCX16: "cx16", + X86FeatureXTPR: "xtpr", + X86FeaturePDCM: "pdcm", + X86FeaturePCID: "pcid", + X86FeatureDCA: "dca", + X86FeatureSSE4_1: "sse4_1", + X86FeatureSSE4_2: "sse4_2", + X86FeatureX2APIC: "x2apic", + X86FeatureMOVBE: "movbe", + X86FeaturePOPCNT: "popcnt", + X86FeatureTSCD: "tsc_deadline_timer", + X86FeatureAES: "aes", + X86FeatureXSAVE: "xsave", + X86FeatureAVX: "avx", + X86FeatureF16C: "f16c", + X86FeatureRDRAND: "rdrand", + + // Block 1. + X86FeatureFPU: "fpu", + X86FeatureVME: "vme", + X86FeatureDE: "de", + X86FeaturePSE: "pse", + X86FeatureTSC: "tsc", + X86FeatureMSR: "msr", + X86FeaturePAE: "pae", + X86FeatureMCE: "mce", + X86FeatureCX8: "cx8", + X86FeatureAPIC: "apic", + X86FeatureSEP: "sep", + X86FeatureMTRR: "mtrr", + X86FeaturePGE: "pge", + X86FeatureMCA: "mca", + X86FeatureCMOV: "cmov", + X86FeaturePAT: "pat", + X86FeaturePSE36: "pse36", + X86FeaturePSN: "pn", + X86FeatureCLFSH: "clflush", + X86FeatureDS: "dts", + X86FeatureACPI: "acpi", + X86FeatureMMX: "mmx", + X86FeatureFXSR: "fxsr", + X86FeatureSSE: "sse", + X86FeatureSSE2: "sse2", + X86FeatureSS: "ss", + X86FeatureHTT: "ht", + X86FeatureTM: "tm", + X86FeatureIA64: "ia64", + X86FeaturePBE: "pbe", + + // Block 2. 
X86FeatureFSGSBase: "fsgsbase", X86FeatureTSC_ADJUST: "tsc_adjust", X86FeatureBMI1: "bmi1", @@ -305,33 +310,52 @@ var x86FeatureStrings = map[Feature]string{ X86FeatureADX: "adx", X86FeatureSMAP: "smap", X86FeatureCLWB: "clwb", + X86FeatureAVX512PF: "avx512pf", + X86FeatureAVX512ER: "avx512er", X86FeatureAVX512CD: "avx512cd", + X86FeatureSHA: "sha_ni", X86FeatureAVX512BW: "avx512bw", X86FeatureAVX512VL: "avx512vl", - X86FeatureSYSCALL: "syscall", - X86FeatureNX: "nx", - X86FeatureGBPAGES: "pdpe1gb", - X86FeatureRDTSCP: "rdtscp", - X86FeatureLM: "lm", - X86FeatureXSAVEOPT: "xsaveopt", - X86FeatureXSAVEC: "xsavec", - X86FeatureXGETBV1: "xgetbv1", - X86FeatureLAHF64: "lahf_lm", // LAHF/SAHF in long mode - X86FeatureLZCNT: "abm", // Advanced bit manipulation - X86FeaturePREFETCHW: "3dnowprefetch", + + // Block 4. + X86FeatureXSAVEOPT: "xsaveopt", + X86FeatureXSAVEC: "xsavec", + X86FeatureXGETBV1: "xgetbv1", + + // Block 5. + X86FeatureLAHF64: "lahf_lm", // LAHF/SAHF in long mode + X86FeatureLZCNT: "abm", // Advanced bit manipulation + X86FeaturePREFETCHW: "3dnowprefetch", + + // Block 6. + X86FeatureSYSCALL: "syscall", + X86FeatureNX: "nx", + X86FeatureGBPAGES: "pdpe1gb", + X86FeatureRDTSCP: "rdtscp", + X86FeatureLM: "lm", } // These flags are parse only---they can be used for setting / unsetting the // flags, but will not get printed out in /proc/cpuinfo. var x86FeatureParseOnlyStrings = map[Feature]string{ - X86FeaturePKU: "pku", - X86FeatureXSAVES: "xsaves", + // Block 0. + X86FeatureSDBG: "sdbg", + X86FeatureOSXSAVE: "osxsave", + + // Block 2. + X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only", X86FeatureFPCSDS: "fpcsds", - X86FeatureOSXSAVE: "osxsave", X86FeatureIPT: "pt", - X86FeatureSDBG: "sdbg", - X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only", X86FeatureCLFLUSHOPT: "clfushopt", + + // Block 3. + X86FeaturePREFETCHWT1: "prefetchwt1", + X86FeatureAVX512VBMI: "avx512vbmi", + X86FeatureUMIP: "umip", + X86FeaturePKU: "pku", + + // Block 4. + X86FeatureXSAVES: "xsaves", } // These are the default values of various FeatureSet fields. diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index bf2f373a9..4f9a25a25 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -22,4 +22,5 @@ go_test( size = "small", srcs = ["cgroup_test.go"], embed = [":cgroup"], + tags = ["local"], ) -- cgit v1.2.3 From b42a2a32038a8d9098d94c0435fe99e1e2b9a7f2 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 29 Oct 2018 10:30:58 -0700 Subject: Removes outdated TODO. PiperOrigin-RevId: 219151173 Change-Id: I73014ea648ae485692ea0d44860c87f4365055cb --- runsc/boot/loader.go | 7 ------- 1 file changed, 7 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index fa3de0133..abb347835 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -668,13 +668,6 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { } func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error { - // TODO: Containers all currently share a PID namespace. - // When per-container PID namespaces are supported, wait should use cid - // to find the appropriate PID namespace. - /*if cid != l.sandboxID { - return errors.New("non-sandbox PID namespaces are not yet implemented") - }*/ - // If the process was started via runsc exec, it will have an // entry in l.processes. 
l.mu.Lock() -- cgit v1.2.3 From 0091db9cbddb6c9fb4c96fbde980780c98006eda Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 30 Oct 2018 22:45:51 -0700 Subject: kvm: use private futexes. Use private futexes for performance and to align with other runtime uses. PiperOrigin-RevId: 219422634 Change-Id: Ief2af5e8302847ea6dc246e8d1ee4d64684ca9dd --- pkg/sentry/platform/kvm/machine_unsafe.go | 4 ++-- runsc/boot/filter/config.go | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 4f5b01321..38c1f102f 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -88,7 +88,7 @@ func (c *vCPU) notify() { _, _, errno := syscall.RawSyscall6( syscall.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), - linux.FUTEX_WAKE, + linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, ^uintptr(0), // Number of waiters. 0, 0, 0) if errno != 0 { @@ -106,7 +106,7 @@ func (c *vCPU) waitUntilNot(state uint32) { _, _, errno := syscall.Syscall6( syscall.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), - linux.FUTEX_WAIT, + linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, uintptr(state), 0, 0, 0) if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN { diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 378396b9b..83c1fbcce 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -438,7 +438,6 @@ func ptraceFilters() seccomp.SyscallRules { func kvmFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ARCH_PRCTL: {}, - syscall.SYS_FUTEX: {}, syscall.SYS_IOCTL: {}, syscall.SYS_MMAP: {}, syscall.SYS_RT_SIGSUSPEND: {}, -- cgit v1.2.3 From ccc3d7ca11a2a623587c651a6690aaa46d2c2665 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 31 Oct 2018 11:27:10 -0700 Subject: Make lazy open the mode of operation for fsgofer With recent changes to 9P server, path walks are now safe inside open, create, rename and setattr calls. To simplify the code, remove the lazyopen=false mode that was used for bind mounts, and converge all mounts to using lazy open. PiperOrigin-RevId: 219508628 Change-Id: I073e7e1e2e9a9972d150eaf4cb29e553997a9b76 --- runsc/cmd/gofer.go | 8 +--- runsc/fsgofer/BUILD | 1 + runsc/fsgofer/fsgofer.go | 97 +++++++++++++---------------------------- runsc/fsgofer/fsgofer_test.go | 12 ++--- runsc/fsgofer/fsgofer_unsafe.go | 71 +++++++++++++++++++++++++----- 5 files changed, 97 insertions(+), 92 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 3842fdf64..7cc666e10 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -124,9 +124,6 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) ats = append(ats, fsgofer.NewAttachPoint("/", fsgofer.Config{ ROMount: spec.Root.Readonly, PanicOnWrite: g.panicOnWrite, - // Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when - // each file is opened as writable. Thus, we open files lazily to avoid copy-up. 
- LazyOpenForWrite: true, })) log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly) @@ -134,9 +131,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, m := range spec.Mounts { if specutils.Is9PMount(m) { cfg := fsgofer.Config{ - ROMount: isReadonlyMount(m.Options), - PanicOnWrite: g.panicOnWrite, - LazyOpenForWrite: false, + ROMount: isReadonlyMount(m.Options), + PanicOnWrite: g.panicOnWrite, } ats = append(ats, fsgofer.NewAttachPoint(m.Destination, cfg)) diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index f28e4fa77..ab12388ab 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -17,6 +17,7 @@ go_library( "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/syserr", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index fd913831a..4412d7e2f 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -77,13 +77,6 @@ type Config struct { // PanicOnWrite panics on attempts to write to RO mounts. PanicOnWrite bool - - // LazyOpenForWrite makes the underlying file to be opened in RDONLY - // mode initially and be reopened in case write access is desired. - // This is done to workaround the behavior in 'overlay2' that - // copies the entire file up eagerly when it's opened in write mode - // even if the file is never actually written to. - LazyOpenForWrite bool } type attachPoint struct { @@ -182,9 +175,10 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // localFile implements p9.File wrapping a local file. The underlying file // is opened during Walk() and stored in 'controlFile' to be used with other -// operations. The mode in which the file is opened varies depending on the -// configuration (see below). 'controlFile' is dup'ed when Walk(nil) is called -// to clone the file. +// operations. The control file is opened as readonly, unless it's a symlink +// which requires O_PATH. 'controlFile' is dup'ed when Walk(nil) is called +// to clone the file. This reduces the number of walks that need to be done by +// the host file system when files are reused. // // 'openedFile' is assigned when Open() is called. If requested open mode is // a subset of controlFile's mode, it's possible to use the same file. If mode @@ -193,22 +187,10 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // operations. Before the file is opened and after it's closed, 'mode' is set to // an invalid value to prevent an unopened file from being used. // -// localFile has 2 modes of operation based on the configuration: -// -// ** conf.lazyRWOpen == false ** -// This is the preferred mode. 'controlFile' is opened in RW mode in Walk() -// and used across all functions. The file is never reopened as the mode will -// always be a super set of the requested open mode. This reduces the number of -// syscalls required per operation and makes it resilient to renames anywhere -// in the path to the file. -// -// ** conf.lazyRWOpen == true ** -// This mode is used for better performance with 'overlay2' storage driver. -// overlay2 eagerly copies the entire file up when it's opened in write mode -// which makes the mode above perform badly when serveral of files are opened -// for read (esp. startup). In this mode, 'controlFile' is opened as readonly -// (or O_PATH for symlinks). Reopening the file is required if write mode -// is requested in Open(). 
+// The reason that the control file is never opened as read-write is for better +// performance with 'overlay2' storage driver. overlay2 eagerly copies the +// entire file up when it's opened in write mode, and would perform badly when +// multiple files are being opened for read-only (esp. startup). type localFile struct { p9.DefaultWalkGetAttr @@ -238,23 +220,14 @@ type localFile struct { func openAnyFile(parent *localFile, name string) (*os.File, string, error) { // Attempt to open file in the following mode in order: - // 1. RDWR: for files with rw mounts and LazyOpenForWrite disabled - // 2. RDONLY: for directories, ro mounts or LazyOpenForWrite enabled - // 3. PATH: for symlinks - modes := []int{syscall.O_RDWR, syscall.O_RDONLY, unix.O_PATH} - symlinkIdx := len(modes) - 1 - - startIdx := 0 - conf := parent.attachPoint.conf - if conf.ROMount || conf.LazyOpenForWrite { - // Skip attempt to open in RDWR based on configuration. - startIdx = 1 - } + // 1. RDONLY: for all files, works for directories and ro mounts too + // 2. PATH: for symlinks + modes := []int{syscall.O_RDONLY, unix.O_PATH} var err error var fd int - for i := startIdx; i < len(modes); i++ { - fd, err = syscall.Openat(parent.controlFD(), name, openFlags|modes[i], 0) + for i, mode := range modes { + fd, err = syscall.Openat(parent.controlFD(), name, openFlags|mode, 0) if err == nil { // openat succeeded, we're done. break @@ -263,16 +236,10 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { case syscall.ENOENT: // File doesn't exist, no point in retrying. return nil, "", e - case syscall.ELOOP: - if i < symlinkIdx { - // File was opened with O_NOFOLLOW, so this error can only happen when - // trying ot open a symlink. Jump straight to flags compatible with symlink. - i = symlinkIdx - 1 - } } - // openat failed. Try again with next mode, preserving 'err' in - // case this was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|modes[i], parent.controlFile.Name(), name, err) + // openat failed. Try again with next mode, preserving 'err' in case this + // was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|mode, parent.controlFile.Name(), name, err) } if err != nil { // All attempts to open file have failed, return the last error. @@ -353,13 +320,13 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Check if control file can be used or if a new open must be created. var newFile *os.File - if mode == p9.ReadOnly || !l.attachPoint.conf.LazyOpenForWrite { + if mode == p9.ReadOnly { log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name()) newFile = l.controlFile } else { - // Ideally reopen would call name_to_handle_at (with empty name) and open_by_handle_at - // to reopen the file without using 'hostPath'. However, name_to_handle_at and - // open_by_handle_at aren't supported by overlay2. + // Ideally reopen would call name_to_handle_at (with empty name) and + // open_by_handle_at to reopen the file without using 'hostPath'. However, + // name_to_handle_at and open_by_handle_at aren't supported by overlay2. log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) var err error @@ -397,9 +364,10 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, syscall.EBADF } - // Use a single file for both 'controlFile' and 'openedFile'. 
Mode must include read for control - // and whichever else was requested by caller. Note that resulting file might have a wider mode - // than needed for each particular case. + // Use a single file for both 'controlFile' and 'openedFile'. Mode must + // include read for control and whichever else was requested by caller. Note + // that resulting file might have a wider mode than needed for each particular + // case. flags := openFlags | syscall.O_CREAT | syscall.O_EXCL if mode == p9.WriteOnly { flags |= syscall.O_RDWR @@ -622,9 +590,9 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } fd := l.controlFD() - if conf.LazyOpenForWrite && l.ft == regular { - // Regular files are opened in RO mode when lazy open is set. - // Thus it needs to be reopened here for write. + if l.ft == regular { + // Regular files are opened in RO mode, thus it needs to be reopened here + // for write. f, err := os.OpenFile(l.hostPath, openFlags|os.O_WRONLY, 0) if err != nil { return extractErrno(err) @@ -728,8 +696,6 @@ func (l *localFile) Rename(p9.File, string) error { } // RenameAt implements p9.File.RenameAt. -// -// TODO: change to renameat(2). func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error { conf := l.attachPoint.conf if conf.ROMount { @@ -740,9 +706,7 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) } newParent := directory.(*localFile) - oldPath := path.Join(l.hostPath, oldName) - newPath := path.Join(newParent.hostPath, newName) - if err := syscall.Rename(oldPath, newPath); err != nil { + if err := renameat(l.controlFD(), oldName, newParent.controlFD(), newName); err != nil { return extractErrno(err) } return nil @@ -863,7 +827,8 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { } // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's - // reading all directory contents. Take a lock because this operation is stateful. + // reading all directory contents. Take a lock because this operation is + // stateful. l.readDirMu.Lock() if _, err := l.openedFile.Seek(0, 0); err != nil { l.readDirMu.Unlock() @@ -944,7 +909,7 @@ func (l *localFile) Renamed(newDir p9.File, newName string) { func extractErrno(err error) syscall.Errno { if err == nil { // This should never happen. The likely result will be that - // some user gets the frustration "error: SUCCESS" message. + // some user gets the frustrating "error: SUCCESS" message. log.Warningf("extractErrno called with nil error!") return 0 } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 34033245b..f799b1e25 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -48,14 +48,8 @@ var ( // allConfs is set in init() above. 
allConfs []Config - rwConfs = []Config{ - {ROMount: false, LazyOpenForWrite: false}, - {ROMount: false, LazyOpenForWrite: true}, - } - roConfs = []Config{ - {ROMount: true, LazyOpenForWrite: false}, - {ROMount: true, LazyOpenForWrite: true}, - } + rwConfs = []Config{{ROMount: false}} + roConfs = []Config{{ROMount: true}} ) type state struct { @@ -66,7 +60,7 @@ type state struct { } func (s state) String() string { - return fmt.Sprintf("lazyopen(%v)-%v", s.conf.LazyOpenForWrite, s.ft) + return fmt.Sprintf("type(%v)", s.ft) } func runAll(t *testing.T, test func(*testing.T, state)) { diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index 99bc25ec1..94413db86 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -19,20 +19,29 @@ import ( "unsafe" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserr" ) func statAt(dirFd int, name string) (syscall.Stat_t, error) { nameBytes, err := syscall.BytePtrFromString(name) if err != nil { - return syscall.Stat_t{}, extractErrno(err) + return syscall.Stat_t{}, err } - namePtr := uintptr(unsafe.Pointer(nameBytes)) + namePtr := unsafe.Pointer(nameBytes) var stat syscall.Stat_t - statPtr := uintptr(unsafe.Pointer(&stat)) + statPtr := unsafe.Pointer(&stat) - if _, _, err := syscall.Syscall6(syscall.SYS_NEWFSTATAT, uintptr(dirFd), namePtr, statPtr, linux.AT_SYMLINK_NOFOLLOW, 0, 0); err != 0 { - return syscall.Stat_t{}, err + if _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(dirFd), + uintptr(namePtr), + uintptr(statPtr), + linux.AT_SYMLINK_NOFOLLOW, + 0, + 0); errno != 0 { + + return syscall.Stat_t{}, syserr.FromHost(errno).ToError() } return stat, nil } @@ -40,19 +49,59 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) { func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error { // utimensat(2) doesn't accept empty name, instead name must be nil to make it // operate directly on 'dirFd' unlike other *at syscalls. 
- var namePtr uintptr + var namePtr unsafe.Pointer if name != "" { nameBytes, err := syscall.BytePtrFromString(name) if err != nil { - return extractErrno(err) + return err + } + namePtr = unsafe.Pointer(nameBytes) + } + + timesPtr := unsafe.Pointer(×[0]) + + if _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(dirFd), + uintptr(namePtr), + uintptr(timesPtr), + uintptr(flags), + 0, + 0); errno != 0 { + + return syserr.FromHost(errno).ToError() + } + return nil +} + +func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error { + var oldNamePtr unsafe.Pointer + if oldName != "" { + nameBytes, err := syscall.BytePtrFromString(oldName) + if err != nil { + return err + } + oldNamePtr = unsafe.Pointer(nameBytes) + } + var newNamePtr unsafe.Pointer + if newName != "" { + nameBytes, err := syscall.BytePtrFromString(newName) + if err != nil { + return err } - namePtr = uintptr(unsafe.Pointer(nameBytes)) + newNamePtr = unsafe.Pointer(nameBytes) } - timesPtr := uintptr(unsafe.Pointer(×[0])) + if _, _, errno := syscall.Syscall6( + syscall.SYS_RENAMEAT, + uintptr(oldDirFD), + uintptr(oldNamePtr), + uintptr(newDirFD), + uintptr(newNamePtr), + 0, + 0); errno != 0 { - if _, _, err := syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(dirFd), namePtr, timesPtr, uintptr(flags), 0, 0); err != 0 { - return err + return syserr.FromHost(errno).ToError() } return nil } -- cgit v1.2.3 From a4cc93c7bf40679e62a2b0eaa2419a4a9536cc14 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Thu, 1 Nov 2018 10:35:04 -0700 Subject: Close http.Response.Body after Get request. From https://golang.org/pkg/net/http/#Get: "When err is nil, resp always contains a non-nil resp.Body. Caller should close resp.Body when done reading from it." PiperOrigin-RevId: 219658052 Change-Id: I556e88ac4f2c90cd36ab16cd3163d1a52afc32b7 --- runsc/test/testutil/testutil.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 1b5a02c0f..fd558e2d5 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -230,8 +230,12 @@ func Poll(cb func() error, timeout time.Duration) error { // WaitForHTTP tries GET requests on a port until the call succeeds or timeout. func WaitForHTTP(port int, timeout time.Duration) error { cb := func() error { - _, err := http.Get(fmt.Sprintf("http://localhost:%d/", port)) - return err + resp, err := http.Get(fmt.Sprintf("http://localhost:%d/", port)) + if err != nil { + return err + } + resp.Body.Close() + return nil } return Poll(cb, timeout) } -- cgit v1.2.3 From 9d69d85bc13d4f0956a39951b5cd6777f938cffd Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Thu, 1 Nov 2018 17:39:20 -0700 Subject: Make error messages a bit more user friendly. Updated error messages so that it doesn't print full Go struct representations when running a new container in a sandbox. For example, this occurs frequently when commands are not found when doing a 'kubectl exec'. 
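The change below boils down to giving ExecArgs a String() method that renders the command line a user would recognize instead of dumping the whole struct. A reduced sketch of the pattern, outside the patch itself (the type here is a simplified stand-in, not the real ExecArgs):

package control

import "strings"

// execArgs is a reduced stand-in for the real ExecArgs type.
type execArgs struct {
	Argv     []string
	Filename string
}

// String renders the arguments as a command line, e.g. "/bin/sh -c ls",
// substituting Filename for argv[0] when it is set.
func (a execArgs) String() string {
	out := make([]string, len(a.Argv))
	copy(out, a.Argv)
	if a.Filename != "" {
		out[0] = a.Filename
	}
	return strings.Join(out, " ")
}
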
PiperOrigin-RevId: 219729141 Change-Id: Ic3a7bc84cd7b2167f495d48a1da241d621d3ca09 --- pkg/sentry/control/proc.go | 11 +++++++++++ runsc/boot/loader.go | 4 ++-- runsc/sandbox/sandbox.go | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index b6ac2f312..923399fb2 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -19,6 +19,7 @@ import ( "encoding/json" "fmt" "sort" + "strings" "text/tabwriter" "time" @@ -88,6 +89,16 @@ type ExecArgs struct { ContainerID string } +// String prints the arguments as a string. +func (args ExecArgs) String() string { + a := make([]string, len(args.Argv)) + copy(a, args.Argv) + if args.Filename != "" { + a[0] = args.Filename + } + return strings.Join(a, " ") +} + // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { newTG, _, _, err := proc.execAsync(args) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index abb347835..380fa3fbf 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -618,7 +618,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { ep, ok := l.processes[rootKey] l.mu.Unlock() if !ok { - return 0, fmt.Errorf("cannot exec in container %q: no such container", args.ContainerID) + return 0, fmt.Errorf("no such container: %q", args.ContainerID) } ep.tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() @@ -631,7 +631,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { proc := control.Proc{Kernel: l.k} tg, tgid, ttyFile, err := control.ExecAsync(&proc, args) if err != nil { - return 0, fmt.Errorf("error executing: %+v: %v", args, err) + return 0, err } // Insert the process into processes so that we can wait on it diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index df235c5e9..9421bd63e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -229,7 +229,7 @@ func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { // Send a message to the sandbox control server to start the container. var pid int32 if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil { - return 0, fmt.Errorf("error executing in sandbox: %v", err) + return 0, fmt.Errorf("error executing command %q in sandbox: %v", args, err) } return pid, nil } -- cgit v1.2.3 From b6b81fd04ba93db3268ff649c9d23a25c9b89db5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 1 Nov 2018 17:43:50 -0700 Subject: Add new log format that is compatible with Kubernetes Fluentd configuration uses 'log' for the log message while containerd uses 'msg'. Since we can't have a single JSON format for both, add another log format and make debug log configurable. 
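Put differently, the new "json-k8s" format keeps JSON records but stores the message under a "log" key so the Kubernetes/Fluentd pipeline can parse it. A small standalone sketch of the record shape, separate from the patch (the level is shown as a plain string here; the real emitter uses the log.Level type, as the diff shows):

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// k8sLogLine mirrors the field names Fluentd expects: the message lives
// under "log" rather than "msg".
type k8sLogLine struct {
	Log   string    `json:"log"`
	Level string    `json:"level"`
	Time  time.Time `json:"time"`
}

func main() {
	b, err := json.Marshal(k8sLogLine{Log: "sandbox started", Level: "info", Time: time.Now()})
	if err != nil {
		panic(err)
	}
	// Prints something like:
	// {"log":"sandbox started","level":"info","time":"2018-11-01T17:43:50Z"}
	fmt.Println(string(b))
}
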
PiperOrigin-RevId: 219729658 Change-Id: I2a6afc4034d893ab90bafc63b394c4fb62b2a7a0 --- pkg/log/BUILD | 1 + pkg/log/json_k8s.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/compat.go | 2 +- runsc/boot/config.go | 6 +++++- runsc/main.go | 39 +++++++++++++++++++++++---------------- 5 files changed, 77 insertions(+), 18 deletions(-) create mode 100644 pkg/log/json_k8s.go (limited to 'runsc') diff --git a/pkg/log/BUILD b/pkg/log/BUILD index bf85b4494..94ac66db3 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -8,6 +8,7 @@ go_library( "glog.go", "glog_unsafe.go", "json.go", + "json_k8s.go", "log.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/log", diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go new file mode 100644 index 000000000..9c2f8d2b7 --- /dev/null +++ b/pkg/log/json_k8s.go @@ -0,0 +1,47 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package log + +import ( + "encoding/json" + "fmt" + "time" +) + +type k8sJSONLog struct { + Log string `json:"log"` + Level Level `json:"level"` + Time time.Time `json:"time"` +} + +// K8sJSONEmitter logs messages in json format that is compatible with +// Kubernetes fluent configuration. +type K8sJSONEmitter struct { + Writer +} + +// Emit implements Emitter.Emit. +func (e K8sJSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) { + j := k8sJSONLog{ + Log: fmt.Sprintf(format, v...), + Level: level, + Time: timestamp, + } + b, err := json.Marshal(j) + if err != nil { + panic(err) + } + e.Writer.Write(b) +} diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index d18c2f802..4c49e90e3 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -66,7 +66,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) { if logFD > 0 { f := os.NewFile(uintptr(logFD), "user log file") - target := log.MultiEmitter{c.sink, log.GoogleEmitter{&log.Writer{Next: f}}} + target := log.MultiEmitter{c.sink, log.K8sJSONEmitter{log.Writer{Next: f}}} c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} } return c, nil diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 9ebbde424..2d89ad87e 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -157,12 +157,15 @@ type Config struct { // LogFilename is the filename to log to, if not empty. LogFilename string - // LogFormat is the log format, "text" or "json". + // LogFormat is the log format. LogFormat string // DebugLog is the path to log debug information to, if not empty. DebugLog string + // DebugLogFormat is the log format for debug. + DebugLogFormat string + // FileAccess indicates how the filesystem is accessed. 
FileAccess FileAccessType @@ -214,6 +217,7 @@ func (c *Config) ToFlags() []string { "--log=" + c.LogFilename, "--log-format=" + c.LogFormat, "--debug-log=" + c.DebugLog, + "--debug-log-format=" + c.DebugLogFormat, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), "--network=" + c.Network.String(), diff --git a/runsc/main.go b/runsc/main.go index 4a92db7c0..c0ee04216 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -38,17 +38,18 @@ var ( // Docker, and thus should not be changed. rootDir = flag.String("root", "", "root directory for storage of container state") logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout") - logFormat = flag.String("log-format", "text", "log format: text (default) or json") + logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s") debug = flag.Bool("debug", false, "enable debug logging") // These flags are unique to runsc, and are used to configure parts of the // system that are not covered by the runtime spec. // Debugging flags. - debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") - logPackets = flag.Bool("log-packets", false, "enable network packet logging") - logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") - debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") + debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") + logPackets = flag.Bool("log-packets", false, "enable network packet logging") + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. 
If set, the 'debug-log-dir' flag is ignored.") + debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s") // Debugging flags: strace related strace = flag.Bool("strace", false, "enable strace") @@ -133,6 +134,7 @@ func main() { LogFilename: *logFilename, LogFormat: *logFormat, DebugLog: *debugLog, + DebugLogFormat: *debugLogFormat, FileAccess: fsAccess, Overlay: *overlay, Network: netType, @@ -166,15 +168,7 @@ func main() { logFile = f } - var e log.Emitter - switch *logFormat { - case "text": - e = log.GoogleEmitter{&log.Writer{Next: logFile}} - case "json": - e = log.JSONEmitter{log.Writer{Next: logFile}} - default: - cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat) - } + e := newEmitter(*logFormat, logFile) subcommand := flag.CommandLine.Arg(0) if *debugLogFD > -1 { @@ -195,13 +189,13 @@ func main() { cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) } - e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} + e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} } else if *debugLog != "" { f, err := specutils.DebugLogFile(*debugLog, subcommand) if err != nil { cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err) } - e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} + e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} } log.SetTarget(e) @@ -236,6 +230,19 @@ func main() { os.Exit(128) } +func newEmitter(format string, logFile io.Writer) log.Emitter { + switch format { + case "text": + return &log.GoogleEmitter{&log.Writer{Next: logFile}} + case "json": + return &log.JSONEmitter{log.Writer{Next: logFile}} + case "json-k8s": + return &log.K8sJSONEmitter{log.Writer{Next: logFile}} + } + cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format) + panic("unreachable") +} + func init() { // Set default root dir to something (hopefully) user-writeable. *rootDir = "/var/run/runsc" -- cgit v1.2.3 From 5cd55cd90fd5a32685807a57617cde6f5f76d22b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 1 Nov 2018 17:51:22 -0700 Subject: Use spec with clean paths for gofer Otherwise the gofer's attach point may be different from sandbox when there symlinks in the path. PiperOrigin-RevId: 219730492 Change-Id: Ia9c4c2d16228c6a1a9e790e0cb673fd881003fe1 --- runsc/boot/loader_test.go | 5 ++++- runsc/cmd/gofer.go | 21 +++++++++++++++++---- runsc/container/container.go | 3 +++ runsc/fsgofer/fsgofer.go | 24 ++++++++++++------------ runsc/fsgofer/fsgofer_test.go | 20 ++++++++++++++++---- 5 files changed, 52 insertions(+), 21 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index c342ee005..d5cee5608 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -77,8 +77,11 @@ func startGofer(root string) (int, func(), error) { syscall.Close(goferEnd) return 0, nil, fmt.Errorf("error creating server on FD %d: %v", goferEnd, err) } + at, err := fsgofer.NewAttachPoint(root, fsgofer.Config{ROMount: true}) + if err != nil { + return 0, nil, err + } go func() { - at := fsgofer.NewAttachPoint(root, fsgofer.Config{ROMount: true}) s := p9.NewServer(at) if err := s.Handle(socket); err != nil { log.Infof("Gofer is stopping. 
FD: %d, err: %v\n", goferEnd, err) diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 7cc666e10..4ec3dba9c 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -99,7 +99,12 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("unreachable") } - spec, err := specutils.ReadSpec(g.bundleDir) + specFile, err := specutils.OpenCleanSpec(g.bundleDir) + if err != nil { + Fatalf("error opening spec: %v", err) + } + spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile) + specFile.Close() if err != nil { Fatalf("error reading spec: %v", err) } @@ -121,10 +126,14 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Start with root mount, then add any other additional mount as needed. ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) - ats = append(ats, fsgofer.NewAttachPoint("/", fsgofer.Config{ + ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{ ROMount: spec.Root.Readonly, PanicOnWrite: g.panicOnWrite, - })) + }) + if err != nil { + Fatalf("Error creating attach point: %v", err) + } + ats = append(ats, ap) log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly) mountIdx := 1 // first one is the root @@ -134,7 +143,11 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) ROMount: isReadonlyMount(m.Options), PanicOnWrite: g.panicOnWrite, } - ats = append(ats, fsgofer.NewAttachPoint(m.Destination, cfg)) + ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) + if err != nil { + Fatalf("Error creating attach point: %v", err) + } + ats = append(ats, ap) if mountIdx >= len(g.ioFDs) { Fatalf("No FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) diff --git a/runsc/container/container.go b/runsc/container/container.go index 9da25a863..4c542ccb9 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -374,6 +374,9 @@ func (c *Container) Start(conf *boot.Config) error { return fmt.Errorf("setup mounts: %v", err) } c.Spec.Mounts = cleanMounts + if err := specutils.WriteCleanSpec(c.BundleDir, c.Spec); err != nil { + return fmt.Errorf("writing clean spec: %v", err) + } // Create the gofer process. ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 4412d7e2f..b5746447f 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -26,6 +26,7 @@ import ( "math" "os" "path" + "path/filepath" "sync" "syscall" @@ -99,24 +100,28 @@ type attachPoint struct { } // NewAttachPoint creates a new attacher that gives local file -// access to all files under 'prefix'. -func NewAttachPoint(prefix string, c Config) p9.Attacher { +// access to all files under 'prefix'. 'prefix' must be an absolute path. +func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) { + // Sanity check the prefix. + if !filepath.IsAbs(prefix) { + return nil, fmt.Errorf("attach point prefix must be absolute %q", prefix) + } return &attachPoint{ prefix: prefix, conf: c, devices: make(map[uint64]uint8), - } + }, nil } // Attach implements p9.Attacher. func (a *attachPoint) Attach() (p9.File, error) { - // Sanity check the prefix. - fi, err := os.Stat(a.prefix) + // dirFD (1st argument) is ignored because 'prefix' is always absolute. 
+ stat, err := statAt(-1, a.prefix) if err != nil { - return nil, err + return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) } mode := os.O_RDWR - if a.conf.ROMount || fi.IsDir() { + if a.conf.ROMount || stat.Mode&syscall.S_IFDIR != 0 { mode = os.O_RDONLY } @@ -125,11 +130,6 @@ func (a *attachPoint) Attach() (p9.File, error) { if err != nil { return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) } - stat, err := stat(int(f.Fd())) - if err != nil { - f.Close() - return nil, fmt.Errorf("failed to stat file %q, err: %v", a.prefix, err) - } a.attachedMu.Lock() defer a.attachedMu.Unlock() diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index f799b1e25..47b5380dc 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -80,7 +80,10 @@ func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testin } defer os.RemoveAll(path) - a := NewAttachPoint(path, c) + a, err := NewAttachPoint(path, c) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } root, err := a.Attach() if err != nil { t.Fatalf("Attach failed, err: %v", err) @@ -107,7 +110,10 @@ func setup(ft fileType) (string, string, error) { } // First attach with writable configuration to setup tree. - a := NewAttachPoint(path, Config{}) + a, err := NewAttachPoint(path, Config{}) + if err != nil { + return "", "", err + } root, err := a.Attach() if err != nil { return "", "", fmt.Errorf("Attach failed, err: %v", err) @@ -556,7 +562,10 @@ func TestAttachFile(t *testing.T) { t.Fatalf("os.Create(%q) failed, err: %v", path, err) } - a := NewAttachPoint(path, conf) + a, err := NewAttachPoint(path, conf) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } root, err := a.Attach() if err != nil { t.Fatalf("Attach failed, err: %v", err) @@ -595,7 +604,10 @@ func TestDoubleAttachError(t *testing.T) { t.Fatalf("ioutil.TempDir() failed, err: %v", err) } defer os.RemoveAll(root) - a := NewAttachPoint(root, conf) + a, err := NewAttachPoint(root, conf) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } if _, err := a.Attach(); err != nil { t.Fatalf("Attach failed: %v", err) -- cgit v1.2.3 From 704b56a40d0a041a4e6f814c3dbb1f9ec15f9002 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Thu, 1 Nov 2018 18:28:12 -0700 Subject: First crictl integration tests. More tests will come, but it's worth getting what's done so far reviewed. 
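For context, the helpers added below are meant to compose roughly like this in a test
(names follow the new testutil and testdata packages and the test-local setup helper in
crictl_test.go; error handling and cleanup are trimmed for brevity):

    // Start containerd and get a configured crictl wrapper.
    crictl, cleanup, err := setup(t)
    if err != nil {
        t.Fatalf("failed to setup crictl: %v", err)
    }
    defer cleanup()

    // Pull the image, create a sandbox, and start a container in it.
    podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.Httpd)
    if err != nil {
        t.Fatal(err)
    }
    defer crictl.StopPodAndContainer(podID, contID)

    // The pod IP can then be used to poll the server inside the sandbox.
    ip, err := crictl.PodIP(podID)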
PiperOrigin-RevId: 219734531 Change-Id: If15ca6e6855e3d1cc28c83b5f9c3a72cb65b2e59 --- kokoro/run_tests.sh | 9 +- runsc/test/root/BUILD | 2 + runsc/test/root/chroot_test.go | 2 +- runsc/test/root/crictl_test.go | 201 ++++++++++++++++++++++ runsc/test/root/testdata/BUILD | 17 ++ runsc/test/root/testdata/containerd_config.go | 39 +++++ runsc/test/root/testdata/httpd.go | 32 ++++ runsc/test/root/testdata/httpd_mount_paths.go | 53 ++++++ runsc/test/root/testdata/sandbox.go | 30 ++++ runsc/test/testutil/BUILD | 1 + runsc/test/testutil/crictl.go | 229 ++++++++++++++++++++++++++ runsc/test/testutil/testutil.go | 35 +++- 12 files changed, 643 insertions(+), 7 deletions(-) create mode 100644 runsc/test/root/crictl_test.go create mode 100644 runsc/test/root/testdata/BUILD create mode 100644 runsc/test/root/testdata/containerd_config.go create mode 100644 runsc/test/root/testdata/httpd.go create mode 100644 runsc/test/root/testdata/httpd_mount_paths.go create mode 100644 runsc/test/root/testdata/sandbox.go create mode 100644 runsc/test/testutil/crictl.go (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index b4952cedd..bfdb3fe09 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -80,7 +80,7 @@ installCrictl() ( chmod +x ${shim_path} sudo -n -E mv ${shim_path} /usr/local/bin - # Configure containerd. + # Configure containerd-shim. local shim_config_path=/etc/containerd local shim_config_tmp_path=/tmp/gvisor-containerd-shim.toml sudo -n -E mkdir -p ${shim_config_path} @@ -89,11 +89,14 @@ installCrictl() ( [runsc_config] debug = "true" - debug-log = "/tmp/runsc-log/" + debug-log = "/tmp/runsc-logs/" strace = "true" file-access = "shared" EOF sudo mv ${shim_config_tmp_path} ${shim_config_path} + + # Configure CNI. + sudo -n -E env PATH=${PATH} ${GOPATH}/src/github.com/containerd/containerd/script/setup/install-cni ) # Install containerd and crictl. @@ -128,7 +131,7 @@ if [[ ${exit_code} -eq 0 ]]; then echo "root_test executable not found" exit 1 fi - sudo -n -E RUNSC_RUNTIME=${runtime} ${root_test} + sudo -n -E RUNSC_RUNTIME=${runtime} RUNSC_EXEC=/tmp/${runtime}/runsc ${root_test} exit_code=${?} fi diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD index c2567ef23..77dcbd79e 100644 --- a/runsc/test/root/BUILD +++ b/runsc/test/root/BUILD @@ -14,6 +14,7 @@ go_test( srcs = [ "cgroup_test.go", "chroot_test.go", + "crictl_test.go", ], embed = [":root"], tags = [ @@ -24,6 +25,7 @@ go_test( ], deps = [ "//runsc/specutils", + "//runsc/test/root/testdata", "//runsc/test/testutil", "@com_github_syndtr_gocapability//capability:go_default_library", ], diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 0ffaaf87b..9f705c860 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -13,7 +13,7 @@ // limitations under the License. // Package root is used for tests that requires sysadmin privileges run. First, -// follow the setup instruction in runsc/test/README.md. To run these test: +// follow the setup instruction in runsc/test/README.md. 
To run these tests: // // bazel build //runsc/test/root:root_test // root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__) diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go new file mode 100644 index 000000000..88e24782a --- /dev/null +++ b/runsc/test/root/crictl_test.go @@ -0,0 +1,201 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package root + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "log" + "net/http" + "os" + "os/exec" + "path" + "path/filepath" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.googlesource.com/gvisor/runsc/test/root/testdata" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// Tests for crictl have to be run as root (rather than in a user namespace) +// because crictl creates named network namespaces in /var/run/netns/. +func TestCrictlSanity(t *testing.T) { + // Setup containerd and crictl. + crictl, cleanup, err := setup(t) + if err != nil { + t.Fatalf("failed to setup crictl: %v", err) + } + defer cleanup() + podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.Httpd) + if err != nil { + t.Fatal(err) + } + + // Look for the httpd page. + if err = httpGet(crictl, podID, "index.html"); err != nil { + t.Fatalf("failed to get page: %v", err) + } + + // Stop everything. + if err := crictl.StopPodAndContainer(podID, contID); err != nil { + t.Fatal(err) + } +} +func TestMountPaths(t *testing.T) { + // Setup containerd and crictl. + crictl, cleanup, err := setup(t) + if err != nil { + t.Fatalf("failed to setup crictl: %v", err) + } + defer cleanup() + podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.HttpdMountPaths) + if err != nil { + t.Fatal(err) + } + + // Look for the directory available at /test. + if err = httpGet(crictl, podID, "test"); err != nil { + t.Fatalf("failed to get page: %v", err) + } + + // Stop everything. + if err := crictl.StopPodAndContainer(podID, contID); err != nil { + t.Fatal(err) + } +} + +// setup sets up before a test. Specifically it: +// * Creates directories and a socket for containerd to utilize. +// * Runs containerd and waits for it to reach a "ready" state for testing. +// * Returns a cleanup function that should be called at the end of the test. +func setup(t *testing.T) (*testutil.Crictl, func(), error) { + var cleanups []func() + cleanupFunc := func() { + for i := len(cleanups) - 1; i >= 0; i-- { + cleanups[i]() + } + } + cleanup := specutils.MakeCleanup(cleanupFunc) + defer cleanup.Clean() + + // Create temporary containerd root and state directories, and a socket + // via which crictl and containerd communicate. 
+ containerdRoot, err := ioutil.TempDir(testutil.TmpDir(), "containerd-root") + if err != nil { + t.Fatalf("failed to create containerd root: %v", err) + } + cleanups = append(cleanups, func() { os.RemoveAll(containerdRoot) }) + containerdState, err := ioutil.TempDir(testutil.TmpDir(), "containerd-state") + if err != nil { + t.Fatalf("failed to create containerd state: %v", err) + } + cleanups = append(cleanups, func() { os.RemoveAll(containerdState) }) + sockAddr := filepath.Join(testutil.TmpDir(), "containerd-test.sock") + + // Start containerd. + config, err := testutil.WriteTmpFile("containerd-config", testdata.ContainerdConfig(getRunsc())) + if err != nil { + t.Fatalf("failed to write containerd config") + } + cleanups = append(cleanups, func() { os.RemoveAll(config) }) + containerd := exec.Command(getContainerd(), + "--config", config, + "--log-level", "debug", + "--root", containerdRoot, + "--state", containerdState, + "--address", sockAddr) + cleanups = append(cleanups, func() { + if err := testutil.KillCommand(containerd); err != nil { + log.Printf("error killing containerd: %v", err) + } + }) + containerdStderr, err := containerd.StderrPipe() + if err != nil { + t.Fatalf("failed to get containerd stderr: %v", err) + } + containerdStdout, err := containerd.StdoutPipe() + if err != nil { + t.Fatalf("failed to get containerd stdout: %v", err) + } + if err := containerd.Start(); err != nil { + t.Fatalf("failed running containerd: %v", err) + } + + // Wait for containerd to boot. Then put all containerd output into a + // buffer to be logged at the end of the test. + testutil.WaitUntilRead(containerdStderr, "Start streaming server", nil, 10*time.Second) + stdoutBuf := &bytes.Buffer{} + stderrBuf := &bytes.Buffer{} + go func() { io.Copy(stdoutBuf, containerdStdout) }() + go func() { io.Copy(stderrBuf, containerdStderr) }() + cleanups = append(cleanups, func() { + t.Logf("containerd stdout: %s", string(stdoutBuf.Bytes())) + t.Logf("containerd stderr: %s", string(stderrBuf.Bytes())) + }) + + cleanup.Release() + return testutil.NewCrictl(20*time.Second, sockAddr), cleanupFunc, nil +} + +// httpGet GETs the contents of a file served from a pod on port 80. +func httpGet(crictl *testutil.Crictl, podID, filePath string) error { + // Get the IP of the httpd server. + ip, err := crictl.PodIP(podID) + if err != nil { + return fmt.Errorf("failed to get IP from pod %q: %v", podID, err) + } + + // GET the page. We may be waiting for the server to start, so retry + // with a timeout. + var resp *http.Response + cb := func() error { + r, err := http.Get(fmt.Sprintf("http://%s", path.Join(ip, filePath))) + resp = r + return err + } + if err := testutil.Poll(cb, 20*time.Second); err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + return fmt.Errorf("bad status returned: %d", resp.StatusCode) + } + return nil +} + +func getContainerd() string { + // Bazel doesn't pass PATH through, assume the location of containerd + // unless specified by environment variable. + c := os.Getenv("CONTAINERD_PATH") + if c == "" { + return "/usr/local/bin/containerd" + } + return c +} + +func getRunsc() string { + // Bazel doesn't pass PATH through, assume the location of runsc unless + // specified by environment variable. 
+ c := os.Getenv("RUNSC_EXEC") + if c == "" { + return "/tmp/runsc-test/runsc" + } + return c +} diff --git a/runsc/test/root/testdata/BUILD b/runsc/test/root/testdata/BUILD new file mode 100644 index 000000000..a22635129 --- /dev/null +++ b/runsc/test/root/testdata/BUILD @@ -0,0 +1,17 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +package(licenses = ["notice"]) # Apache 2.0 + +go_library( + name = "testdata", + srcs = [ + "containerd_config.go", + "httpd.go", + "httpd_mount_paths.go", + "sandbox.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/test/root/testdata", + visibility = [ + "//visibility:public", + ], +) diff --git a/runsc/test/root/testdata/containerd_config.go b/runsc/test/root/testdata/containerd_config.go new file mode 100644 index 000000000..949354987 --- /dev/null +++ b/runsc/test/root/testdata/containerd_config.go @@ -0,0 +1,39 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testdata contains data required for root tests. +package testdata + +import "fmt" + +// containerdConfigTemplate is a .toml config for containerd. It contains a +// formatting verb so the runtime field can be set via fmt.Sprintf. +const containerdConfigTemplate = ` +disabled_plugins = ["restart"] +[plugins.linux] + runtime = "%s" + runtime_root = "/tmp/test-containerd/runsc" + shim = "/usr/local/bin/gvisor-containerd-shim" + shim_debug = true + +[plugins.cri.containerd.runtimes.runsc] + runtime_type = "io.containerd.runtime.v1.linux" + runtime_engine = "%s" +` + +// ContainerdConfig returns a containerd config file with the specified +// runtime. +func ContainerdConfig(runtime string) string { + return fmt.Sprintf(containerdConfigTemplate, runtime, runtime) +} diff --git a/runsc/test/root/testdata/httpd.go b/runsc/test/root/testdata/httpd.go new file mode 100644 index 000000000..f65b1da5d --- /dev/null +++ b/runsc/test/root/testdata/httpd.go @@ -0,0 +1,32 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testdata + +// Httpd is a JSON config for an httpd container. 
+const Httpd = ` +{ + "metadata": { + "name": "httpd" + }, + "image":{ + "image": "httpd" + }, + "mounts": [ + ], + "linux": { + }, + "log_path": "httpd.log" +} +` diff --git a/runsc/test/root/testdata/httpd_mount_paths.go b/runsc/test/root/testdata/httpd_mount_paths.go new file mode 100644 index 000000000..5ca14340e --- /dev/null +++ b/runsc/test/root/testdata/httpd_mount_paths.go @@ -0,0 +1,53 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testdata + +// HttpdMountPaths is a JSON config for an httpd container with additional +// mounts. +const HttpdMountPaths = ` +{ + "metadata": { + "name": "httpd" + }, + "image":{ + "image": "httpd" + }, + "mounts": [ + { + "container_path": "/var/run/secrets/kubernetes.io/serviceaccount", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx", + "readonly": true + }, + { + "container_path": "/etc/hosts", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts", + "readonly": false + }, + { + "container_path": "/dev/termination-log", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580", + "readonly": false + }, + { + "container_path": "/usr/local/apache2/htdocs/test", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064", + "readonly": true + } + ], + "linux": { + }, + "log_path": "httpd.log" +} +` diff --git a/runsc/test/root/testdata/sandbox.go b/runsc/test/root/testdata/sandbox.go new file mode 100644 index 000000000..194242a27 --- /dev/null +++ b/runsc/test/root/testdata/sandbox.go @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testdata + +// Sandbox is a default JSON config for a sandbox. 
+const Sandbox = ` +{ + "metadata": { + "name": "default-sandbox", + "namespace": "default", + "attempt": 1, + "uid": "hdishd83djaidwnduwk28bcsb" + }, + "linux": { + }, + "log_directory": "/tmp" +} +` diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 128bd80fb..3ed235393 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) # Apache 2.0 go_library( name = "testutil", srcs = [ + "crictl.go", "docker.go", "testutil.go", "testutil_race.go", diff --git a/runsc/test/testutil/crictl.go b/runsc/test/testutil/crictl.go new file mode 100644 index 000000000..9740ea6b5 --- /dev/null +++ b/runsc/test/testutil/crictl.go @@ -0,0 +1,229 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "strings" + "time" +) + +const endpointPrefix = "unix://" + +// Crictl contains information required to run the crictl utility. +type Crictl struct { + executable string + timeout time.Duration + imageEndpoint string + runtimeEndpoint string +} + +// NewCrictl returns a Crictl configured with a timeout and an endpoint over +// which it will talk to containerd. +func NewCrictl(timeout time.Duration, endpoint string) *Crictl { + // Bazel doesn't pass PATH through, assume the location of crictl + // unless specified by environment variable. + executable := os.Getenv("CRICTL_PATH") + if executable == "" { + executable = "/usr/local/bin/crictl" + } + return &Crictl{ + executable: executable, + timeout: timeout, + imageEndpoint: endpointPrefix + endpoint, + runtimeEndpoint: endpointPrefix + endpoint, + } +} + +// Pull pulls an container image. It corresponds to `crictl pull`. +func (cc *Crictl) Pull(imageName string) error { + _, err := cc.run("pull", imageName) + return err +} + +// RunPod creates a sandbox. It corresponds to `crictl runp`. +func (cc *Crictl) RunPod(sbSpecFile string) (string, error) { + podID, err := cc.run("runp", sbSpecFile) + if err != nil { + return "", fmt.Errorf("runp failed: %v", err) + } + // Strip the trailing newline from crictl output. + return strings.TrimSpace(podID), nil +} + +// Create creates a container within a sandbox. It corresponds to `crictl +// create`. +func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) { + podID, err := cc.run("create", podID, contSpecFile, sbSpecFile) + if err != nil { + return "", fmt.Errorf("create failed: %v", err) + } + // Strip the trailing newline from crictl output. + return strings.TrimSpace(podID), nil +} + +// Start starts a container. It corresponds to `crictl start`. +func (cc *Crictl) Start(contID string) (string, error) { + output, err := cc.run("start", contID) + if err != nil { + return "", fmt.Errorf("start failed: %v", err) + } + return output, nil +} + +// Stop stops a container. It corresponds to `crictl stop`. 
+func (cc *Crictl) Stop(contID string) error { + _, err := cc.run("stop", contID) + return err +} + +// Rm removes a container. It corresponds to `crictl rm`. +func (cc *Crictl) Rm(contID string) error { + _, err := cc.run("rm", contID) + return err +} + +// StopPod stops a pod. It corresponds to `crictl stopp`. +func (cc *Crictl) StopPod(podID string) error { + _, err := cc.run("stopp", podID) + return err +} + +// containsConfig is a minimal copy of +// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto +// It only contains fields needed for testing. +type containerConfig struct { + Status containerStatus +} + +type containerStatus struct { + Network containerNetwork +} + +type containerNetwork struct { + IP string +} + +// PodIP returns a pod's IP address. +func (cc *Crictl) PodIP(podID string) (string, error) { + output, err := cc.run("inspectp", podID) + if err != nil { + return "", err + } + conf := &containerConfig{} + if err := json.Unmarshal([]byte(output), conf); err != nil { + return "", fmt.Errorf("failed to unmarshal JSON: %v, %s", err, output) + } + if conf.Status.Network.IP == "" { + return "", fmt.Errorf("no IP found in config: %s", output) + } + return conf.Status.Network.IP, nil +} + +// RmPod removes a container. It corresponds to `crictl rmp`. +func (cc *Crictl) RmPod(podID string) error { + _, err := cc.run("rmp", podID) + return err +} + +// StartPodAndContainer pulls an image, then starts a sandbox and container in +// that sandbox. It returns the pod ID and container ID. +func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) { + if err := cc.Pull(image); err != nil { + return "", "", fmt.Errorf("failed to pull %s: %v", image, err) + } + + // Write the specs to files that can be read by crictl. + sbSpecFile, err := WriteTmpFile("sbSpec", sbSpec) + if err != nil { + return "", "", fmt.Errorf("failed to write sandbox spec: %v", err) + } + contSpecFile, err := WriteTmpFile("contSpec", contSpec) + if err != nil { + return "", "", fmt.Errorf("failed to write container spec: %v", err) + } + + podID, err := cc.RunPod(sbSpecFile) + if err != nil { + return "", "", err + } + + contID, err := cc.Create(podID, contSpecFile, sbSpecFile) + if err != nil { + return "", "", fmt.Errorf("failed to create container in pod %q: %v", podID, err) + } + + if _, err := cc.Start(contID); err != nil { + return "", "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err) + } + + return podID, contID, nil +} + +// StopPodAndContainer stops a container and pod. +func (cc *Crictl) StopPodAndContainer(podID, contID string) error { + if err := cc.Stop(contID); err != nil { + return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err) + } + + if err := cc.Rm(contID); err != nil { + return fmt.Errorf("failed to remove container %q in pod %q: %v", contID, podID, err) + } + + if err := cc.StopPod(podID); err != nil { + return fmt.Errorf("failed to stop pod %q: %v", podID, err) + } + + if err := cc.RmPod(podID); err != nil { + return fmt.Errorf("failed to remove pod %q: %v", podID, err) + } + + return nil +} + +// run runs crictl with the given args and returns an error if it takes longer +// than cc.Timeout to run. +func (cc *Crictl) run(args ...string) (string, error) { + defaultArgs := []string{ + "--image-endpoint", cc.imageEndpoint, + "--runtime-endpoint", cc.runtimeEndpoint, + } + cmd := exec.Command(cc.executable, append(defaultArgs, args...)...) 
+ + // Run the command with a timeout. + done := make(chan string) + errCh := make(chan error) + go func() { + output, err := cmd.CombinedOutput() + if err != nil { + errCh <- fmt.Errorf("error: \"%v\", output: %s", err, string(output)) + } + done <- string(output) + }() + select { + case output := <-done: + return output, nil + case err := <-errCh: + return "", err + case <-time.After(cc.timeout): + if err := KillCommand(cmd); err != nil { + return "", fmt.Errorf("timed out, then couldn't kill process %+v: %v", cmd, err) + } + return "", fmt.Errorf("timed out: %+v", cmd) + } +} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index fd558e2d5..59dc55887 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -72,7 +72,7 @@ func FindFile(path string) (string, error) { } // The test root is demarcated by a path element called "__main__". Search for - // it backwards from the in the working directory. + // it backwards from the working directory. root := wd for { dir, name := filepath.Split(root) @@ -242,7 +242,7 @@ func WaitForHTTP(port int, timeout time.Duration) error { // RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If // needed it will create a new user namespace and re-execute the test as root -// inside of the namespace. This functionr returns when it's running as root. If +// inside of the namespace. This function returns when it's running as root. If // it needs to create another process, it will exit from there and not return. func RunAsRoot() { if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { @@ -288,7 +288,7 @@ func RunAsRoot() { os.Exit(0) } -// StartReaper starts a gorouting that will reap all children processes created +// StartReaper starts a goroutine that will reap all children processes created // by the tests. Caller must call the returned function to stop it. func StartReaper() func() { ch := make(chan os.Signal, 1) @@ -356,3 +356,32 @@ func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time return nil } } + +// KillCommand kills the process running cmd unless it hasn't been started. It +// returns an error if it cannot kill the process unless the reason is that the +// process has already exited. +func KillCommand(cmd *exec.Cmd) error { + if cmd.Process == nil { + return nil + } + if err := cmd.Process.Kill(); err != nil { + if !strings.Contains(err.Error(), "process already finished") { + return fmt.Errorf("failed to kill process %v: %v", cmd, err) + } + } + return nil +} + +// WriteTmpFile writes text to a temporary file, closes the file, and returns +// the name of the file. 
+func WriteTmpFile(pattern, text string) (string, error) { + file, err := ioutil.TempFile(TmpDir(), pattern) + if err != nil { + return "", err + } + defer file.Close() + if _, err := file.Write([]byte(text)); err != nil { + return "", err + } + return file.Name(), nil +} -- cgit v1.2.3 From a467f092616122f1f718df2a375ba66e97997594 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 5 Nov 2018 17:41:22 -0800 Subject: Log when external signal is received PiperOrigin-RevId: 220204591 Change-Id: I21a9c6f5c12a376d18da5d10c1871837c4f49ad2 --- runsc/boot/controller.go | 12 ++++++++++++ runsc/boot/loader.go | 1 + 2 files changed, 13 insertions(+) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 6dd7fadd9..96a848197 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -445,6 +445,18 @@ const ( DeliverToForegroundProcessGroup ) +func (s SignalDeliveryMode) String() string { + switch s { + case DeliverToProcess: + return "Process" + case DeliverToAllProcesses: + return "All" + case DeliverToForegroundProcessGroup: + return "Foreground Process Group" + } + return fmt.Sprintf("unknown signal delivery mode: %d", s) +} + // SignalArgs are arguments to the Signal method. type SignalArgs struct { // CID is the container ID. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 380fa3fbf..10fec5b59 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -336,6 +336,7 @@ func New(args Args) (*Loader, error) { // properly. deliveryMode = DeliverToForegroundProcessGroup } + log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil { log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err) } -- cgit v1.2.3 From 86b3f0cd243918f92bd59cfc5de3204d960b5917 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 5 Nov 2018 21:28:45 -0800 Subject: Fix race between start and destroy Before this change, a container starting up could race with destroy (aka delete) and leave processes behind. Now, whenever a container is created, Loader.processes gets a new entry. Start now expects the entry to be there, and if it's not it means that the container was deleted. I've also fixed Loader.waitPID to search for the process using the init process's PID namespace. We could use a few more tests for signal and wait. I'll send them in another cl. PiperOrigin-RevId: 220224290 Change-Id: I15146079f69904dc07d43c3b66cc343a2dab4cc4 --- runsc/boot/controller.go | 20 ++- runsc/boot/loader.go | 306 +++++++++++++++++++------------- runsc/container/container.go | 5 +- runsc/container/container_test.go | 64 +++++++ runsc/container/multi_container_test.go | 118 ++++++++++++ runsc/sandbox/sandbox.go | 24 ++- 6 files changed, 402 insertions(+), 135 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 96a848197..f884f8c6b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -37,6 +37,9 @@ const ( // ContainerCheckpoint checkpoints a container. ContainerCheckpoint = "containerManager.Checkpoint" + // ContainerCreate creates a container. + ContainerCreate = "containerManager.Create" + // ContainerDestroy is used to stop a non-root container and free all // associated resources in the sandbox. 
ContainerDestroy = "containerManager.Destroy" @@ -175,17 +178,16 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { return nil } -// ProcessesArgs container arguments to Processes method. -type ProcessesArgs struct { - // CID restricts the result to processes belonging to - // the given container. Empty means all. - CID string +// Processes retrieves information about processes running in the sandbox. +func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error { + log.Debugf("containerManager.Processes: %q", *cid) + return control.Processes(cm.l.k, *cid, out) } -// Processes retrieves information about processes running in the sandbox. -func (cm *containerManager) Processes(args *ProcessesArgs, out *[]*control.Process) error { - log.Debugf("containerManager.Processes") - return control.Processes(cm.l.k, args.CID, out) +// Create creates a container within a sandbox. +func (cm *containerManager) Create(cid *string, _ *struct{}) error { + log.Debugf("containerManager.Create: %q", *cid) + return cm.l.createContainer(*cid) } // StartArgs contains arguments to the Start method. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 10fec5b59..946ddfd47 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -117,7 +117,7 @@ type Loader struct { processes map[execID]*execProcess } -// execID uniquely identifies a sentry process. +// execID uniquely identifies a sentry process that is executed in a container. type execID struct { cid string pid kernel.ThreadID @@ -125,6 +125,7 @@ type execID struct { // execProcess contains the thread group and host TTY of a sentry process. type execProcess struct { + // tg will be nil for containers that haven't started yet. tg *kernel.ThreadGroup // tty will be nil if the process is not attached to a terminal. @@ -299,6 +300,7 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("init compat logs: %v", err) } + eid := execID{cid: args.ID} l := &Loader{ k: k, ctrl: ctrl, @@ -310,7 +312,7 @@ func New(args Args) (*Loader, error) { stdioFDs: args.StdioFDs, rootProcArgs: procArgs, sandboxID: args.ID, - processes: make(map[execID]*execProcess), + processes: map[execID]*execProcess{eid: &execProcess{}}, } // We don't care about child signals; some platforms can generate a @@ -476,16 +478,20 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } + l.mu.Lock() + defer l.mu.Unlock() + eid := execID{cid: l.sandboxID} - ep := execProcess{tg: l.k.GlobalInit()} + ep := l.processes[eid] + if ep == nil { + return fmt.Errorf("trying to start deleted container %q", l.sandboxID) + } + ep.tg = l.k.GlobalInit() if l.console { ttyFile := l.rootProcArgs.FDMap.GetFile(0) defer ttyFile.DecRef() ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) } - l.mu.Lock() - l.processes[eid] = &ep - l.mu.Unlock() // Start signal forwarding only after an init process is created. l.stopSignalForwarding = l.startSignalForwarding() @@ -495,6 +501,19 @@ func (l *Loader) run() error { return l.k.Start() } +// createContainer creates a new container inside the sandbox. +func (l *Loader) createContainer(cid string) error { + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; ok { + return fmt.Errorf("container %q already exists", cid) + } + l.processes[eid] = &execProcess{} + return nil +} + // startContainer starts a child container. It returns the thread group ID of // the newly created process. 
func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { @@ -567,33 +586,39 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) } + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; !ok { + return fmt.Errorf("trying to start a deleted container %q", cid) + } + tg, _, err := l.k.CreateProcess(procArgs) if err != nil { return fmt.Errorf("failed to create process in sentry: %v", err) } - // CreateProcess takes a reference on FDMap if successful. procArgs.FDMap.DecRef() - l.mu.Lock() - defer l.mu.Unlock() - eid := execID{cid: cid} - l.processes[eid] = &execProcess{tg: tg} - + l.processes[eid].tg = tg return nil } // destroyContainer stops a container if it is still running and cleans up its // filesystem. func (l *Loader) destroyContainer(cid string) error { - // First kill and wait for all processes in the container. - if err := l.signal(cid, 0, int32(linux.SIGKILL), DeliverToAllProcesses); err != nil { - return fmt.Errorf("failed to SIGKILL all container processes: %v", err) - } - l.mu.Lock() defer l.mu.Unlock() + // Has the container started? + if _, _, err := l.threadGroupFromIDLocked(execID{cid: cid}); err == nil { + // If the container has started, kill and wait for all processes. + if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { + return fmt.Errorf("failed to SIGKILL all container processes: %v", err) + } + } + // Remove all container thread groups from the map. for key := range l.processes { if key.cid == cid { @@ -612,16 +637,19 @@ func (l *Loader) destroyContainer(cid string) error { } func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { - // Get the container Root Dirent from the Task, since we must run this - // process with the same Root. + // Hold the lock for the entire operation to ensure that exec'd process is + // added to 'processes' in case it races with destroyContainer(). l.mu.Lock() - rootKey := execID{cid: args.ContainerID} - ep, ok := l.processes[rootKey] - l.mu.Unlock() - if !ok { + defer l.mu.Unlock() + + tg, _, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID}) + if err != nil { return 0, fmt.Errorf("no such container: %q", args.ContainerID) } - ep.tg.Leader().WithMuLocked(func(t *kernel.Task) { + + // Get the container Root Dirent from the Task, since we must run this + // process with the same Root. + tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() }) if args.Root != nil { @@ -630,18 +658,14 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Start the process. proc := control.Proc{Kernel: l.k} - tg, tgid, ttyFile, err := control.ExecAsync(&proc, args) + newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args) if err != nil { return 0, err } - // Insert the process into processes so that we can wait on it - // later. - l.mu.Lock() - defer l.mu.Unlock() eid := execID{cid: args.ContainerID, pid: tgid} l.processes[eid] = &execProcess{ - tg: tg, + tg: newTG, tty: ttyFile, } log.Debugf("updated processes: %v", l.processes) @@ -653,33 +677,32 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. 
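	// (Reason: l.wait below blocks until the container's init process
	// exits, so holding the mutex across that call would prevent any other
	// client from even looking up its thread group until this wait
	// returned.)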
- l.mu.Lock() - eid := execID{cid: cid} - ep, ok := l.processes[eid] - l.mu.Unlock() - if !ok { - return fmt.Errorf("can't find process for container %q in %v", cid, l.processes) + tg, _, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("can't wait for container %q: %v", cid, err) } // If the thread either has already exited or exits during waiting, // consider the container exited. - ws := l.wait(ep.tg) + ws := l.wait(tg) *waitStatus = ws return nil } func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error { - // If the process was started via runsc exec, it will have an - // entry in l.processes. - l.mu.Lock() + if tgid <= 0 { + return fmt.Errorf("PID (%d) must be positive", tgid) + } + + // Try to find a process that was exec'd eid := execID{cid: cid, pid: tgid} - ep, ok := l.processes[eid] - l.mu.Unlock() - if ok { - ws := l.wait(ep.tg) + execTG, _, err := l.threadGroupFromID(eid) + if err == nil { + ws := l.wait(execTG) *waitStatus = ws + + // Remove tg from the cache if caller requested it. if clearStatus { - // Remove tg from the cache. l.mu.Lock() delete(l.processes, eid) log.Debugf("updated processes (removal): %v", l.processes) @@ -688,11 +711,18 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai return nil } - // This process wasn't created by runsc exec or start, so just find it - // by PID and hope it hasn't exited yet. - tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) + // The caller may be waiting on a process not started directly via exec. + // In this case, find the process in the container's PID namespace. + initTG, _, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("failed to wait for PID %d: %v", tgid, err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) if tg == nil { - return fmt.Errorf("no thread group with ID %d", tgid) + return fmt.Errorf("failed to wait for PID %d: no such process", tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) } ws := l.wait(tg) *waitStatus = ws @@ -757,90 +787,126 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e return fmt.Errorf("failed to signal container %q PID %d: PID must be positive", cid, pid) } - eid := execID{ - cid: cid, - pid: kernel.ThreadID(pid), - } - l.mu.Lock() - ep, ok := l.processes[eid] - l.mu.Unlock() - switch mode { case DeliverToProcess: - if ok { - // Send signal directly to the identified process. - return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) - } - - // The caller may be signaling a process not started directly via exec. - // In this case, find the process in the container's PID namespace and - // signal it. 
- ep, ok := l.processes[execID{cid: cid}] - if !ok { - return fmt.Errorf("no container with ID: %q", cid) - } - tg := ep.tg.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) - if tg == nil { - return fmt.Errorf("failed to signal container %q PID %d: no such process", cid, pid) - } - if tg.Leader().ContainerID() != cid { - return fmt.Errorf("process %d is part of a different container: %q", pid, tg.Leader().ContainerID()) - } - return tg.SendSignal(&arch.SignalInfo{Signo: signo}) + return l.signalProcess(cid, kernel.ThreadID(pid), signo) case DeliverToForegroundProcessGroup: - if !ok { - return fmt.Errorf("failed to signal foreground process group for container %q PID %d: no such PID", cid, pid) - } + return l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo) - // Lookup foreground process group from the TTY for the given process, - // and send the signal to it. - if ep.tty == nil { - return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, pid) + case DeliverToAllProcesses: + if pid != 0 { + return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) } - pg := ep.tty.ForegroundProcessGroup() - if pg == nil { - // No foreground process group has been set. Signal the - // original thread group. - log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) - return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) + // Check that the container has actually started before signaling it. + _, _, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("failed to signal container %q: %v", cid, err) } - // Send the signal to all processes in the process group. - var lastErr error - for _, tg := range l.k.TaskSet().Root.ThreadGroups() { - if tg.ProcessGroup() != pg { - continue - } - if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { - lastErr = err - } + return l.signalAllProcesses(cid, signo) + + default: + panic(fmt.Sprintf("unknown signal signal delivery mode %v", mode)) + } +} + +func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { + execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + if err == nil { + // Send signal directly to the identified process. + return execTG.SendSignal(&arch.SignalInfo{Signo: signo}) + } + + // The caller may be signaling a process not started directly via exec. + // In this case, find the process in the container's PID namespace and + // signal it. + initTG, _, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("failed to signal container %q: %v", cid, err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("failed to signal container %q PID %d: no such process", cid, tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) + } + return tg.SendSignal(&arch.SignalInfo{Signo: signo}) +} + +func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { + // Lookup foreground process group from the TTY for the given process, + // and send the signal to it. 
+ tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + if err != nil { + return fmt.Errorf("failed to signal foreground process group for container %q PID %d: %v", cid, tgid, err) + } + if tty == nil { + return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, tgid) + } + pg := tty.ForegroundProcessGroup() + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) + return tg.SendSignal(&arch.SignalInfo{Signo: signo}) + } + // Send the signal to all processes in the process group. + var lastErr error + for _, tg := range l.k.TaskSet().Root.ThreadGroups() { + if tg.ProcessGroup() != pg { + continue } - return lastErr - case DeliverToAllProcesses: - if !ok { - return fmt.Errorf("failed to signal all processes in container %q PID %d: no such PID", cid, pid) + if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { + lastErr = err } + } + return lastErr +} - // Pause the kernel to prevent new processes from being created while - // the signal is delivered. This prevents process leaks when SIGKILL is - // sent to the entire container. - l.k.Pause() - if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil { - l.k.Unpause() - return err - } +// signalAllProcesses that belong to specified container. It's a noop if the +// container hasn't started or has exited. +func (l *Loader) signalAllProcesses(cid string, signo int32) error { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil { l.k.Unpause() + return err + } + l.k.Unpause() - // If SIGKILLing all processes, wait for them to exit. - if linux.Signal(signo) == linux.SIGKILL { - for _, t := range l.k.TaskSet().Root.Tasks() { - if t.ContainerID() == cid { - t.ThreadGroup().WaitExited() - } + // If SIGKILLing all processes, wait for them to exit. + if linux.Signal(signo) == linux.SIGKILL { + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() } } - return nil - default: - panic(fmt.Sprintf("unknown signal signal delivery mode %v", mode)) } + return nil +} + +// threadGroupFromID same as threadGroupFromIDLocked except that it acquires +// mutex before calling it. +func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) { + l.mu.Lock() + defer l.mu.Unlock() + return l.threadGroupFromIDLocked(key) +} + +// threadGroupFromIDLocked returns the thread group and TTY for the given +// execution ID. TTY may be nil if the process is not attached to a terminal. +// Returns error if execution ID is invalid or if container/process has not +// started yet. Caller must hold 'mu'. 
+func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) { + ep := l.processes[key] + if ep == nil { + return nil, nil, fmt.Errorf("container not found") + } + if ep.tg == nil { + return nil, nil, fmt.Errorf("container not started") + } + return ep.tg, ep.tty, nil } diff --git a/runsc/container/container.go b/runsc/container/container.go index 4c542ccb9..11c440f09 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -321,6 +321,9 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox + if err := c.Sandbox.CreateContainer(c.ID); err != nil { + return nil, err + } } c.changeStatus(Created) @@ -383,7 +386,7 @@ func (c *Container) Start(conf *boot.Config) error { if err != nil { return err } - if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { + if err := c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles); err != nil { return err } if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 243528d35..64def7eed 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1625,6 +1625,70 @@ func TestWaitOnExitedSandbox(t *testing.T) { } } +func TestDestroyNotStarted(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "100") + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create the container and check that it can be destroyed. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + if err := c.Destroy(); err != nil { + t.Fatalf("deleting non-started container failed: %v", err) + } +} + +// TestDestroyStarting attempts to force a race between start and destroy. +func TestDestroyStarting(t *testing.T) { + for i := 0; i < 10; i++ { + spec := testutil.NewSpecWithArgs("/bin/sleep", "100") + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create the container and check that it can be destroyed. + id := testutil.UniqueContainerID() + c, err := Create(id, spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + + // Container is not thread safe, so load another instance to run in + // concurrently. + startCont, err := Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + wg := sync.WaitGroup{} + wg.Add(1) + go func() { + defer wg.Done() + // Ignore failures, start can fail if destroy runs first. + startCont.Start(conf) + }() + + wg.Add(1) + go func() { + defer wg.Done() + if err := c.Destroy(); err != nil { + t.Errorf("deleting non-started container failed: %v", err) + } + }() + wg.Wait() + } +} + // executeSync synchronously executes a new process. 
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 4548eb106..8af3d535d 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -612,3 +612,121 @@ func TestMultiContainerKillAll(t *testing.T) { } } } + +func TestMultiContainerDestroyNotStarted(t *testing.T) { + specs, ids := createSpecs( + []string{"/bin/sleep", "100"}, + []string{"/bin/sleep", "100"}) + conf := testutil.TestConfig() + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Create and start root container. + rootBundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[0], conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootBundleDir) + + root, err := Create(ids[0], specs[0], conf, rootBundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating root container: %v", err) + } + defer root.Destroy() + if err := root.Start(conf); err != nil { + t.Fatalf("error starting root container: %v", err) + } + + // Create and destroy sub-container. + bundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[1], conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + + cont, err := Create(ids[1], specs[1], conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + + // Check that container can be destroyed. + if err := cont.Destroy(); err != nil { + t.Fatalf("deleting non-started container failed: %v", err) + } +} + +// TestMultiContainerDestroyStarting attempts to force a race between start +// and destroy. +func TestMultiContainerDestroyStarting(t *testing.T) { + cmds := make([][]string, 10) + for i := range cmds { + cmds[i] = []string{"/bin/sleep", "100"} + } + specs, ids := createSpecs(cmds...) + conf := testutil.TestConfig() + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Create and start root container. + rootBundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[0], conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootBundleDir) + + root, err := Create(ids[0], specs[0], conf, rootBundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating root container: %v", err) + } + defer root.Destroy() + if err := root.Start(conf); err != nil { + t.Fatalf("error starting root container: %v", err) + } + + wg := sync.WaitGroup{} + for i := range cmds { + if i == 0 { + continue // skip root container + } + + bundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[i], conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + + cont, err := Create(ids[i], specs[i], conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + + // Container is not thread safe, so load another instance to run in + // concurrently. + startCont, err := Load(rootDir, ids[i]) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + wg.Add(1) + go func() { + defer wg.Done() + startCont.Start(conf) // ignore failures, start can fail if destroy runs first. 
+ }() + + wg.Add(1) + go func() { + defer wg.Done() + if err := cont.Destroy(); err != nil { + t.Errorf("deleting non-started container failed: %v", err) + } + }() + } + wg.Wait() +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 9421bd63e..084d79d06 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -102,6 +102,21 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return s, nil } +// CreateContainer creates a non-root container inside the sandbox. +func (s *Sandbox) CreateContainer(cid string) error { + log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() + + if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil { + return fmt.Errorf("creating non-root container %q: %v", cid, err) + } + return nil +} + // StartRoot starts running the root container process inside the sandbox. func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid) @@ -125,13 +140,13 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } -// Start starts running a non-root container inside the sandbox. -func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error { +// StartContainer starts running a non-root container inside the sandbox. +func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error { for _, f := range goferFiles { defer f.Close() } - log.Debugf("Start non-root container sandbox %q, PID: %d", s.ID, s.Pid) + log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { return fmt.Errorf("couldn't connect to sandbox: %v", err) @@ -208,9 +223,8 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { } defer conn.Close() - args := boot.ProcessesArgs{CID: cid} var pl []*control.Process - if err := conn.Call(boot.ContainerProcesses, &args, &pl); err != nil { + if err := conn.Call(boot.ContainerProcesses, &cid, &pl); err != nil { return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) } return pl, nil -- cgit v1.2.3 From a81111d5448346098af375de82aec44459239689 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 6 Nov 2018 16:17:16 -0800 Subject: Fix problem where crictl tests would signal both error and done channels PiperOrigin-RevId: 220372291 Change-Id: I054ba56a23c402c7244b476d7d6fe72084942a0e --- runsc/test/testutil/crictl.go | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/test/testutil/crictl.go b/runsc/test/testutil/crictl.go index 9740ea6b5..4f82e8234 100644 --- a/runsc/test/testutil/crictl.go +++ b/runsc/test/testutil/crictl.go @@ -212,6 +212,7 @@ func (cc *Crictl) run(args ...string) (string, error) { output, err := cmd.CombinedOutput() if err != nil { errCh <- fmt.Errorf("error: \"%v\", output: %s", err, string(output)) + return } done <- string(output) }() -- cgit v1.2.3 From c92b9b7086b89fd8e7f5913bf74d04761163e24b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 7 Nov 2018 13:32:26 -0800 Subject: Add more logging to controller.go PiperOrigin-RevId: 220519632 Change-Id: Iaeec007fc1aa3f0b72569b288826d45f2534c4bf --- runsc/boot/controller.go | 10 ++++++++-- 1 file changed, 8 
insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index f884f8c6b..568aad117 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -169,7 +169,7 @@ type containerManager struct { // StartRoot will start the root container process. func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { - log.Debugf("containerManager.StartRoot") + log.Debugf("containerManager.StartRoot %q", *cid) // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} if err := <-cm.startResultChan; err != nil { @@ -239,6 +239,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { + log.Debugf("containerManager.Start failed %q: %+v", args.CID, args) return err } log.Debugf("Container %q started", args.CID) @@ -259,6 +260,7 @@ func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) err log.Debugf("containerManager.ExecuteAsync: %+v", args) tgid, err := cm.l.executeAsync(args) if err != nil { + log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err) return err } *pid = int32(tgid) @@ -277,6 +279,7 @@ func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { // Pause suspends a container. func (cm *containerManager) Pause(_, _ *struct{}) error { + log.Debugf("containerManager.Pause") cm.l.k.Pause() return nil } @@ -398,6 +401,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Resume unpauses a container. func (cm *containerManager) Resume(_, _ *struct{}) error { + log.Debugf("containerManager.Resume") cm.l.k.Unpause() return nil } @@ -405,7 +409,9 @@ func (cm *containerManager) Resume(_, _ *struct{}) error { // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { log.Debugf("containerManager.Wait") - return cm.l.waitContainer(*cid, waitStatus) + err := cm.l.waitContainer(*cid, waitStatus) + log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err) + return err } // WaitPIDArgs are arguments to the WaitPID method. -- cgit v1.2.3 From 13b48f2e6a186321084fa8159e8cc2659ed221a2 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 7 Nov 2018 13:54:47 -0800 Subject: AsyncBarrier should be run after all defers in destroyContainerFS. destroyContainerFS must wait for all async operations to finish before returning. In an attempt to do this, we call fs.AsyncBarrier() at the end of the function. However, there are many defer'd DecRefs which end up running AFTER the AsyncBarrier() call. This CL fixes this by calling fs.AsyncBarrier() in the first defer statement, thus ensuring that it runs at the end of the function, after all other defers. PiperOrigin-RevId: 220523545 Change-Id: I5e96ee9ea6d86eeab788ff964484c50ef7f64a2f --- runsc/boot/fs.go | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e52c89fe4..3f3f9bef6 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -638,6 +638,19 @@ func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *ke // destroyContainerFS cleans up the filesystem by unmounting all mounts for the // given container and deleting the container root directory. 
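The fix above leans on Go's guarantee that deferred calls run in last-in, first-out order: a defer registered at the very top of destroyContainerFS runs only after every defer registered later in the body, which is why moving fs.AsyncBarrier() into the first deferred function makes it the final step. A minimal, standalone sketch of that ordering (illustrative names only, not gVisor code):

package main

import "fmt"

func cleanup() {
	// Registered first, so it runs last -- the slot the AsyncBarrier
	// call was moved into.
	defer fmt.Println("barrier: runs last")

	// Stand-ins for the deferred DecRef() calls further down the body.
	defer fmt.Println("decref A: runs second")
	defer fmt.Println("decref B: runs first")

	fmt.Println("body: runs before any deferred call")
}

func main() {
	cleanup()
}

Running it prints the body line, then "decref B", "decref A", and finally "barrier", which mirrors why the barrier now waits for all the other deferred cleanups.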
func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error { + defer func() { + // Flushing dirent references triggers many async close + // operations. We must wait for those to complete before + // returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + // + // This must take place in the first deferred function, so that + // it runs after all the other deferred DecRef() calls in this + // function. + log.Infof("Waiting for async filesystem operations to complete") + fs.AsyncBarrier() + }() + // First get a reference to the container root directory. mns := k.RootMountNamespace() mnsRoot := mns.Root() @@ -687,12 +700,5 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error return fmt.Errorf("error removing directory %q: %v", containerRoot, err) } - // Flushing dirent references triggers many async close operations. We - // must wait for those to complete before returning, otherwise the - // caller may kill the gofer before they complete, causing a cascade of - // failing RPCs. - log.Infof("Waiting for async filesystem operations to complete") - fs.AsyncBarrier() - return nil } -- cgit v1.2.3 From d12a0dd6b8afaca9fbb5fe60fb84a3ae0502261a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 7 Nov 2018 21:30:11 -0800 Subject: Fix test --race violation SetupContainerInRoot was setting Config.RootDir unnecessarily and causing a --race violation in TestMultiContainerDestroyStarting. PiperOrigin-RevId: 220580073 Change-Id: Ie0b28c19846106c7458a92681b708ae70f87d25a --- runsc/container/container_test.go | 17 +++++++++-------- runsc/container/multi_container_test.go | 18 ++++++++++-------- runsc/test/testutil/testutil.go | 20 +++++++++++++------- 3 files changed, 32 insertions(+), 23 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 64def7eed..598b96a08 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1287,24 +1287,25 @@ func TestReadonlyMount(t *testing.T) { // TestAbbreviatedIDs checks that runsc supports using abbreviated container // IDs in place of full IDs. func TestAbbreviatedIDs(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfigWithRoot(rootDir) + cids := []string{ "foo-" + testutil.UniqueContainerID(), "bar-" + testutil.UniqueContainerID(), "baz-" + testutil.UniqueContainerID(), } - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } for _, cid := range cids { spec := testutil.NewSpecWithArgs("sleep", "100") - conf := testutil.TestConfig() - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + bundleDir, err := testutil.SetupBundleDir(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) // Create and start the container. 
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 8af3d535d..d1fe687a9 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -63,6 +63,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C if err != nil { return nil, nil, fmt.Errorf("error creating root dir: %v", err) } + conf.RootDir = rootDir var containers []*Container var bundles []string @@ -76,7 +77,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C os.RemoveAll(rootDir) } for i, spec := range specs { - bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + bundleDir, err := testutil.SetupBundleDir(spec) if err != nil { cleanup() return nil, nil, fmt.Errorf("error setting up container: %v", err) @@ -617,16 +618,16 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) { specs, ids := createSpecs( []string{"/bin/sleep", "100"}, []string{"/bin/sleep", "100"}) - conf := testutil.TestConfig() - rootDir, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } defer os.RemoveAll(rootDir) + conf := testutil.TestConfigWithRoot(rootDir) + // Create and start root container. - rootBundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[0], conf) + rootBundleDir, err := testutil.SetupBundleDir(specs[0]) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -642,7 +643,7 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) { } // Create and destroy sub-container. - bundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[1], conf) + bundleDir, err := testutil.SetupBundleDir(specs[1]) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -667,7 +668,6 @@ func TestMultiContainerDestroyStarting(t *testing.T) { cmds[i] = []string{"/bin/sleep", "100"} } specs, ids := createSpecs(cmds...) - conf := testutil.TestConfig() rootDir, err := testutil.SetupRootDir() if err != nil { @@ -675,8 +675,10 @@ func TestMultiContainerDestroyStarting(t *testing.T) { } defer os.RemoveAll(rootDir) + conf := testutil.TestConfigWithRoot(rootDir) + // Create and start root container. - rootBundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[0], conf) + rootBundleDir, err := testutil.SetupBundleDir(specs[0]) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -697,7 +699,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) { continue // skip root container } - bundleDir, err := testutil.SetupContainerInRoot(rootDir, specs[i], conf) + bundleDir, err := testutil.SetupBundleDir(specs[i]) if err != nil { t.Fatalf("error setting up container: %v", err) } diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 59dc55887..3490bd11e 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -104,7 +104,8 @@ func FindFile(path string) (string, error) { return matches[0], nil } -// TestConfig return the default configuration to use in tests. +// TestConfig returns the default configuration to use in tests. Note that +// 'RootDir' must be set by caller if required. func TestConfig() *boot.Config { return &boot.Config{ Debug: true, @@ -117,6 +118,13 @@ func TestConfig() *boot.Config { } } +// TestConfigWithRoot returns the default configuration to use in tests. 
+func TestConfigWithRoot(rootDir string) *boot.Config { + conf := TestConfig() + conf.RootDir = rootDir + return conf +} + // NewSpecWithArgs creates a simple spec with the given args suitable for use // in tests. func NewSpecWithArgs(args ...string) *specs.Spec { @@ -162,13 +170,13 @@ func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir str if err != nil { return "", "", err } - bundleDir, err = SetupContainerInRoot(rootDir, spec, conf) + conf.RootDir = rootDir + bundleDir, err = SetupBundleDir(spec) return rootDir, bundleDir, err } -// SetupContainerInRoot creates a bundle for the container, generates a test -// config, and writes the spec to config.json in the bundle dir. -func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) (bundleDir string, err error) { +// SetupBundleDir creates a bundle dir and writes the spec to config.json. +func SetupBundleDir(spec *specs.Spec) (bundleDir string, err error) { bundleDir, err = ioutil.TempDir(TmpDir(), "bundle") if err != nil { return "", fmt.Errorf("error creating bundle dir: %v", err) @@ -177,8 +185,6 @@ func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) ( if err = writeSpec(bundleDir, spec); err != nil { return "", fmt.Errorf("error writing spec: %v", err) } - - conf.RootDir = rootDir return bundleDir, nil } -- cgit v1.2.3 From 90e81b2e5c665b9fc149f97dcf15142c190260c6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 7 Nov 2018 23:28:37 -0800 Subject: Add test that volume can be mounted on top of a symlink PiperOrigin-RevId: 220588094 Change-Id: I18915e892ceac86eac1f89ebcadffb4fdf8d0cf6 --- runsc/test/root/crictl_test.go | 38 +++++++++++++++++++++++++++++++++++++ runsc/test/root/testdata/BUILD | 1 + runsc/test/root/testdata/busybox.go | 32 +++++++++++++++++++++++++++++++ runsc/test/testutil/crictl.go | 11 +++++++++++ 4 files changed, 82 insertions(+) create mode 100644 runsc/test/root/testdata/busybox.go (limited to 'runsc') diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go index 88e24782a..556d95fff 100644 --- a/runsc/test/root/crictl_test.go +++ b/runsc/test/root/crictl_test.go @@ -25,6 +25,7 @@ import ( "os/exec" "path" "path/filepath" + "strings" "testing" "time" @@ -79,6 +80,43 @@ func TestMountPaths(t *testing.T) { t.Fatal(err) } } +func TestMountOverSymlinks(t *testing.T) { + // Setup containerd and crictl. + crictl, cleanup, err := setup(t) + if err != nil { + t.Fatalf("failed to setup crictl: %v", err) + } + defer cleanup() + podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, testdata.MountOverSymlink) + if err != nil { + t.Fatal(err) + } + + out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf") + if err != nil { + t.Fatal(err) + } + if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) { + t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out)) + } + + etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf") + if err != nil { + t.Fatal(err) + } + tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf") + if err != nil { + t.Fatal(err) + } + if tmp != etc { + t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp)) + } + + // Stop everything. + if err := crictl.StopPodAndContainer(podID, contID); err != nil { + t.Fatal(err) + } +} // setup sets up before a test. Specifically it: // * Creates directories and a socket for containerd to utilize. 
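The assertion TestMountOverSymlink makes is easy to restate outside the test harness: /etc/resolv.conf must be a symlink pointing at /tmp/resolv.conf, and both paths must read back identical bytes. A hedged sketch of that check in plain Go (the paths come from the test, but the real checks run inside the container via crictl exec readlink/cat rather than on the host):

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"os"
)

func main() {
	target, err := os.Readlink("/etc/resolv.conf")
	if err != nil {
		fmt.Println("readlink failed:", err)
		return
	}
	fmt.Println("/etc/resolv.conf ->", target)

	etc, err := ioutil.ReadFile("/etc/resolv.conf")
	if err != nil {
		fmt.Println("reading /etc/resolv.conf failed:", err)
		return
	}
	tmp, err := ioutil.ReadFile("/tmp/resolv.conf")
	if err != nil {
		fmt.Println("reading /tmp/resolv.conf failed:", err)
		return
	}
	if !bytes.Equal(etc, tmp) {
		fmt.Println("contents differ")
		return
	}
	fmt.Println("symlink target and contents match")
}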
diff --git a/runsc/test/root/testdata/BUILD b/runsc/test/root/testdata/BUILD index a22635129..6c9fe0aea 100644 --- a/runsc/test/root/testdata/BUILD +++ b/runsc/test/root/testdata/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) # Apache 2.0 go_library( name = "testdata", srcs = [ + "busybox.go", "containerd_config.go", "httpd.go", "httpd_mount_paths.go", diff --git a/runsc/test/root/testdata/busybox.go b/runsc/test/root/testdata/busybox.go new file mode 100644 index 000000000..544571c63 --- /dev/null +++ b/runsc/test/root/testdata/busybox.go @@ -0,0 +1,32 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testdata + +// MountOverSymlink is a JSON config for a container that /etc/resolv.conf is a +// symlink to /tmp/resolv.conf. +var MountOverSymlink = ` +{ + "metadata": { + "name": "busybox" + }, + "image": { + "image": "k8s.gcr.io/busybox" + }, + "command": [ + "sleep", + "1000" + ] +} +` diff --git a/runsc/test/testutil/crictl.go b/runsc/test/testutil/crictl.go index 4f82e8234..84bb4475a 100644 --- a/runsc/test/testutil/crictl.go +++ b/runsc/test/testutil/crictl.go @@ -92,6 +92,17 @@ func (cc *Crictl) Stop(contID string) error { return err } +// Exec execs a program inside a container. It corresponds to `crictl exec`. +func (cc *Crictl) Exec(contID string, args ...string) (string, error) { + a := []string{"exec", contID} + a = append(a, args...) + output, err := cc.run(a...) + if err != nil { + return "", fmt.Errorf("exec failed: %v", err) + } + return output, nil +} + // Rm removes a container. It corresponds to `crictl rm`. func (cc *Crictl) Rm(contID string) error { _, err := cc.run("rm", contID) -- cgit v1.2.3 From 93e88760b0d0c9c6656f7773f68540b1853d169b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 9 Nov 2018 10:57:45 -0800 Subject: Add tests multicontainer start/stop Each container has its respective gofer. Test that gofer can be shutdown when a container stops and that it doesn't affect other containers. PiperOrigin-RevId: 220829898 Change-Id: I2a44a3cf2a88577e6ad1133afc622bbf4a5f6591 --- runsc/container/multi_container_test.go | 91 +++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'runsc') diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index d1fe687a9..e431f5aec 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -732,3 +732,94 @@ func TestMultiContainerDestroyStarting(t *testing.T) { } wg.Wait() } + +// TestMultiContainerGoferStop tests that IO operations continue to work after +// containers have been stopped and gofers killed. +func TestMultiContainerGoferStop(t *testing.T) { + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // Setup containers. Root container just reaps children, while the others + // perform some IOs. Children are executed in 3 batches of 10. 
Within the + // batch there is overlap between containers starting and being destroyed. In + // between batches all containers stop before starting another batch. + cmds := [][]string{{app, "reaper"}} + const batchSize = 10 + for i := 0; i < 3*batchSize; i++ { + cmds = append(cmds, []string{"sh", "-c", "find /bin -type f | head | xargs -I SRC cp SRC /tmp/output"}) + } + allSpecs, allIDs := createSpecs(cmds...) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // Split up the specs and IDs. + rootSpec := allSpecs[0] + rootID := allIDs[0] + childrenSpecs := allSpecs[1:] + childrenIDs := allIDs[1:] + + bundleDir, err := testutil.SetupBundleDir(rootSpec) + if err != nil { + t.Fatalf("error setting up bundle dir: %v", err) + } + defer os.RemoveAll(bundleDir) + + // Start root container. + conf := testutil.TestConfigWithRoot(rootDir) + root, err := Create(rootID, rootSpec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating root container: %v", err) + } + if err := root.Start(conf); err != nil { + t.Fatalf("error starting root container: %v", err) + } + defer root.Destroy() + + // Run batches. Each batch starts containers in parallel, then wait and + // destroy them before starting another batch. + for i := 0; i < len(childrenSpecs); i += batchSize { + t.Logf("Starting batch from %d to %d", i, i+batchSize) + specs := childrenSpecs[i : i+batchSize] + ids := childrenIDs[i : i+batchSize] + + var children []*Container + for j, spec := range specs { + bundleDir, err := testutil.SetupBundleDir(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + + child, err := Create(ids[j], spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + children = append(children, child) + + if err := child.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Give a small gap between containers. + time.Sleep(50 * time.Millisecond) + } + for _, child := range children { + ws, err := child.Wait() + if err != nil { + t.Fatalf("waiting for container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("container failed, waitStatus: %x (%d)", ws, ws.ExitStatus()) + } + if err := child.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + } + } +} -- cgit v1.2.3 From d97ccfa346d23d99dcbe634a10fa5d81b089100d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 9 Nov 2018 14:53:24 -0800 Subject: Close donated files if containerManager.Start() fails PiperOrigin-RevId: 220869535 Change-Id: I9917e5daf02499f7aab6e2aa4051c54ff4461b9a --- runsc/boot/controller.go | 6 ++++++ runsc/boot/loader.go | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 568aad117..7a1f42119 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -213,6 +213,12 @@ type StartArgs struct { func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { log.Debugf("containerManager.Start: %+v", args) + defer func() { + for _, f := range args.FilePayload.Files { + f.Close() + } + }() + // Validate arguments. 
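The deferred close added above, together with the loader.go hunk that follows (startContainer now dups the FDs it keeps and no longer closes the originals), pins down a simple ownership rule: the caller of Start always closes its copies of the donated files, and the callee keeps only private duplicates. A rough sketch of that contract with invented names (the real code passes the files through urpc as a FilePayload):

package main

import (
	"fmt"
	"os"
	"syscall"
)

// startWithDonatedFiles keeps private duplicates of the donated FDs and
// closes the caller-visible copies on every return path, success or error.
func startWithDonatedFiles(files []*os.File) (keptFDs []int, err error) {
	defer func() {
		for _, f := range files {
			f.Close()
		}
	}()

	for _, f := range files {
		fd, dupErr := syscall.Dup(int(f.Fd()))
		if dupErr != nil {
			return nil, fmt.Errorf("failed to dup file: %v", dupErr)
		}
		keptFDs = append(keptFDs, fd)
	}
	return keptFDs, nil
}

func main() {
	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}
	defer w.Close()

	kept, err := startWithDonatedFiles([]*os.File{r})
	if err != nil {
		panic(err)
	}
	fmt.Println("kept duplicated FDs:", kept)
	for _, fd := range kept {
		syscall.Close(fd)
	}
}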
if args == nil { return errors.New("start missing arguments") diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 946ddfd47..d953bb783 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -515,7 +515,8 @@ func (l *Loader) createContainer(cid string) error { } // startContainer starts a child container. It returns the thread group ID of -// the newly created process. +// the newly created process. Caller owns 'files' and may close them after +// this method returns. func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) @@ -553,7 +554,6 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config if err != nil { return fmt.Errorf("failed to dup file: %v", err) } - f.Close() ioFDs = append(ioFDs, fd) } -- cgit v1.2.3 From 4cd4b60352bc8a572a0a9482c58564397c49446c Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 12 Nov 2018 11:11:47 -0800 Subject: runsc: generate exec pidfile after everything is ready. PiperOrigin-RevId: 221123160 Change-Id: Ia7061d60d114d69f49aba853fe6bae3c733522b5 --- runsc/cmd/exec.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 9a395e6f1..35aa5499e 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -146,12 +146,6 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return ex.execAndWait(waitStatus) } - if ex.pidFile != "" { - if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { - Fatalf("error writing pid file: %v", err) - } - } - // Start the new process and get it pid. pid, err := c.Execute(e) if err != nil { @@ -173,6 +167,15 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } + // Generate the pid file after the internal pid file is generated, so that users + // can safely assume that the internal pid file is ready after `runsc exec -d` + // returns. + if ex.pidFile != "" { + if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { + Fatalf("error writing pid file: %v", err) + } + } + // Wait for the process to exit. ws, err := c.WaitPID(pid, ex.clearStatus) if err != nil { -- cgit v1.2.3 From c57b92a0c701122bd0cfcaa256f9fac942317548 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 12 Nov 2018 16:28:17 -0800 Subject: Internal change. 
PiperOrigin-RevId: 221178413 Change-Id: I0e615c5e945cb924d8df767c894a9e402f0b8ff2 --- runsc/container/BUILD | 1 + runsc/test/testutil/BUILD | 1 + 2 files changed, 2 insertions(+) (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index bdd93aaba..551e5bc99 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -13,6 +13,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/container", visibility = [ "//runsc:__subpackages__", + "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//pkg/log", diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 3ed235393..8d4839318 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -13,6 +13,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", visibility = [ "//runsc:__subpackages__", + "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//runsc/boot", -- cgit v1.2.3 From 6c2d320138300dbea519028d16e12d66baa23c9d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 13 Nov 2018 11:13:22 -0800 Subject: Internal change. PiperOrigin-RevId: 221299066 Change-Id: I8ae352458f9976c329c6946b1efa843a3de0eaa4 --- runsc/boot/loader.go | 2 +- runsc/test/testutil/testutil.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index d953bb783..7cac346c9 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -258,7 +258,7 @@ func New(args Args) (*Loader, error) { NetworkStack: networkStack, ApplicationCores: uint(args.NumCPU), Vdso: vdso, - RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, "", creds.UserNamespace), + RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), }); err != nil { diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 3490bd11e..d323d7899 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -151,6 +151,7 @@ func NewSpecWithArgs(args ...string) *specs.Spec { Source: TmpDir(), }, }, + Hostname: "runsc-test-" + strings.Join(args, "_"), } } -- cgit v1.2.3 From 7f558eda44bf93c31dfbbe621c2bb84d55b5701f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 13 Nov 2018 15:16:11 -0800 Subject: Internal change. PiperOrigin-RevId: 221343421 Change-Id: I418b5204c5ed4fe1e0af25ef36ee66b9b571928e --- runsc/specutils/specutils.go | 15 +++++++++++++++ runsc/test/testutil/testutil.go | 1 + 2 files changed, 16 insertions(+) (limited to 'runsc') diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index ab14ed1fc..0e0961801 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -216,6 +216,21 @@ func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, er return &caps, nil } +// AllCapabilities returns a LinuxCapabilities struct with all capabilities. 
+func AllCapabilities() *specs.LinuxCapabilities { + var names []string + for n := range capFromName { + names = append(names, n) + } + return &specs.LinuxCapabilities{ + Bounding: names, + Effective: names, + Inheritable: names, + Permitted: names, + Ambient: names, + } +} + var capFromName = map[string]linux.Capability{ "CAP_CHOWN": linux.CAP_CHOWN, "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index d323d7899..c816de3f0 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -139,6 +139,7 @@ func NewSpecWithArgs(args ...string) *specs.Spec { Env: []string{ "PATH=" + os.Getenv("PATH"), }, + Capabilities: specutils.AllCapabilities(), }, Mounts: []specs.Mount{ // Root is readonly, but many tests want to write to tmpdir. -- cgit v1.2.3 From 40f843fc7802271654314e1c339c372e72900845 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 13 Nov 2018 15:17:26 -0800 Subject: Internal change. PiperOrigin-RevId: 221343626 Change-Id: I03d57293a555cf4da9952a81803b9f8463173c89 --- runsc/boot/BUILD | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 07afce807..bdaa7a0c3 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -21,6 +21,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/boot", visibility = [ "//runsc:__subpackages__", + "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//pkg/abi", -- cgit v1.2.3 From 7b938ef78c2b180d1d0554534972069ec393322d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 14 Nov 2018 09:57:34 -0800 Subject: Internal change. PiperOrigin-RevId: 221462069 Change-Id: Id469ed21fe12e582c78340189b932989afa13c67 --- runsc/specutils/BUILD | 1 + runsc/specutils/specutils.go | 1 + 2 files changed, 2 insertions(+) (limited to 'runsc') diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index a1e5da3f5..034628c92 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -11,6 +11,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", visibility = [ "//runsc:__subpackages__", + "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//pkg/abi/linux", diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 0e0961801..055076475 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -52,6 +52,7 @@ func LogSpec(spec *specs.Spec) { } log.Debugf("Spec.Process: %+v", spec.Process) log.Debugf("Spec.Root: %+v", spec.Root) + log.Debugf("Spec.Mounts: %+v", spec.Mounts) } // ValidateSpec validates that the spec is compatible with runsc. -- cgit v1.2.3 From adf8138e069a99aa7f3fe190a9a7c17a4e88b99a Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 15 Nov 2018 15:34:38 -0800 Subject: Allow sandbox.Wait to be called after the sandbox has exited. sandbox.Wait is racey, as the sandbox may have exited before it is called, or even during. We already had code to handle the case that the sandbox exits during the Wait call, but we were not properly handling the case where the sandbox has exited before the call. The best we can do in such cases is return the sandbox exit code as the application exit code. 
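The recovery described above is a try-the-RPC, fall-back-to-the-process pattern: ask the running sandbox for the container's exit status, and if the connection or the call fails because the sandbox is already gone, settle for the sandbox process's own exit status. A rough, self-contained sketch of that shape (the two waiter function types are invented for illustration; the real code uses the urpc connection and the sandbox PID):

package main

import (
	"errors"
	"fmt"
	"syscall"
)

// rpcWait stands in for the ContainerWait RPC, which fails once the
// sandbox has exited.
type rpcWait func(cid string) (syscall.WaitStatus, error)

// processWait stands in for waiting on the sandbox process itself.
type processWait func() (syscall.WaitStatus, error)

// waitContainer prefers the precise per-container status and falls back
// to the coarse sandbox exit status when the sandbox is gone.
func waitContainer(cid string, viaRPC rpcWait, viaProcess processWait) (syscall.WaitStatus, error) {
	ws, err := viaRPC(cid)
	if err == nil {
		return ws, nil
	}
	// The sandbox may have exited before or during the call; the best
	// remaining signal is how the sandbox process itself exited.
	fmt.Printf("wait RPC for %q failed (%v); waiting on sandbox process instead\n", cid, err)
	return viaProcess()
}

func main() {
	deadRPC := func(cid string) (syscall.WaitStatus, error) {
		return 0, errors.New("connection refused")
	}
	sandboxExit := func() (syscall.WaitStatus, error) {
		return syscall.WaitStatus(17 << 8), nil // exit code 17, Linux wait-status encoding
	}
	ws, err := waitContainer("abc", deadRPC, sandboxExit)
	if err != nil {
		panic(err)
	}
	fmt.Println("exit status:", ws.ExitStatus())
}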
PiperOrigin-RevId: 221702517 Change-Id: I290d0333cc094c7c1c3b4ce0f17f61a3e908d787 --- runsc/container/container.go | 3 -- runsc/container/container_test.go | 52 +++++++++++++++++++-------------- runsc/sandbox/sandbox.go | 24 +++++++++------ runsc/test/testutil/testutil.go | 61 ++++++++++++++++++++++++++++++++------- 4 files changed, 97 insertions(+), 43 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 11c440f09..80a27df4a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -476,9 +476,6 @@ func (c *Container) SandboxPid() int { // and wait returns immediately. func (c *Container) Wait() (syscall.WaitStatus, error) { log.Debugf("Wait on container %q", c.ID) - if !c.isSandboxRunning() { - return 0, fmt.Errorf("sandbox is not running") - } return c.Sandbox.Wait(c.ID) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 598b96a08..45a36e583 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -39,12 +39,8 @@ import ( "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) -func init() { - log.SetLevel(log.Debug) - if err := testutil.ConfigureExePath(); err != nil { - panic(err.Error()) - } -} +// childReaper reaps child processes. +var childReaper *testutil.Reaper // waitForProcessList waits for the given process list to show up in the container. func waitForProcessList(cont *Container, want []*control.Process) error { @@ -1580,12 +1576,17 @@ func TestUserLog(t *testing.T) { } func TestWaitOnExitedSandbox(t *testing.T) { + // Disable the childReaper for this test. + childReaper.Stop() + defer childReaper.Start() + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) - // Run a shell that exits immediately with a non-zero code. + // Run a shell that sleeps for 1 second and then exits with a + // non-zero code. const wantExit = 17 - cmd := fmt.Sprintf("exit %d", wantExit) + cmd := fmt.Sprintf("sleep 1; exit %d", wantExit) spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd) rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -1604,22 +1605,23 @@ func TestWaitOnExitedSandbox(t *testing.T) { t.Fatalf("error starting container: %v", err) } - // Wait for the sandbox to stop running. - if err := testutil.Poll(func() error { - if c.Sandbox.IsRunning() { - return nil - } - return fmt.Errorf("sandbox still running") - }, 10*time.Second); err != nil { - t.Fatalf("error waiting for sandbox to exit: %v", err) - } - - // Now call Wait. + // Wait on the sandbox. This will make an RPC to the sandbox + // and get the actual exit status of the application. ws, err := c.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) } + if got := ws.ExitStatus(); got != wantExit { + t.Errorf("got exit status %d, want %d", got, wantExit) + } + // Now the sandbox has exited, but the zombie sandbox process + // still exists. Calling Wait() now will return the sandbox + // exit status. 
+ ws, err = c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } if got := ws.ExitStatus(); got != wantExit { t.Errorf("got exit status %d, want %d", got, wantExit) } @@ -1704,8 +1706,16 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, } func TestMain(m *testing.M) { + log.SetLevel(log.Debug) + if err := testutil.ConfigureExePath(); err != nil { + panic(err.Error()) + } testutil.RunAsRoot() - stop := testutil.StartReaper() - defer stop() + + // Start the child reaper. + childReaper = &testutil.Reaper{} + childReaper.Start() + defer childReaper.Stop() + os.Exit(m.Run()) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 084d79d06..3f00eba94 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -612,17 +612,23 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) var ws syscall.WaitStatus - conn, err := s.sandboxConnect() - if err != nil { - return ws, err - } - defer conn.Close() - // First try the Wait RPC to the sandbox. - if err := conn.Call(boot.ContainerWait, &cid, &ws); err == nil { - return ws, nil + if conn, err := s.sandboxConnect(); err != nil { + // The sandbox may have exited while before we had a chance to + // wait on it. + log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) + } else { + defer conn.Close() + // Try the Wait RPC to the sandbox. + err = conn.Call(boot.ContainerWait, &cid, &ws) + if err == nil { + // It worked! + return ws, nil + } + // The sandbox may have exited after we connected, but before + // or during the Wait RPC. + log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) } - log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) // The sandbox may have already exited, or exited while handling the // Wait RPC. The best we can do is ask Linux what the sandbox exit diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index c816de3f0..7a17d0552 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -29,6 +29,7 @@ import ( "path/filepath" "runtime" "strings" + "sync" "sync/atomic" "syscall" "time" @@ -296,18 +297,37 @@ func RunAsRoot() { os.Exit(0) } -// StartReaper starts a goroutine that will reap all children processes created -// by the tests. Caller must call the returned function to stop it. -func StartReaper() func() { - ch := make(chan os.Signal, 1) - signal.Notify(ch, syscall.SIGCHLD) - stop := make(chan struct{}) +// Reaper reaps child processes. +type Reaper struct { + // mu protects ch, which will be nil if the reaper is not running. + mu sync.Mutex + ch chan os.Signal +} + +// Start starts reaping child processes. +func (r *Reaper) Start() { + r.mu.Lock() + defer r.mu.Unlock() + + if r.ch != nil { + panic("reaper.Start called on a running reaper") + } + + r.ch = make(chan os.Signal, 1) + signal.Notify(r.ch, syscall.SIGCHLD) go func() { for { - select { - case <-ch: - case <-stop: + r.mu.Lock() + ch := r.ch + r.mu.Unlock() + if ch == nil { + return + } + + _, ok := <-ch + if !ok { + // Channel closed. return } for { @@ -318,7 +338,28 @@ func StartReaper() func() { } } }() - return func() { stop <- struct{}{} } +} + +// Stop stops reaping child processes. 
+func (r *Reaper) Stop() { + r.mu.Lock() + defer r.mu.Unlock() + + if r.ch == nil { + panic("reaper.Stop called on a stopped reaper") + } + + signal.Stop(r.ch) + close(r.ch) + r.ch = nil +} + +// StartReaper is a helper that starts a new Reaper and returns a function to +// stop it. +func StartReaper() func() { + r := &Reaper{} + r.Start() + return r.Stop } // RetryEintr retries the function until an error different than EINTR is -- cgit v1.2.3 From 845836c5783cb237c28b91f2f9a8f52a8219228e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 16 Nov 2018 14:01:46 -0800 Subject: Internal change. PiperOrigin-RevId: 221848471 Change-Id: I882fbe5ce7737048b2e1f668848e9c14ed355665 --- runsc/boot/BUILD | 1 - runsc/container/BUILD | 1 - runsc/specutils/BUILD | 1 - runsc/test/testutil/BUILD | 1 - 4 files changed, 4 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index bdaa7a0c3..07afce807 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -21,7 +21,6 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/boot", visibility = [ "//runsc:__subpackages__", - "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//pkg/abi", diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 551e5bc99..bdd93aaba 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -13,7 +13,6 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/container", visibility = [ "//runsc:__subpackages__", - "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//pkg/log", diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 034628c92..a1e5da3f5 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -11,7 +11,6 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", visibility = [ "//runsc:__subpackages__", - "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//pkg/abi/linux", diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 8d4839318..3ed235393 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -13,7 +13,6 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", visibility = [ "//runsc:__subpackages__", - "//third_party/golang/gvisor/test:__subpackages__", ], deps = [ "//runsc/boot", -- cgit v1.2.3 From 237f9c7a5e7078b46303f1262b77372a2f6a7f7b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 16 Nov 2018 18:07:52 -0800 Subject: Don't fail when destroyContainerFS is called more than once This can happen when destroy is called multiple times or when destroy failed previously and is being called again. PiperOrigin-RevId: 221882034 Change-Id: I8d069af19cf66c4e2419bdf0d4b789c5def8d19e --- runsc/boot/fs.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 3f3f9bef6..1e355fe4e 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -673,10 +673,11 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error defer root.DecRef() // Do a best-effort unmount by flushing the refs and unmount - // with "detach only = true". + // with "detach only = true". Unmount returns EINVAL when the mount point + // doesn't exist, i.e. it has already been unmounted. 
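The EINVAL tolerance added above makes the unmount half of destroyContainerFS idempotent: running the cleanup a second time, or after a partial failure, must not fail just because the mount point is already gone. The same idea for a plain host mount looks roughly like this (a generic syscall.Unmount wrapper, not the sentry's in-memory MountNamespace):

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"syscall"
)

// unmountIfMounted performs a detach-style unmount and treats EINVAL
// ("not a mount point", i.e. already unmounted) as success so that
// cleanup can safely run more than once.
func unmountIfMounted(path string) error {
	err := syscall.Unmount(path, syscall.MNT_DETACH)
	if err == nil || err == syscall.EINVAL {
		return nil
	}
	return fmt.Errorf("error unmounting %q: %v", path, err)
}

func main() {
	dir, err := ioutil.TempDir("", "unmount-demo")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	// dir is not a mount point; with CAP_SYS_ADMIN the kernel reports
	// EINVAL here, which the wrapper deliberately swallows. Without
	// privileges the call may fail with EPERM instead.
	if err := unmountIfMounted(dir); err != nil {
		fmt.Println("unmount error (expected when unprivileged):", err)
		return
	}
	fmt.Println("cleanup is idempotent")
}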
log.Debugf("Unmounting container submount %q", root.BaseName()) m.FlushDirentRefs() - if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { + if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL { return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) } } -- cgit v1.2.3 From fadffa2ff831034ff63146abf408ff71462b9f43 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 19 Nov 2018 15:25:00 -0800 Subject: Add unsupported syscall events for get/setsockopt PiperOrigin-RevId: 222148953 Change-Id: I21500a9f08939c45314a6414e0824490a973e5aa --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/ip.go | 107 ++++++ pkg/abi/linux/netlink.go | 14 + pkg/abi/linux/socket.go | 87 ++++- pkg/abi/linux/tcp.go | 54 +++ pkg/sentry/socket/epsocket/epsocket.go | 675 ++++++++++++++++++++++----------- pkg/sentry/socket/netlink/socket.go | 33 ++ pkg/sentry/socket/socket.go | 91 +++++ runsc/boot/compat.go | 10 +- runsc/boot/compat_amd64.go | 55 ++- runsc/boot/compat_test.go | 39 +- 11 files changed, 902 insertions(+), 264 deletions(-) create mode 100644 pkg/abi/linux/tcp.go (limited to 'runsc') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index f8f82c0da..1f6e43605 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -43,6 +43,7 @@ go_library( "shm.go", "signal.go", "socket.go", + "tcp.go", "time.go", "timer.go", "tty.go", diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go index fcec16965..77ac1062c 100644 --- a/pkg/abi/linux/ip.go +++ b/pkg/abi/linux/ip.go @@ -42,3 +42,110 @@ const ( IPPROTO_MPLS = 137 IPPROTO_RAW = 255 ) + +// Socket options from uapi/linux/in.h +const ( + IP_TOS = 1 + IP_TTL = 2 + IP_HDRINCL = 3 + IP_OPTIONS = 4 + IP_ROUTER_ALERT = 5 + IP_RECVOPTS = 6 + IP_RETOPTS = 7 + IP_PKTINFO = 8 + IP_PKTOPTIONS = 9 + IP_MTU_DISCOVER = 10 + IP_RECVERR = 11 + IP_RECVTTL = 12 + IP_RECVTOS = 13 + IP_MTU = 14 + IP_FREEBIND = 15 + IP_IPSEC_POLICY = 16 + IP_XFRM_POLICY = 17 + IP_PASSSEC = 18 + IP_TRANSPARENT = 19 + IP_ORIGDSTADDR = 20 + IP_RECVORIGDSTADDR = IP_ORIGDSTADDR + IP_MINTTL = 21 + IP_NODEFRAG = 22 + IP_CHECKSUM = 23 + IP_BIND_ADDRESS_NO_PORT = 24 + IP_RECVFRAGSIZE = 25 + IP_MULTICAST_IF = 32 + IP_MULTICAST_TTL = 33 + IP_MULTICAST_LOOP = 34 + IP_ADD_MEMBERSHIP = 35 + IP_DROP_MEMBERSHIP = 36 + IP_UNBLOCK_SOURCE = 37 + IP_BLOCK_SOURCE = 38 + IP_ADD_SOURCE_MEMBERSHIP = 39 + IP_DROP_SOURCE_MEMBERSHIP = 40 + IP_MSFILTER = 41 + MCAST_JOIN_GROUP = 42 + MCAST_BLOCK_SOURCE = 43 + MCAST_UNBLOCK_SOURCE = 44 + MCAST_LEAVE_GROUP = 45 + MCAST_JOIN_SOURCE_GROUP = 46 + MCAST_LEAVE_SOURCE_GROUP = 47 + MCAST_MSFILTER = 48 + IP_MULTICAST_ALL = 49 + IP_UNICAST_IF = 50 +) + +// Socket options from uapi/linux/in6.h +const ( + IPV6_ADDRFORM = 1 + IPV6_2292PKTINFO = 2 + IPV6_2292HOPOPTS = 3 + IPV6_2292DSTOPTS = 4 + IPV6_2292RTHDR = 5 + IPV6_2292PKTOPTIONS = 6 + IPV6_CHECKSUM = 7 + IPV6_2292HOPLIMIT = 8 + IPV6_NEXTHOP = 9 + IPV6_FLOWINFO = 11 + IPV6_UNICAST_HOPS = 16 + IPV6_MULTICAST_IF = 17 + IPV6_MULTICAST_HOPS = 18 + IPV6_MULTICAST_LOOP = 19 + IPV6_ADD_MEMBERSHIP = 20 + IPV6_DROP_MEMBERSHIP = 21 + IPV6_ROUTER_ALERT = 22 + IPV6_MTU_DISCOVER = 23 + IPV6_MTU = 24 + IPV6_RECVERR = 25 + IPV6_V6ONLY = 26 + IPV6_JOIN_ANYCAST = 27 + IPV6_LEAVE_ANYCAST = 28 + IPV6_MULTICAST_ALL = 29 + IPV6_FLOWLABEL_MGR = 32 + IPV6_FLOWINFO_SEND = 33 + IPV6_IPSEC_POLICY = 34 + IPV6_XFRM_POLICY = 35 + IPV6_HDRINCL = 36 + IPV6_RECVPKTINFO = 49 + IPV6_PKTINFO = 50 + IPV6_RECVHOPLIMIT = 51 + IPV6_HOPLIMIT = 52 + IPV6_RECVHOPOPTS = 53 + IPV6_HOPOPTS = 
54 + IPV6_RTHDRDSTOPTS = 55 + IPV6_RECVRTHDR = 56 + IPV6_RTHDR = 57 + IPV6_RECVDSTOPTS = 58 + IPV6_DSTOPTS = 59 + IPV6_RECVPATHMTU = 60 + IPV6_PATHMTU = 61 + IPV6_DONTFRAG = 62 + IPV6_RECVTCLASS = 66 + IPV6_TCLASS = 67 + IPV6_AUTOFLOWLABEL = 70 + IPV6_ADDR_PREFERENCES = 72 + IPV6_MINHOPCOUNT = 73 + IPV6_ORIGDSTADDR = 74 + IPV6_RECVORIGDSTADDR = IPV6_ORIGDSTADDR + IPV6_TRANSPARENT = 75 + IPV6_UNICAST_IF = 76 + IPV6_RECVFRAGSIZE = 77 + IPV6_FREEBIND = 78 +) diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 10ceb5bf2..25c5e17fd 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -108,3 +108,17 @@ const NetlinkAttrHeaderSize = 4 // NLA_ALIGNTO is the alignment of netlink attributes, from // uapi/linux/netlink.h. const NLA_ALIGNTO = 4 + +// Socket options, from uapi/linux/netlink.h. +const ( + NETLINK_ADD_MEMBERSHIP = 1 + NETLINK_DROP_MEMBERSHIP = 2 + NETLINK_PKTINFO = 3 + NETLINK_BROADCAST_ERROR = 4 + NETLINK_NO_ENOBUFS = 5 + NETLINK_LISTEN_ALL_NSID = 8 + NETLINK_LIST_MEMBERSHIPS = 9 + NETLINK_CAP_ACK = 10 + NETLINK_EXT_ACK = 11 + NETLINK_DUMP_STRICT_CHK = 12 +) diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index af0761a3b..929814752 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -89,8 +89,18 @@ const ( MSG_CMSG_CLOEXEC = 0x40000000 ) -// SOL_SOCKET is from socket.h -const SOL_SOCKET = 1 +// Set/get socket option levels, from socket.h. +const ( + SOL_IP = 0 + SOL_SOCKET = 1 + SOL_TCP = 6 + SOL_UDP = 17 + SOL_IPV6 = 41 + SOL_ICMPV6 = 58 + SOL_RAW = 255 + SOL_PACKET = 263 + SOL_NETLINK = 270 +) // Socket types, from linux/net.h. const ( @@ -122,22 +132,63 @@ const ( // Socket options from socket.h. const ( - SO_ERROR = 4 - SO_KEEPALIVE = 9 - SO_LINGER = 13 - SO_MARK = 36 - SO_PASSCRED = 16 - SO_PEERCRED = 17 - SO_PEERNAME = 28 - SO_PROTOCOL = 38 - SO_RCVBUF = 8 - SO_RCVTIMEO = 20 - SO_REUSEADDR = 2 - SO_SNDBUF = 7 - SO_SNDTIMEO = 21 - SO_TIMESTAMP = 29 - SO_TIMESTAMPNS = 35 - SO_TYPE = 3 + SO_DEBUG = 1 + SO_REUSEADDR = 2 + SO_TYPE = 3 + SO_ERROR = 4 + SO_DONTROUTE = 5 + SO_BROADCAST = 6 + SO_SNDBUF = 7 + SO_RCVBUF = 8 + SO_KEEPALIVE = 9 + SO_OOBINLINE = 10 + SO_NO_CHECK = 11 + SO_PRIORITY = 12 + SO_LINGER = 13 + SO_BSDCOMPAT = 14 + SO_REUSEPORT = 15 + SO_PASSCRED = 16 + SO_PEERCRED = 17 + SO_RCVLOWAT = 18 + SO_SNDLOWAT = 19 + SO_RCVTIMEO = 20 + SO_SNDTIMEO = 21 + SO_BINDTODEVICE = 25 + SO_ATTACH_FILTER = 26 + SO_DETACH_FILTER = 27 + SO_GET_FILTER = SO_ATTACH_FILTER + SO_PEERNAME = 28 + SO_TIMESTAMP = 29 + SO_ACCEPTCONN = 30 + SO_PEERSEC = 31 + SO_SNDBUFFORCE = 32 + SO_RCVBUFFORCE = 33 + SO_PASSSEC = 34 + SO_TIMESTAMPNS = 35 + SO_MARK = 36 + SO_TIMESTAMPING = 37 + SO_PROTOCOL = 38 + SO_DOMAIN = 39 + SO_RXQ_OVFL = 40 + SO_WIFI_STATUS = 41 + SO_PEEK_OFF = 42 + SO_NOFCS = 43 + SO_LOCK_FILTER = 44 + SO_SELECT_ERR_QUEUE = 45 + SO_BUSY_POLL = 46 + SO_MAX_PACING_RATE = 47 + SO_BPF_EXTENSIONS = 48 + SO_INCOMING_CPU = 49 + SO_ATTACH_BPF = 50 + SO_ATTACH_REUSEPORT_CBPF = 51 + SO_ATTACH_REUSEPORT_EBPF = 52 + SO_CNX_ADVICE = 53 + SO_MEMINFO = 55 + SO_INCOMING_NAPI_ID = 56 + SO_COOKIE = 57 + SO_PEERGROUPS = 59 + SO_ZEROCOPY = 60 + SO_TXTIME = 61 ) // SockAddrMax is the maximum size of a struct sockaddr, from diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go new file mode 100644 index 000000000..7586ada42 --- /dev/null +++ b/pkg/abi/linux/tcp.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Socket options from uapi/linux/tcp.h. +const ( + TCP_NODELAY = 1 + TCP_MAXSEG = 2 + TCP_CORK = 3 + TCP_KEEPIDLE = 4 + TCP_KEEPINTVL = 5 + TCP_KEEPCNT = 6 + TCP_SYNCNT = 7 + TCP_LINGER2 = 8 + TCP_DEFER_ACCEPT = 9 + TCP_WINDOW_CLAMP = 10 + TCP_INFO = 11 + TCP_QUICKACK = 12 + TCP_CONGESTION = 13 + TCP_MD5SIG = 14 + TCP_THIN_LINEAR_TIMEOUTS = 16 + TCP_THIN_DUPACK = 17 + TCP_USER_TIMEOUT = 18 + TCP_REPAIR = 19 + TCP_REPAIR_QUEUE = 20 + TCP_QUEUE_SEQ = 21 + TCP_REPAIR_OPTIONS = 22 + TCP_FASTOPEN = 23 + TCP_TIMESTAMP = 24 + TCP_NOTSENT_LOWAT = 25 + TCP_CC_INFO = 26 + TCP_SAVE_SYN = 27 + TCP_SAVED_SYN = 28 + TCP_REPAIR_WINDOW = 29 + TCP_FASTOPEN_CONNECT = 30 + TCP_ULP = 31 + TCP_MD5SIG_EXT = 32 + TCP_FASTOPEN_KEY = 33 + TCP_FASTOPEN_NO_COOKIE = 34 + TCP_ZEROCOPY_RECEIVE = 35 + TCP_INQ = 36 +) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index d14bbad01..c5ce289b5 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -515,189 +515,233 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: - switch name { - case linux.SO_TYPE: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } - return int32(skType), nil + return getSockOptSocket(t, s, ep, family, skType, name, outLen) - case linux.SO_ERROR: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + case linux.SOL_TCP: + return getSockOptTCP(t, ep, name, outLen) - // Get the last error and convert it. - err := ep.GetSockOpt(tcpip.ErrorOption{}) - if err == nil { - return int32(0), nil - } - return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil + case linux.SOL_IPV6: + return getSockOptIPv6(t, ep, name, outLen) - case linux.SO_PEERCRED: - if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { - return nil, syserr.ErrInvalidArgument - } + case linux.SOL_IP, + linux.SOL_UDP, + linux.SOL_ICMPV6, + linux.SOL_RAW, + linux.SOL_PACKET: - tcred := t.Credentials() - return syscall.Ucred{ - Pid: int32(t.ThreadGroup().ID()), - Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), - Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), - }, nil + t.Kernel().EmitUnimplementedEvent(t) + } - case linux.SO_PASSCRED: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + return nil, syserr.ErrProtocolNotAvailable +} - var v tcpip.PasscredOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } +// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. 
+func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.SO_TYPE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + return int32(skType), nil - return int32(v), nil + case linux.SO_ERROR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case linux.SO_SNDBUF: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + // Get the last error and convert it. + err := ep.GetSockOpt(tcpip.ErrorOption{}) + if err == nil { + return int32(0), nil + } + return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil - var size tcpip.SendBufferSizeOption - if err := ep.GetSockOpt(&size); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + case linux.SO_PEERCRED: + if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { + return nil, syserr.ErrInvalidArgument + } - if size > math.MaxInt32 { - size = math.MaxInt32 - } + tcred := t.Credentials() + return syscall.Ucred{ + Pid: int32(t.ThreadGroup().ID()), + Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), + Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), + }, nil - return int32(size), nil + case linux.SO_PASSCRED: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case linux.SO_RCVBUF: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + var v tcpip.PasscredOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - var size tcpip.ReceiveBufferSizeOption - if err := ep.GetSockOpt(&size); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(v), nil - if size > math.MaxInt32 { - size = math.MaxInt32 - } + case linux.SO_SNDBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - return int32(size), nil + var size tcpip.SendBufferSizeOption + if err := ep.GetSockOpt(&size); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - case linux.SO_REUSEADDR: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + if size > math.MaxInt32 { + size = math.MaxInt32 + } - var v tcpip.ReuseAddressOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(size), nil - return int32(v), nil + case linux.SO_RCVBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case linux.SO_KEEPALIVE: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } - return int32(0), nil + var size tcpip.ReceiveBufferSizeOption + if err := ep.GetSockOpt(&size); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - case linux.SO_LINGER: - if outLen < syscall.SizeofLinger { - return nil, syserr.ErrInvalidArgument - } - return syscall.Linger{}, nil + if size > math.MaxInt32 { + size = math.MaxInt32 + } - case linux.SO_RCVTIMEO: - if outLen < linux.SizeOfTimeval { - return nil, syserr.ErrInvalidArgument - } + return int32(size), nil - return linux.NsecToTimeval(s.RecvTimeout()), nil + case linux.SO_REUSEADDR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case linux.SO_TIMESTAMP: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + var v tcpip.ReuseAddressOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - var v tcpip.TimestampOption - if err := 
ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(v), nil - return int32(v), nil + case linux.SO_KEEPALIVE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument } + return int32(0), nil - case syscall.SOL_TCP: - switch name { - case syscall.TCP_NODELAY: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + case linux.SO_LINGER: + if outLen < syscall.SizeofLinger { + return nil, syserr.ErrInvalidArgument + } + return syscall.Linger{}, nil - var v tcpip.DelayOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + case linux.SO_RCVTIMEO: + if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } - if v == 0 { - return int32(1), nil - } - return int32(0), nil + return linux.NsecToTimeval(s.RecvTimeout()), nil - case syscall.TCP_CORK: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + case linux.SO_TIMESTAMP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - var v tcpip.CorkOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + var v tcpip.TimestampOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - return int32(v), nil + return int32(v), nil - case syscall.TCP_INFO: - var v tcpip.TCPInfoOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + default: + socket.GetSockOptEmitUnimplementedEvent(t, name) + } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptTCP implements GetSockOpt when level is SOL_TCP. +func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.TCP_NODELAY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - // TODO: Translate fields once they are added to - // tcpip.TCPInfoOption. - info := linux.TCPInfo{} + var v tcpip.DelayOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - // Linux truncates the output binary to outLen. - ib := binary.Marshal(nil, usermem.ByteOrder, &info) - if len(ib) > outLen { - ib = ib[:outLen] - } + if v == 0 { + return int32(1), nil + } + return int32(0), nil - return ib, nil + case linux.TCP_CORK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument } - case syscall.SOL_IPV6: - switch name { - case syscall.IPV6_V6ONLY: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + var v tcpip.CorkOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - var v tcpip.V6OnlyOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(v), nil - return int32(v), nil + case linux.TCP_INFO: + var v tcpip.TCPInfoOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // TODO: Translate fields once they are added to + // tcpip.TCPInfoOption. + info := linux.TCPInfo{} + + // Linux truncates the output binary to outLen. 
+ ib := binary.Marshal(nil, usermem.ByteOrder, &info) + if len(ib) > outLen { + ib = ib[:outLen] } + + return ib, nil + + case linux.TCP_CC_INFO, + linux.TCP_NOTSENT_LOWAT, + linux.TCP_ZEROCOPY_RECEIVE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventTCP(t, name) } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. +func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.IPV6_V6ONLY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + var v tcpip.V6OnlyOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.IPV6_PATHMTU: + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventIPv6(t, name) + } return nil, syserr.ErrProtocolNotAvailable } @@ -712,109 +756,304 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: - switch name { - case linux.SO_SNDBUF: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + return setSockOptSocket(t, s, ep, name, optVal) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.SendBufferSizeOption(v))) + case linux.SOL_TCP: + return setSockOptTCP(t, ep, name, optVal) - case linux.SO_RCVBUF: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + case linux.SOL_IPV6: + return setSockOptIPv6(t, ep, name, optVal) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(v))) + case linux.SOL_IP: + return setSockOptIP(t, ep, name, optVal) - case linux.SO_REUSEADDR: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + case linux.SOL_UDP, + linux.SOL_ICMPV6, + linux.SOL_RAW, + linux.SOL_PACKET: - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v))) + t.Kernel().EmitUnimplementedEvent(t) + } - case linux.SO_PASSCRED: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v))) +// setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. 
+func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.SO_SNDBUF: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } - case linux.SO_RCVTIMEO: - if len(optVal) < linux.SizeOfTimeval { - return syserr.ErrInvalidArgument - } + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.SendBufferSizeOption(v))) - var v linux.Timeval - binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) - s.SetRecvTimeout(v.ToNsecCapped()) - return nil + case linux.SO_RCVBUF: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } - case linux.SO_TIMESTAMP: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(v))) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TimestampOption(v))) + case linux.SO_REUSEADDR: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument } - case syscall.SOL_TCP: - switch name { - case syscall.TCP_NODELAY: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v))) - v := usermem.ByteOrder.Uint32(optVal) - var o tcpip.DelayOption - if v == 0 { - o = 1 - } - return syserr.TranslateNetstackError(ep.SetSockOpt(o)) - case syscall.TCP_CORK: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + case linux.SO_PASSCRED: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v))) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v))) + case linux.SO_RCVTIMEO: + if len(optVal) < linux.SizeOfTimeval { + return syserr.ErrInvalidArgument } - case syscall.SOL_IPV6: - switch name { - case syscall.IPV6_V6ONLY: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v))) - } - case syscall.SOL_IP: - const ( - _IP_MULTICAST_IF = 32 - _IP_ADD_MEMBERSHIP = 35 - _MCAST_JOIN_GROUP = 42 - ) - switch name { - case _IP_ADD_MEMBERSHIP, _MCAST_JOIN_GROUP, _IP_MULTICAST_IF: - // FIXME: Disallow IP-level multicast group options by - // default. These will need to be supported by appropriately plumbing - // the level through to the network stack (if at all). However, we - // still allow setting TTL, and multicast-enable/disable type options. + var v linux.Timeval + binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + s.SetRecvTimeout(v.ToNsecCapped()) + return nil + + case linux.SO_TIMESTAMP: + if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TimestampOption(v))) + + default: + socket.SetSockOptEmitUnimplementedEvent(t, name) } // Default to the old behavior; hand off to network stack. return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) } +// setSockOptTCP implements SetSockOpt when level is SOL_TCP. 
+func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.TCP_NODELAY: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + var o tcpip.DelayOption + if v == 0 { + o = 1 + } + return syserr.TranslateNetstackError(ep.SetSockOpt(o)) + + case linux.TCP_CORK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v))) + + case linux.TCP_REPAIR_OPTIONS: + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventTCP(t, name) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. +func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.IPV6_V6ONLY: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v))) + + case linux.IPV6_ADD_MEMBERSHIP, + linux.IPV6_DROP_MEMBERSHIP, + linux.IPV6_IPSEC_POLICY, + linux.IPV6_JOIN_ANYCAST, + linux.IPV6_LEAVE_ANYCAST, + linux.IPV6_PKTINFO, + linux.IPV6_ROUTER_ALERT, + linux.IPV6_XFRM_POLICY, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_JOIN_GROUP, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_UNBLOCK_SOURCE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventIPv6(t, name) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// setSockOptIP implements SetSockOpt when level is SOL_IP. +func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.IP_ADD_MEMBERSHIP, linux.MCAST_JOIN_GROUP, linux.IP_MULTICAST_IF: + // FIXME: Disallow IP-level multicast group options by + // default. These will need to be supported by appropriately plumbing + // the level through to the network stack (if at all). However, we + // still allow setting TTL, and multicast-enable/disable type options. + t.Kernel().EmitUnimplementedEvent(t) + return syserr.ErrInvalidArgument + + case linux.IP_ADD_SOURCE_MEMBERSHIP, + linux.IP_BIND_ADDRESS_NO_PORT, + linux.IP_BLOCK_SOURCE, + linux.IP_CHECKSUM, + linux.IP_DROP_MEMBERSHIP, + linux.IP_DROP_SOURCE_MEMBERSHIP, + linux.IP_FREEBIND, + linux.IP_HDRINCL, + linux.IP_IPSEC_POLICY, + linux.IP_MINTTL, + linux.IP_MSFILTER, + linux.IP_MTU_DISCOVER, + linux.IP_MULTICAST_ALL, + linux.IP_MULTICAST_LOOP, + linux.IP_MULTICAST_TTL, + linux.IP_NODEFRAG, + linux.IP_OPTIONS, + linux.IP_PASSSEC, + linux.IP_PKTINFO, + linux.IP_RECVERR, + linux.IP_RECVFRAGSIZE, + linux.IP_RECVOPTS, + linux.IP_RECVORIGDSTADDR, + linux.IP_RECVTOS, + linux.IP_RECVTTL, + linux.IP_RETOPTS, + linux.IP_TOS, + linux.IP_TRANSPARENT, + linux.IP_TTL, + linux.IP_UNBLOCK_SOURCE, + linux.IP_UNICAST_IF, + linux.IP_XFRM_POLICY, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_MSFILTER, + linux.MCAST_UNBLOCK_SOURCE: + + t.Kernel().EmitUnimplementedEvent(t) + } + + // Default to the old behavior; hand off to network stack. 
+ return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// emitUmplementedEventTCP emits unimplemented event if name is valid. This +// function contains names that are common between Get and SetSockOpt when +// level is SOL_TCP. +func emitUmplementedEventTCP(t *kernel.Task, name int) { + switch name { + case linux.TCP_CONGESTION, + linux.TCP_CORK, + linux.TCP_DEFER_ACCEPT, + linux.TCP_FASTOPEN, + linux.TCP_FASTOPEN_CONNECT, + linux.TCP_FASTOPEN_KEY, + linux.TCP_FASTOPEN_NO_COOKIE, + linux.TCP_INQ, + linux.TCP_KEEPCNT, + linux.TCP_KEEPIDLE, + linux.TCP_KEEPINTVL, + linux.TCP_LINGER2, + linux.TCP_MAXSEG, + linux.TCP_QUEUE_SEQ, + linux.TCP_QUICKACK, + linux.TCP_REPAIR, + linux.TCP_REPAIR_QUEUE, + linux.TCP_REPAIR_WINDOW, + linux.TCP_SAVED_SYN, + linux.TCP_SAVE_SYN, + linux.TCP_SYNCNT, + linux.TCP_THIN_DUPACK, + linux.TCP_THIN_LINEAR_TIMEOUTS, + linux.TCP_TIMESTAMP, + linux.TCP_ULP, + linux.TCP_USER_TIMEOUT, + linux.TCP_WINDOW_CLAMP: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + +// emitUmplementedEventIPv6 emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSockOpt when level is +// SOL_IPV6. +func emitUmplementedEventIPv6(t *kernel.Task, name int) { + switch name { + case linux.IPV6_2292DSTOPTS, + linux.IPV6_2292HOPLIMIT, + linux.IPV6_2292HOPOPTS, + linux.IPV6_2292PKTINFO, + linux.IPV6_2292PKTOPTIONS, + linux.IPV6_2292RTHDR, + linux.IPV6_ADDR_PREFERENCES, + linux.IPV6_AUTOFLOWLABEL, + linux.IPV6_DONTFRAG, + linux.IPV6_DSTOPTS, + linux.IPV6_FLOWINFO, + linux.IPV6_FLOWINFO_SEND, + linux.IPV6_FLOWLABEL_MGR, + linux.IPV6_FREEBIND, + linux.IPV6_HOPOPTS, + linux.IPV6_MINHOPCOUNT, + linux.IPV6_MTU, + linux.IPV6_MTU_DISCOVER, + linux.IPV6_MULTICAST_ALL, + linux.IPV6_MULTICAST_HOPS, + linux.IPV6_MULTICAST_IF, + linux.IPV6_MULTICAST_LOOP, + linux.IPV6_RECVDSTOPTS, + linux.IPV6_RECVERR, + linux.IPV6_RECVFRAGSIZE, + linux.IPV6_RECVHOPLIMIT, + linux.IPV6_RECVHOPOPTS, + linux.IPV6_RECVORIGDSTADDR, + linux.IPV6_RECVPATHMTU, + linux.IPV6_RECVPKTINFO, + linux.IPV6_RECVRTHDR, + linux.IPV6_RECVTCLASS, + linux.IPV6_RTHDR, + linux.IPV6_RTHDRDSTOPTS, + linux.IPV6_TCLASS, + linux.IPV6_TRANSPARENT, + linux.IPV6_UNICAST_HOPS, + linux.IPV6_UNICAST_IF, + linux.MCAST_MSFILTER, + linux.IPV6_ADDRFORM: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + // isLinkLocal determines if the given IPv6 address is link-local. This is the // case when it has the fe80::/10 prefix. This check is used to determine when // the NICID is relevant for a given IPv6 address. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index f901cfa0b..b1f6620de 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -299,6 +299,21 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (in } // We don't have limit on receiving size. return math.MaxInt32, nil + + default: + socket.GetSockOptEmitUnimplementedEvent(t, name) + } + case linux.SOL_NETLINK: + switch name { + case linux.NETLINK_BROADCAST_ERROR, + linux.NETLINK_CAP_ACK, + linux.NETLINK_DUMP_STRICT_CHK, + linux.NETLINK_EXT_ACK, + linux.NETLINK_LIST_MEMBERSHIPS, + linux.NETLINK_NO_ENOBUFS, + linux.NETLINK_PKTINFO: + + t.Kernel().EmitUnimplementedEvent(t) } } // TODO: other sockopts are not supported. @@ -329,7 +344,25 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy // We don't have limit on receiving size. So just accept anything as // valid for compatibility. 
return nil + default: + socket.SetSockOptEmitUnimplementedEvent(t, name) } + + case linux.SOL_NETLINK: + switch name { + case linux.NETLINK_ADD_MEMBERSHIP, + linux.NETLINK_BROADCAST_ERROR, + linux.NETLINK_CAP_ACK, + linux.NETLINK_DROP_MEMBERSHIP, + linux.NETLINK_DUMP_STRICT_CHK, + linux.NETLINK_EXT_ACK, + linux.NETLINK_LISTEN_ALL_NSID, + linux.NETLINK_NO_ENOBUFS, + linux.NETLINK_PKTINFO: + + t.Kernel().EmitUnimplementedEvent(t) + } + } // TODO: other sockopts are not supported. return syserr.ErrProtocolNotAvailable diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index a235c5249..b1dcbf7b0 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -213,3 +213,94 @@ func (rt *ReceiveTimeout) SetRecvTimeout(nanoseconds int64) { func (rt *ReceiveTimeout) RecvTimeout() int64 { return atomic.LoadInt64(&rt.ns) } + +// GetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. +// It contains names that are valid for GetSockOpt when level is SOL_SOCKET. +func GetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_ACCEPTCONN, + linux.SO_BPF_EXTENSIONS, + linux.SO_COOKIE, + linux.SO_DOMAIN, + linux.SO_ERROR, + linux.SO_GET_FILTER, + linux.SO_INCOMING_NAPI_ID, + linux.SO_MEMINFO, + linux.SO_PEERCRED, + linux.SO_PEERGROUPS, + linux.SO_PEERNAME, + linux.SO_PEERSEC, + linux.SO_PROTOCOL, + linux.SO_SNDLOWAT, + linux.SO_TYPE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUnimplementedEvent(t, name) + } +} + +// SetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. +// It contains names that are valid for SetSockOpt when level is SOL_SOCKET. +func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_ATTACH_BPF, + linux.SO_ATTACH_FILTER, + linux.SO_ATTACH_REUSEPORT_CBPF, + linux.SO_ATTACH_REUSEPORT_EBPF, + linux.SO_CNX_ADVICE, + linux.SO_DETACH_FILTER, + linux.SO_RCVBUFFORCE, + linux.SO_SNDBUFFORCE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUnimplementedEvent(t, name) + } +} + +// emitUnimplementedEvent emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSocketOpt when level is +// SOL_SOCKET. +func emitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_BINDTODEVICE, + linux.SO_BROADCAST, + linux.SO_BSDCOMPAT, + linux.SO_BUSY_POLL, + linux.SO_DEBUG, + linux.SO_DONTROUTE, + linux.SO_INCOMING_CPU, + linux.SO_KEEPALIVE, + linux.SO_LINGER, + linux.SO_LOCK_FILTER, + linux.SO_MARK, + linux.SO_MAX_PACING_RATE, + linux.SO_NOFCS, + linux.SO_NO_CHECK, + linux.SO_OOBINLINE, + linux.SO_PASSCRED, + linux.SO_PASSSEC, + linux.SO_PEEK_OFF, + linux.SO_PRIORITY, + linux.SO_RCVBUF, + linux.SO_RCVLOWAT, + linux.SO_RCVTIMEO, + linux.SO_REUSEADDR, + linux.SO_REUSEPORT, + linux.SO_RXQ_OVFL, + linux.SO_SELECT_ERR_QUEUE, + linux.SO_SNDBUF, + linux.SO_SNDTIMEO, + linux.SO_TIMESTAMP, + linux.SO_TIMESTAMPING, + linux.SO_TIMESTAMPNS, + linux.SO_TXTIME, + linux.SO_WIFI_STATUS, + linux.SO_ZEROCOPY: + + t.Kernel().EmitUnimplementedEvent(t) + } +} diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 4c49e90e3..c2a77ebf5 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -89,10 +89,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { if tr == nil { switch sysnr { case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL: - tr = newCmdTracker(0) + // args: cmd, ... 
+ tr = newArgsTracker(0) case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL: - tr = newCmdTracker(1) + // args: fd, cmd, ... + tr = newArgsTracker(1) + + case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT: + // args: fd, level, name, ... + tr = newArgsTracker(1, 2) default: tr = &onceTracker{} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 2bb769a49..0c9472f18 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -20,35 +20,58 @@ import ( rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" ) -// cmdTracker reports only a single time for each different command argument in -// the syscall. It's used for generic syscalls like ioctl to report once per -// 'cmd' -type cmdTracker struct { - // argIdx is the syscall argument index where the command is located. - argIdx int - cmds map[uint32]struct{} +// reportLimit is the max number of events that should be reported per tracker. +const reportLimit = 100 + +// argsTracker reports only once for each different combination of arguments. +// It's used for generic syscalls like ioctl to report once per 'cmd'. +type argsTracker struct { + // argsIdx is the syscall arguments to use as unique ID. + argsIdx []int + reported map[string]struct{} + count int } -func newCmdTracker(argIdx int) *cmdTracker { - return &cmdTracker{argIdx: argIdx, cmds: make(map[uint32]struct{})} +func newArgsTracker(argIdx ...int) *argsTracker { + return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} } // cmd returns the command based on the syscall argument index. -func (c *cmdTracker) cmd(regs *rpb.AMD64Registers) uint32 { - switch c.argIdx { +func (a *argsTracker) key(regs *rpb.AMD64Registers) string { + var rv string + for _, idx := range a.argsIdx { + rv += fmt.Sprintf("%d|", argVal(idx, regs)) + } + return rv +} + +func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 { + switch argIdx { case 0: return uint32(regs.Rdi) case 1: return uint32(regs.Rsi) + case 2: + return uint32(regs.Rdx) + case 3: + return uint32(regs.R10) + case 4: + return uint32(regs.R8) + case 5: + return uint32(regs.R9) } - panic(fmt.Sprintf("unsupported syscall argument index %d", c.argIdx)) + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func (c *cmdTracker) shouldReport(regs *rpb.AMD64Registers) bool { - _, ok := c.cmds[c.cmd(regs)] +func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool { + if a.count >= reportLimit { + return false + } + _, ok := a.reported[a.key(regs)] return !ok } -func (c *cmdTracker) onReported(regs *rpb.AMD64Registers) { - c.cmds[c.cmd(regs)] = struct{}{} +func (a *argsTracker) onReported(regs *rpb.AMD64Registers) { + a.count++ + a.reported[a.key(regs)] = struct{}{} } diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index 30b94798a..f1940dd72 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -33,34 +33,53 @@ func TestOnceTracker(t *testing.T) { } } -func TestCmdTracker(t *testing.T) { +func TestArgsTracker(t *testing.T) { for _, tc := range []struct { name string - idx int + idx []int rdi1 uint64 rdi2 uint64 rsi1 uint64 rsi2 uint64 want bool }{ - {name: "same rdi", idx: 0, rdi1: 123, rdi2: 123, want: false}, - {name: "same rsi", idx: 1, rsi1: 123, rsi2: 123, want: false}, - {name: "diff rdi", idx: 0, rdi1: 123, rdi2: 321, want: true}, - {name: "diff rsi", idx: 1, rsi1: 123, rsi2: 321, want: true}, - {name: "cmd is uint32", idx: 0, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, + {name: 
"same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false}, + {name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false}, + {name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true}, + {name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true}, + {name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, + {name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false}, + {name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true}, } { t.Run(tc.name, func(t *testing.T) { - c := newCmdTracker(tc.idx) + c := newArgsTracker(tc.idx...) regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1} if !c.shouldReport(regs) { - t.Error("first call to checkAndMark, got: false, want: true") + t.Error("first call to shouldReport, got: false, want: true") } c.onReported(regs) regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2 if got := c.shouldReport(regs); tc.want != got { - t.Errorf("after first call to checkAndMark, got: %t, want: %t", got, tc.want) + t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want) } }) } } + +func TestArgsTrackerLimit(t *testing.T) { + c := newArgsTracker(0, 1) + for i := 0; i < reportLimit; i++ { + regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)} + if !c.shouldReport(regs) { + t.Error("shouldReport before limit was reached, got: false, want: true") + } + c.onReported(regs) + } + + // Should hit the count limit now. + regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456} + if c.shouldReport(regs) { + t.Error("shouldReport after limit was reached, got: true, want: false") + } +} -- cgit v1.2.3 From 9363edcf067a69ba443425c0b5604897fcd5b87b Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 19 Nov 2018 17:55:42 -0800 Subject: Internal change. PiperOrigin-RevId: 222170431 Change-Id: I26a6d6ad5d6910a94bb8b0a05fc2d12e23098399 --- runsc/test/testutil/testutil.go | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'runsc') diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 7a17d0552..162ffe09f 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -87,22 +87,35 @@ func FindFile(path string) (string, error) { root = dir[:len(dir)-1] } - // bazel adds the build type to the directory structure. Since I don't want - // to guess what build type it's, just place '*' to match anything. - // - // The pattern goes like: /test-path/__main__/directories/*/file. - pattern := filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)) - matches, err := filepath.Glob(pattern) - if err != nil { - return "", fmt.Errorf("error globbing %q: %v", pattern, err) - } - if len(matches) == 0 { - return "", fmt.Errorf("file %q not found", path) + // Annoyingly, bazel adds the build type to the directory path for go + // binaries, but not for c++ binaries. We use two different patterns to + // to find our file. + patterns := []string{ + // Try the obvious path first. + filepath.Join(root, path), + // If it was a go binary, use a wildcard to match the build + // type. The pattern is: /test-path/__main__/directories/*/file. 
+ filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)), } - if len(matches) != 1 { - return "", fmt.Errorf("more than one match found for %q: %s", path, matches) + + for _, p := range patterns { + matches, err := filepath.Glob(p) + if err != nil { + // "The only possible returned error is ErrBadPattern, + // when pattern is malformed." -godoc + return "", fmt.Errorf("error globbing %q: %v", p, err) + } + switch len(matches) { + case 0: + // Try the next pattern. + case 1: + // We found it. + return matches[0], nil + default: + return "", fmt.Errorf("more than one match found for %q: %s", path, matches) + } } - return matches[0], nil + return "", fmt.Errorf("file %q not found", path) } // TestConfig returns the default configuration to use in tests. Note that -- cgit v1.2.3 From f894610c572976026f4cf6841f4095718827e4f8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 20 Nov 2018 15:09:04 -0800 Subject: Use math.Rand to generate a random test container id. We were relying on time.UnixNano, but that was causing collisions. Now we generate 20 bytes of entropy from rand.Read, and base32-encode it to get a valid container id. PiperOrigin-RevId: 222313867 Change-Id: Iaeea9b9582d36de55f9f02f55de6a5de3f739371 --- runsc/test/testutil/testutil.go | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 162ffe09f..b8f981053 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -18,10 +18,12 @@ package testutil import ( "bufio" "context" + "encoding/base32" "encoding/json" "fmt" "io" "io/ioutil" + "math/rand" "net/http" "os" "os/exec" @@ -41,6 +43,10 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) +func init() { + rand.Seed(time.Now().UnixNano()) +} + // RaceEnabled is set to true if it was built with '--race' option. var RaceEnabled = false @@ -220,7 +226,15 @@ func writeSpec(dir string, spec *specs.Spec) error { // name, sometimes between test runs the socket does not get cleaned up quickly // enough, causing container creation to fail. func UniqueContainerID() string { - return fmt.Sprintf("test-container-%d", time.Now().UnixNano()) + // Read 20 random bytes. + b := make([]byte, 20) + // "[Read] always returns len(p) and a nil error." --godoc + if _, err := rand.Read(b); err != nil { + panic("rand.Read failed: " + err.Error()) + } + // base32 encode the random bytes, so that the name is a valid + // container id and can be used as a socket name in the filesystem. + return fmt.Sprintf("test-container-%s", base32.StdEncoding.EncodeToString(b)) } // Copy copies file from src to dst. -- cgit v1.2.3 From eaac94d91c28b745c51c33dd352ed9bfdd671b8c Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 20 Nov 2018 22:55:41 -0800 Subject: Use RET_KILL_PROCESS if available in kernel RET_KILL_THREAD doesn't work well for Go because it will kill only the offending thread and leave the process hanging. RET_TRAP can be masked out and it's not guaranteed to kill the process. RET_KILL_PROCESS is available since 4.14. For older kernel, continue to use RET_TRAP as this is the best option (likely to kill process, easy to debug). 
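As a concrete illustration of the fallback described above, the sketch below probes the kernel with SECCOMP_GET_ACTION_AVAIL and picks SECCOMP_RET_KILL_PROCESS when it is supported, falling back to SECCOMP_RET_TRAP otherwise. The constant values mirror those added in pkg/abi/linux/seccomp.go in the diff that follows; the program, its names, and its error handling are illustrative only and are not the patch's code (SYS_SECCOMP is the amd64 syscall number).

package main

import (
	"fmt"
	"syscall"
	"unsafe"
)

const (
	sysSeccomp            = 317        // SYS_SECCOMP on amd64; not exported by package syscall.
	seccompGetActionAvail = 2          // SECCOMP_GET_ACTION_AVAIL.
	retKillProcess        = 0x80000000 // SECCOMP_RET_KILL_PROCESS (kernel >= 4.14).
	retTrap               = 0x00030000 // SECCOMP_RET_TRAP.
)

// defaultAction returns the strictest violation action this kernel supports:
// RET_KILL_PROCESS when the probe succeeds, RET_TRAP otherwise. EINVAL means
// the kernel predates SECCOMP_GET_ACTION_AVAIL; EOPNOTSUPP means the action
// itself is unsupported.
func defaultAction() uint32 {
	action := uint32(retKillProcess)
	if _, _, errno := syscall.RawSyscall(sysSeccomp, seccompGetActionAvail, 0, uintptr(unsafe.Pointer(&action))); errno != 0 {
		return retTrap
	}
	return retKillProcess
}

func main() {
	fmt.Printf("seccomp default action: %#x\n", defaultAction())
}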
PiperOrigin-RevId: 222357867 Change-Id: Icc1d7d731274b16c2125b7a1ba4f7883fbdb2cbd --- pkg/abi/linux/seccomp.go | 12 +++--- pkg/seccomp/seccomp.go | 52 ++++++++++++++++++++++---- pkg/seccomp/seccomp_test.go | 20 +++++++--- pkg/seccomp/seccomp_test_victim.go | 2 +- pkg/seccomp/seccomp_unsafe.go | 30 ++++++++++++--- pkg/sentry/kernel/seccomp.go | 4 +- pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +- runsc/boot/filter/filter.go | 3 +- runsc/fsgofer/filter/filter.go | 3 +- 9 files changed, 95 insertions(+), 33 deletions(-) (limited to 'runsc') diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index 9963ceeba..5ec01cc4a 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -19,17 +19,19 @@ const ( SECCOMP_MODE_NONE = 0 SECCOMP_MODE_FILTER = 2 - SECCOMP_RET_KILL = 0x00000000 - SECCOMP_RET_TRAP = 0x00030000 - SECCOMP_RET_ERRNO = 0x00050000 - SECCOMP_RET_TRACE = 0x7ff00000 - SECCOMP_RET_ALLOW = 0x7fff0000 + SECCOMP_RET_KILL_PROCESS = 0x80000000 + SECCOMP_RET_KILL_THREAD = 0x00000000 + SECCOMP_RET_TRAP = 0x00030000 + SECCOMP_RET_ERRNO = 0x00050000 + SECCOMP_RET_TRACE = 0x7ff00000 + SECCOMP_RET_ALLOW = 0x7fff0000 SECCOMP_RET_ACTION = 0x7fff0000 SECCOMP_RET_DATA = 0x0000ffff SECCOMP_SET_MODE_FILTER = 1 SECCOMP_FILTER_FLAG_TSYNC = 1 + SECCOMP_GET_ACTION_AVAIL = 2 ) const ( diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 1dfbf749e..9d714d02d 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -33,17 +33,42 @@ const ( defaultLabel = "default_action" ) +func actionName(a uint32) string { + switch a { + case linux.SECCOMP_RET_KILL_PROCESS: + return "kill process" + case linux.SECCOMP_RET_TRAP: + return "trap" + } + panic(fmt.Sprintf("invalid action: %d", a)) +} + // Install generates BPF code based on the set of syscalls provided. It only -// allows syscalls that conform to the specification and generates SIGSYS -// trap unless kill is set. +// allows syscalls that conform to the specification. Syscalls that violate the +// specification will trigger RET_KILL_PROCESS, except for the cases below. +// +// RET_TRAP is used in violations, instead of RET_KILL_PROCESS, in the +// following cases: +// 1. Kernel doesn't support RET_KILL_PROCESS: RET_KILL_THREAD only kills the +// offending thread and often keeps the sentry hanging. +// 2. Debug: RET_TRAP generates a panic followed by a stack trace which is +// much easier to debug then RET_KILL_PROCESS which can't be caught. // -// This is a convenience wrapper around BuildProgram and SetFilter. -func Install(rules SyscallRules, kill bool) error { - log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill) - defaultAction := uint32(linux.SECCOMP_RET_TRAP) - if kill { - defaultAction = uint32(linux.SECCOMP_RET_KILL) +// Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored, +// making it possible for the process to continue running after a violation. +// However, it will leave a SECCOMP audit event trail behind. In any case, the +// syscall is still blocked from executing. +func Install(rules SyscallRules) error { + defaultAction, err := defaultAction() + if err != nil { + return err } + + // Uncomment to get stack trace when there is a violation. 
+ // defaultAction = uint32(linux.SECCOMP_RET_TRAP) + + log.Infof("Installing seccomp filters for %d syscalls (action=%s)", len(rules), actionName(defaultAction)) + instrs, err := BuildProgram([]RuleSet{ RuleSet{ Rules: rules, @@ -70,6 +95,17 @@ func Install(rules SyscallRules, kill bool) error { return nil } +func defaultAction() (uint32, error) { + available, err := isKillProcessAvailable() + if err != nil { + return 0, err + } + if available { + return uint32(linux.SECCOMP_RET_KILL_PROCESS), nil + } + return uint32(linux.SECCOMP_RET_TRAP), nil +} + // RuleSet is a set of rules and associated action. type RuleSet struct { Rules SyscallRules diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 226f30b7b..f2b903e42 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -121,7 +121,7 @@ func TestBasic(t *testing.T) { Action: linux.SECCOMP_RET_TRAP, }, }, - defaultAction: linux.SECCOMP_RET_KILL, + defaultAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { desc: "Multiple rulesets allowed (1a)", @@ -141,7 +141,7 @@ func TestBasic(t *testing.T) { { desc: "Multiple rulesets allowed (2)", data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64}, - want: linux.SECCOMP_RET_KILL, + want: linux.SECCOMP_RET_KILL_THREAD, }, }, }, @@ -431,15 +431,23 @@ func TestRealDeal(t *testing.T) { t.Errorf("victim was not killed as expected, output: %s", out) continue } + // Depending on kernel version, either RET_TRAP or RET_KILL_PROCESS is + // used. RET_TRAP dumps reason for exit in output, while RET_KILL_PROCESS + // returns SIGSYS as exit status. + if !strings.Contains(string(out), test.want) && + !strings.Contains(err.Error(), test.want) { + t.Errorf("Victim error is wrong, got: %v, err: %v, want: %v", string(out), err, test.want) + continue + } } else { if err != nil { t.Errorf("victim failed to execute, err: %v", err) continue } - } - if !strings.Contains(string(out), test.want) { - t.Errorf("Victim output is wrong, got: %v, want: %v", err, test.want) - continue + if !strings.Contains(string(out), test.want) { + t.Errorf("Victim output is wrong, got: %v, want: %v", string(out), test.want) + continue + } } } } diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index 007038273..dd5ed0041 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -106,7 +106,7 @@ func main() { } } - if err := seccomp.Install(syscalls, false); err != nil { + if err := seccomp.Install(syscalls); err != nil { fmt.Printf("Failed to install seccomp: %v", err) os.Exit(1) } diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index dd009221a..a31c6471d 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -36,22 +36,40 @@ type sockFprog struct { // //go:nosplit func SetFilter(instrs []linux.BPFInstruction) syscall.Errno { - // SYS_SECCOMP is not available in syscall package. - const SYS_SECCOMP = 317 - // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details. if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 { return errno } - // TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available. 
sockProg := sockFprog{ Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } - if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 { - return errno + return seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) +} + +func isKillProcessAvailable() (bool, error) { + action := uint32(linux.SECCOMP_RET_KILL_PROCESS) + if errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { + // EINVAL: SECCOMP_GET_ACTION_AVAIL not in this kernel yet. + // EOPNOTSUPP: SECCOMP_RET_KILL_PROCESS not supported. + if errno == syscall.EINVAL || errno == syscall.EOPNOTSUPP { + return false, nil + } + return false, errno } + return true, nil +} +// seccomp calls seccomp(2). This is safe to call from an afterFork context. +// +//go:nosplit +func seccomp(op, flags uint32, ptr unsafe.Pointer) syscall.Errno { + // SYS_SECCOMP is not available in syscall package. + const SYS_SECCOMP = 317 + + if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)); errno != 0 { + return errno + } return 0 } diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 433b900c7..d6dc45bbd 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -117,7 +117,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u // "Results in the system call being executed." return seccompResultAllow - case linux.SECCOMP_RET_KILL: + case linux.SECCOMP_RET_KILL_THREAD: // "Results in the task exiting immediately without executing the // system call. The exit status of the task will be SIGSYS, not // SIGKILL." @@ -155,7 +155,7 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) - thisRet = linux.SECCOMP_RET_KILL + thisRet = linux.SECCOMP_RET_KILL_THREAD } // "If multiple filters exist, the return value for the evaluation of a // given system call will always use the highest precedent value." - diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 885ba4b2e..25b8e8cb7 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -115,7 +115,7 @@ func createStub() (*thread, error) { var defaultAction uint32 if probeSeccomp() { log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") - defaultAction = uint32(linux.SECCOMP_RET_KILL) + defaultAction = uint32(linux.SECCOMP_RET_KILL_THREAD) } else { // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index dc7294b1d..d69a6a2cc 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -57,8 +57,7 @@ func Install(opt Options) error { return fmt.Errorf("unknown platform type %T", p) } - // TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported. - return seccomp.Install(s, false) + return seccomp.Install(s) } // Report writes a warning message to the log. 
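Both this call site and the fsgofer one below now reduce to the same one-argument form. A minimal caller-side sketch, assuming the usual "fmt" and pkg/seccomp imports and an already-populated rules value; installFilters is a hypothetical wrapper, not code from the patch:

// installFilters shows the simplified API: callers pass only the rules, and
// Install itself decides between RET_KILL_PROCESS and RET_TRAP based on what
// the kernel supports.
func installFilters(rules seccomp.SyscallRules) error {
	if err := seccomp.Install(rules); err != nil {
		return fmt.Errorf("installing seccomp filters: %v", err)
	}
	return nil
}

Centralizing the action choice inside Install keeps the policy in one place, so the per-caller kill/trap flag, and the matching TODOs, can be dropped.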
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index f50b6bc87..c120d57a6 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -29,6 +29,5 @@ func Install() error { // when not enabled. s.Merge(instrumentationFilters()) - // TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported. - return seccomp.Install(s, false) + return seccomp.Install(s) } -- cgit v1.2.3 From 071aeea9d3ff783b2946ef291b1c440aa9b21b88 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 27 Nov 2018 09:24:17 -0800 Subject: Disable crictl tests gvisor-containerd-shim installation is currently broken. PiperOrigin-RevId: 223002877 Change-Id: I2b890c5bf602a96c475c3805f24852ead8593a35 --- kokoro/run_tests.sh | 9 +++++---- runsc/test/root/crictl_test.go | 9 +++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index bfdb3fe09..fb9a7f300 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -100,10 +100,11 @@ EOF ) # Install containerd and crictl. -if [[ ${exit_code} -eq 0 ]]; then - installCrictl - exit_code=${?} -fi +# FIXME: gvisor-containerd-shim installation broken. +#if [[ ${exit_code} -eq 0 ]]; then +# installCrictl +# exit_code=${?} +#fi # Execute local tests that require docker. if [[ ${exit_code} -eq 0 ]]; then diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go index 556d95fff..45cbec6b5 100644 --- a/runsc/test/root/crictl_test.go +++ b/runsc/test/root/crictl_test.go @@ -37,6 +37,9 @@ import ( // Tests for crictl have to be run as root (rather than in a user namespace) // because crictl creates named network namespaces in /var/run/netns/. func TestCrictlSanity(t *testing.T) { + // FIXME + t.Skip("crictl installation broken") + // Setup containerd and crictl. crictl, cleanup, err := setup(t) if err != nil { @@ -59,6 +62,9 @@ func TestCrictlSanity(t *testing.T) { } } func TestMountPaths(t *testing.T) { + // FIXME + t.Skip("crictl installation broken") + // Setup containerd and crictl. crictl, cleanup, err := setup(t) if err != nil { @@ -81,6 +87,9 @@ func TestMountPaths(t *testing.T) { } } func TestMountOverSymlinks(t *testing.T) { + // FIXME + t.Skip("crictl installation broken") + // Setup containerd and crictl. crictl, cleanup, err := setup(t) if err != nil { -- cgit v1.2.3 From 7b86d36a63e9b281834fdb6c2db0840df992c57c Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 28 Nov 2018 10:09:22 -0800 Subject: Fix crictl tests. gvisor-containerd-shim moved. It now has a stable URL that run_tests.sh always uses. PiperOrigin-RevId: 223188822 Change-Id: I5687c78289404da27becd8d5949371e580fdb360 --- kokoro/run_tests.sh | 13 +++++++------ runsc/test/root/crictl_test.go | 9 --------- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index fb9a7f300..927acb6a1 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -75,8 +75,10 @@ installCrictl() ( sudo -n -E make install # Install gvisor-containerd-shim. 
+ local latest=/tmp/gvisor-containerd-shim-latest local shim_path=/tmp/gvisor-containerd-shim - wget https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/gvisor-containerd-shim -O ${shim_path} + wget https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/latest -O ${latest} + wget https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/$(cat ${latest}) -O ${shim_path} chmod +x ${shim_path} sudo -n -E mv ${shim_path} /usr/local/bin @@ -100,11 +102,10 @@ EOF ) # Install containerd and crictl. -# FIXME: gvisor-containerd-shim installation broken. -#if [[ ${exit_code} -eq 0 ]]; then -# installCrictl -# exit_code=${?} -#fi +if [[ ${exit_code} -eq 0 ]]; then + installCrictl + exit_code=${?} +fi # Execute local tests that require docker. if [[ ${exit_code} -eq 0 ]]; then diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go index 45cbec6b5..556d95fff 100644 --- a/runsc/test/root/crictl_test.go +++ b/runsc/test/root/crictl_test.go @@ -37,9 +37,6 @@ import ( // Tests for crictl have to be run as root (rather than in a user namespace) // because crictl creates named network namespaces in /var/run/netns/. func TestCrictlSanity(t *testing.T) { - // FIXME - t.Skip("crictl installation broken") - // Setup containerd and crictl. crictl, cleanup, err := setup(t) if err != nil { @@ -62,9 +59,6 @@ func TestCrictlSanity(t *testing.T) { } } func TestMountPaths(t *testing.T) { - // FIXME - t.Skip("crictl installation broken") - // Setup containerd and crictl. crictl, cleanup, err := setup(t) if err != nil { @@ -87,9 +81,6 @@ func TestMountPaths(t *testing.T) { } } func TestMountOverSymlinks(t *testing.T) { - // FIXME - t.Skip("crictl installation broken") - // Setup containerd and crictl. crictl, cleanup, err := setup(t) if err != nil { -- cgit v1.2.3 From 4d0da37cbb81292c66a8d7a7b8d5658450a847f5 Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 28 Nov 2018 14:00:54 -0800 Subject: Internal change. 
PiperOrigin-RevId: 223231273 Change-Id: I8fb97ea91f7507b4918f7ce6562890611513fc30 --- runsc/cmd/boot.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/create.go | 1 + runsc/cmd/debug.go | 2 +- runsc/cmd/delete.go | 2 +- runsc/cmd/events.go | 2 +- runsc/cmd/exec.go | 2 +- runsc/cmd/gofer.go | 2 +- runsc/cmd/kill.go | 2 +- runsc/cmd/list.go | 2 +- runsc/cmd/pause.go | 1 + runsc/cmd/restore.go | 2 +- runsc/cmd/resume.go | 1 + runsc/cmd/run.go | 2 +- runsc/cmd/spec.go | 2 +- runsc/cmd/start.go | 1 + runsc/cmd/state.go | 2 +- runsc/cmd/wait.go | 2 +- runsc/main.go | 2 +- runsc/tools/dockercfg/dockercfg.go | 2 +- 20 files changed, 20 insertions(+), 16 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 7c14857ba..5f5dca109 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -15,12 +15,12 @@ package cmd import ( + "context" "os" "runtime/debug" "strings" "syscall" - "context" "flag" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index d49d0169b..4f4771da2 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -15,11 +15,11 @@ package cmd import ( + "context" "os" "path/filepath" "syscall" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/log" diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index b84185b43..d187b8592 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 288cbe435..de530c068 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -15,9 +15,9 @@ package cmd import ( + "context" "syscall" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/log" diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index ea1ca1278..8c7c7a5cd 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -15,10 +15,10 @@ package cmd import ( + "context" "fmt" "os" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/log" diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index df03415ec..a54856fb4 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -15,11 +15,11 @@ package cmd import ( + "context" "encoding/json" "os" "time" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/log" diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 35aa5499e..548207222 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -15,6 +15,7 @@ package cmd import ( + "context" "encoding/json" "fmt" "io/ioutil" @@ -26,7 +27,6 @@ import ( "syscall" "time" - "context" "flag" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 4ec3dba9c..7276f3f26 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -15,11 +15,11 @@ package cmd import ( + "context" "os" "sync" "syscall" - "context" "flag" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 1f1086250..7d86bb043 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -15,12 +15,12 @@ package cmd import ( + "context" "fmt" "strconv" "strings" "syscall" - "context" "flag" "github.com/google/subcommands" "golang.org/x/sys/unix" diff --git a/runsc/cmd/list.go 
b/runsc/cmd/list.go index fd59b73e6..acefcb2db 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -15,13 +15,13 @@ package cmd import ( + "context" "encoding/json" "fmt" "os" "text/tabwriter" "time" - "context" "flag" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index 5ff6f059c..ee608faba 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index cc99b3503..64b302b0c 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -15,10 +15,10 @@ package cmd import ( + "context" "path/filepath" "syscall" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index 274b5d084..e684aeb5c 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index b6a12f5d6..9a574679f 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -15,9 +15,9 @@ package cmd import ( + "context" "syscall" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index 57ee37c86..ee306bfa6 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -15,11 +15,11 @@ package cmd import ( + "context" "io/ioutil" "os" "path/filepath" - "context" "flag" "github.com/google/subcommands" ) diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 48bd4c401..065efec06 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index f8ce8c3d8..15e27b250 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -15,10 +15,10 @@ package cmd import ( + "context" "encoding/json" "os" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/pkg/log" diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 121c54554..1e1c1fe17 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -15,11 +15,11 @@ package cmd import ( + "context" "encoding/json" "os" "syscall" - "context" "flag" "github.com/google/subcommands" "gvisor.googlesource.com/gvisor/runsc/boot" diff --git a/runsc/main.go b/runsc/main.go index c0ee04216..81c36067b 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -17,13 +17,13 @@ package main import ( + "context" "io" "os" "path/filepath" "strings" "syscall" - "context" "flag" "github.com/google/subcommands" diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go index 110a581ff..cc7a67816 100644 --- a/runsc/tools/dockercfg/dockercfg.go +++ b/runsc/tools/dockercfg/dockercfg.go @@ -16,13 +16,13 @@ package main import ( + "context" "encoding/json" "fmt" "io/ioutil" "log" "os" - "context" "flag" "github.com/google/subcommands" ) -- cgit v1.2.3 From 613899f852510e8d2e0fd3d87824151a82e43332 Mon Sep 17 00:00:00 2001 From: Googler Date: Mon, 3 Dec 2018 17:26:53 -0800 Subject: Internal change. 
PiperOrigin-RevId: 223893409 Change-Id: I58869c7fb0012f6c3f7612a96cb649348b56335f --- runsc/container/BUILD | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index bdd93aaba..f57af582a 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -40,6 +40,7 @@ go_test( "//runsc", ], embed = [":container"], + shard_count = 5, tags = [ "requires-kvm", ], -- cgit v1.2.3 From 82719be42e636f86780d21b01e10ecb2c9a25e53 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 4 Dec 2018 14:31:08 -0800 Subject: Max link traversals should be for an entire path. The number of symbolic links that are allowed to be followed are for a full path and not just a chain of symbolic links. PiperOrigin-RevId: 224047321 Change-Id: I5e3c4caf66a93c17eeddcc7f046d1e8bb9434a40 --- pkg/sentry/fs/copy_up_test.go | 3 ++- pkg/sentry/fs/host/fs.go | 3 ++- pkg/sentry/fs/host/fs_test.go | 3 ++- pkg/sentry/fs/inode_overlay_test.go | 3 ++- pkg/sentry/fs/mount_test.go | 10 +++++++--- pkg/sentry/fs/mounts.go | 22 ++++++++++++---------- pkg/sentry/fs/mounts_test.go | 6 ++++-- pkg/sentry/fs/ramfs/tree_test.go | 3 ++- pkg/sentry/kernel/kernel.go | 6 ++++-- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/loader/elf.go | 2 +- pkg/sentry/loader/loader.go | 10 +++++----- pkg/sentry/socket/unix/unix.go | 6 ++++-- pkg/sentry/syscalls/linux/sys_file.go | 11 +++++++---- pkg/sentry/syscalls/linux/sys_thread.go | 3 ++- runsc/boot/fs.go | 12 ++++++++---- runsc/boot/loader_test.go | 3 ++- 17 files changed, 67 insertions(+), 41 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 64f030f72..fcba14ed4 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -166,7 +166,8 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { // Walk to all of the files in the overlay, open them readable. for _, f := range files { - d, err := mns.FindInode(ctx, mns.Root(), mns.Root(), f.name, 0) + maxTraversals := uint(0) + d, err := mns.FindInode(ctx, mns.Root(), mns.Root(), f.name, &maxTraversals) if err != nil { t.Fatalf("failed to find %q: %v", f.name, err) } diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index fec890964..54cbb94f9 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -170,7 +170,8 @@ func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) current := paths[i][:j] // Lookup the given component in the tree. 
- d, err := m.FindLink(ctx, root, nil, current, maxTraversals) + remainingTraversals := uint(maxTraversals) + d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals) if err != nil { log.Warningf("populate failed for %q: %v", current, err) continue diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index e69559aac..44db61ecd 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -150,7 +150,8 @@ func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base stri root := m.Root() defer root.DecRef() - d, err := m.FindLink(ctx, root, nil, base, 1) + maxTraversals := uint(1) + d, err := m.FindLink(ctx, root, nil, base, &maxTraversals) if err != nil { t.Logf("FindLink failed for %q", base) return paths, err diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index bba20da14..acdb2b4f8 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -324,7 +324,8 @@ func TestCacheFlush(t *testing.T) { for _, fileName := range []string{upperFileName, lowerFileName} { // Walk to the file. - dirent, err := mns.FindInode(ctx, root, nil, fileName, 0) + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, nil, fileName, &maxTraversals) if err != nil { t.Fatalf("FindInode(%q) failed: %v", fileName, err) } diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index a1c9f4f79..269d6b9da 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -115,8 +115,10 @@ func TestMountSourceParentChildRelationship(t *testing.T) { "/waldo", } + var maxTraversals uint for _, p := range paths { - d, err := mm.FindLink(ctx, rootDirent, nil, p, 0) + maxTraversals = 0 + d, err := mm.FindLink(ctx, rootDirent, nil, p, &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", p, err) } @@ -164,7 +166,8 @@ func TestMountSourceParentChildRelationship(t *testing.T) { } // "foo" mount should have two children: /foo/bar, and /foo/qux. - d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", 0) + maxTraversals = 0 + d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } @@ -185,7 +188,8 @@ func TestMountSourceParentChildRelationship(t *testing.T) { } // "waldo" mount should have no submounts or children. - waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", 0) + maxTraversals = 0 + waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 7c5348cce..f6f7be0aa 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -350,7 +350,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly // // Precondition: root must be non-nil. // Precondition: the path must be non-empty. -func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { +func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) { if root == nil { panic("MountNamespace.FindLink: root must not be nil") } @@ -419,7 +419,7 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // // See resolve for reference semantics; on err next // will have one dropped. 
- current, err = mns.resolve(ctx, root, next, maxTraversals) + current, err = mns.resolve(ctx, root, next, remainingTraversals) if err != nil { return nil, err } @@ -439,15 +439,15 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // FindInode is identical to FindLink except the return value is resolved. // //go:nosplit -func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { - d, err := mns.FindLink(ctx, root, wd, path, maxTraversals) +func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) { + d, err := mns.FindLink(ctx, root, wd, path, remainingTraversals) if err != nil { return nil, err } // See resolve for reference semantics; on err d will have the // reference dropped. - return mns.resolve(ctx, root, d, maxTraversals) + return mns.resolve(ctx, root, d, remainingTraversals) } // resolve resolves the given link. @@ -458,14 +458,14 @@ func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path // If not successful, a reference is _also_ dropped on the node and an error // returned. This is for convenience in using resolve directly as a return // value. -func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxTraversals uint) (*Dirent, error) { +func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, remainingTraversals *uint) (*Dirent, error) { // Resolve the path. target, err := node.Inode.Getlink(ctx) switch err { case nil: // Make sure we didn't exhaust the traversal budget. - if maxTraversals == 0 { + if *remainingTraversals == 0 { target.DecRef() return nil, syscall.ELOOP } @@ -481,7 +481,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxT defer node.DecRef() // See above. // First, check if we should traverse. - if maxTraversals == 0 { + if *remainingTraversals == 0 { return nil, syscall.ELOOP } @@ -492,7 +492,8 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxT } // Find the node; we resolve relative to the current symlink's parent. - d, err := mns.FindInode(ctx, root, node.parent, targetPath, maxTraversals-1) + *remainingTraversals-- + d, err := mns.FindInode(ctx, root, node.parent, targetPath, remainingTraversals) if err != nil { return nil, err } @@ -544,7 +545,8 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s defer root.DecRef() for _, p := range paths { binPath := path.Join(p, name) - d, err := mns.FindInode(ctx, root, nil, binPath, linux.MaxSymlinkTraversals) + traversals := uint(linux.MaxSymlinkTraversals) + d, err := mns.FindInode(ctx, root, nil, binPath, &traversals) if err == syserror.ENOENT || err == syserror.EACCES { // Didn't find it here. 
continue diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index cc7c32c9b..2f7a1710f 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -77,7 +77,8 @@ func TestFindLink(t *testing.T) { {"bar", foo, "/foo/bar"}, } { wdPath, _ := tc.wd.FullName(root) - if d, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err != nil { + maxTraversals := uint(0) + if d, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, &maxTraversals); err != nil { t.Errorf("FindLink(%q, wd=%q) failed: %v", tc.findPath, wdPath, err) } else if got, _ := d.FullName(root); got != tc.wantPath { t.Errorf("FindLink(%q, wd=%q) got dirent %q, want %q", tc.findPath, wdPath, got, tc.wantPath) @@ -95,7 +96,8 @@ func TestFindLink(t *testing.T) { {"foo", foo}, } { wdPath, _ := tc.wd.FullName(root) - if _, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err == nil { + maxTraversals := uint(0) + if _, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, &maxTraversals); err == nil { t.Errorf("FindLink(%q, wd=%q) did not return error", tc.findPath, wdPath) } } diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index d5567d9e1..54df2143c 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -70,7 +70,8 @@ func TestMakeDirectoryTree(t *testing.T) { defer mm.DecRef() for _, p := range test.subdirs { - if _, err := mm.FindInode(ctx, root, nil, p, 0); err != nil { + maxTraversals := uint(0) + if _, err := mm.FindInode(ctx, root, nil, p, &maxTraversals); err != nil { t.Errorf("%s: failed to find node %s: %v", test.name, p, err) break } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 17425e656..cb61e27f1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -634,10 +634,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, args.Root = nil // Grab the working directory. + remainingTraversals := uint(args.MaxSymlinkTraversals) wd := root // Default. if args.WorkingDirectory != "" { var err error - wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals) + wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } @@ -656,7 +657,8 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, } // Create a fresh task context. 
- tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + remainingTraversals = uint(args.MaxSymlinkTraversals) + tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) if err != nil { return nil, 0, err } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 45b8d2b04..aaff309f0 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -142,7 +142,7 @@ func (t *Task) Stack() *arch.Stack { // * argv: Binary argv // * envv: Binary envv // * fs: Binary FeatureSet -func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { // Prepare a new user address space to load into. m := mm.NewMemoryManager(k) defer m.DecUsers(ctx) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 9b1e81dc9..385ad0102 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -610,7 +610,7 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, in // // Preconditions: // * f is an ELF file -func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { +func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { bin, ac, err := loadInitialELF(ctx, m, fs, f) if err != nil { ctx.Infof("Error loading binary: %v", err) diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index d1417c4f1..69a090844 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -55,7 +55,7 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // installed in the Task FDMap. The caller takes ownership of both. // // name must be a readable, executable, regular file. 
-func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, name string) (*fs.Dirent, *fs.File, error) { +func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) { if name == "" { ctx.Infof("cannot open empty name") return nil, nil, syserror.ENOENT @@ -136,9 +136,9 @@ const ( // * arch.Context matching the binary arch // * fs.Dirent of the binary file // * Possibly updated argv -func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { - d, f, err := openPath(ctx, mounts, root, wd, maxTraversals, filename) + d, f, err := openPath(ctx, mounts, root, wd, remainingTraversals, filename) if err != nil { ctx.Infof("Error opening %s: %v", filename, err) return loadedELF{}, nil, nil, nil, err @@ -163,7 +163,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac switch { case bytes.Equal(hdr[:], []byte(elfMagic)): - loaded, ac, err := loadELF(ctx, m, mounts, root, wd, maxTraversals, fs, f) + loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, fs, f) if err != nil { ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err @@ -196,7 +196,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac // Preconditions: // * The Task MemoryManager is empty. // * Load is called on the Task goroutine. -func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) { +func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) { // Load the binary itself. loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv) if err != nil { diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 334169372..4379486cf 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -266,7 +266,8 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { subPath = "/" } var err error - d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, fs.DefaultTraversalLimit) + remainingTraversals := uint(fs.DefaultTraversalLimit) + d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, &remainingTraversals) if err != nil { // No path available. return syserr.ErrNoSuchFile @@ -314,7 +315,8 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, // Find the node in the filesystem. 
root := t.FSContext().RootDirectory() cwd := t.FSContext().WorkingDirectory() - d, e := t.MountNamespace().FindInode(t, root, cwd, path, fs.DefaultTraversalLimit) + remainingTraversals := uint(fs.DefaultTraversalLimit) + d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) cwd.DecRef() root.DecRef() if e != nil { diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 89d21dd98..37c90f6fd 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -92,10 +92,11 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func root := t.FSContext().RootDirectory() // Lookup the node. + remainingTraversals := uint(linux.MaxSymlinkTraversals) if resolve { - d, err = t.MountNamespace().FindInode(t, root, rel, path, linux.MaxSymlinkTraversals) + d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals) } else { - d, err = t.MountNamespace().FindLink(t, root, rel, path, linux.MaxSymlinkTraversals) + d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) } root.DecRef() if wd != nil { @@ -312,7 +313,8 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod fileFlags.LargeFile = true // Does this file exist already? - targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + remainingTraversals := uint(linux.MaxSymlinkTraversals) + targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) var newFile *fs.File switch err { case nil: @@ -997,7 +999,8 @@ func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileM } // Does this directory exist already? - f, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + remainingTraversals := uint(linux.MaxSymlinkTraversals) + f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) switch err { case nil: // The directory existed. diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 9eed613a1..c12693ee2 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -103,7 +103,8 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal defer wd.DecRef() // Load the new TaskContext. - tc, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, linux.MaxSymlinkTraversals, filename, argv, envv, t.Arch().FeatureSet()) + maxTraversals := uint(linux.MaxSymlinkTraversals) + tc, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet()) if err != nil { return 0, nil, err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 1e355fe4e..1e75b0efc 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -338,7 +338,8 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, ro } } - dirent, err := mns.FindInode(ctx, root, root, m.Destination, 0 /* maxTraversals */) + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) if err != nil { return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) } @@ -582,7 +583,8 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf defer globalRoot.DecRef() // Create mount point for the container's rootfs. 
- contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, 0 /* TraversalLimit */) + maxTraversals := uint(0) + contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals) if err != nil { return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err) } @@ -656,7 +658,8 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error mnsRoot := mns.Root() defer mnsRoot.DecRef() containerRoot := path.Join(ChildContainersDir, cid) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, 0 /* maxTraversals */) + maxTraversals := uint(0) + containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals) if err == syserror.ENOENT { // Container must have been destroyed already. That's fine. return nil @@ -691,7 +694,8 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // Get a reference to the parent directory and remove the root // container directory. - containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, 0 /* maxTraversals */) + maxTraversals = 0 + containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals) if err != nil { return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index d5cee5608..0ed3002e0 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -406,7 +406,8 @@ func TestCreateMountNamespace(t *testing.T) { root := mm.Root() defer root.DecRef() for _, p := range tc.expectedPaths { - if d, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + maxTraversals := uint(0) + if d, err := mm.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { d.DecRef() -- cgit v1.2.3 From 1b1a42ba6dc7953db742959a54fd19124348f3fc Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 6 Dec 2018 15:26:58 -0800 Subject: A sandbox process should wait until it has been moved into cgroups PiperOrigin-RevId: 224418900 Change-Id: I53cf4d7c1c70117875b6920f8fd3d58a3b1497e9 --- runsc/cmd/boot.go | 24 +++++++++++++++++- runsc/sandbox/sandbox.go | 65 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 23 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 5f5dca109..192df7d3c 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -69,6 +69,9 @@ type Boot struct { // userLogFD is the file descriptor to write user logs to. userLogFD int + + // startSyncFD is the file descriptor to synchronize runsc and sandbox. + startSyncFD int } // Name implements subcommands.Command.Name. @@ -99,12 +102,13 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") + f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD used to synchronize sandbox startup") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a // waiting state.
func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if b.specFD == -1 || b.controllerFD == -1 || f.NArg() != 1 { + if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } @@ -155,6 +159,14 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("setCapsAndCallSelf must never return success") } + // Wait until this process has been moved into cgroups. + startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") + defer startSyncFile.Close() + buf := make([]byte, 1) + if r, err := startSyncFile.Read(buf); err != nil || r != 1 { + Fatalf("Unable to read from the start-sync descriptor: %v", err) + } + // Create the loader. bootArgs := boot.Args{ ID: f.Arg(0), @@ -173,9 +185,19 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error creating loader: %v", err) } + // Fatalf exits the process and doesn't run defers. 'l' must be destroyed // explicitly! + // Notify the parent process the controller has been created. + if w, err := startSyncFile.Write(buf); err != nil || w != 1 { + l.Destroy() + Fatalf("Unable to write into the start-sync descriptor: %v", err) + } + // startSyncFile is closed here to be sure that starting with this point + // the runsc process will not write anything into it. + startSyncFile.Close() + // Notify other processes the loader has been created. l.NotifyLoaderCreated() diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 3f00eba94..0798aef9b 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -60,7 +60,7 @@ type Sandbox struct { // is running in. Chroot string `json:"chroot"` - // Ccroup has the cgroup configuration for the sandbox. + // Cgroup has the cgroup configuration for the sandbox. Cgroup *cgroup.Cgroup `json:"cgroup"` } @@ -69,7 +69,7 @@ type Sandbox struct { func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} // The Cleanup object cleans up partially created sandboxes when an error occurs. - // Any errors occuring during cleanup itself are ignored. + // Any errors occurring during cleanup itself are ignored. c := specutils.MakeCleanup(func() { _ = s.destroy() }) defer c.Clean() @@ -82,13 +82,25 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } } - // Create the sandbox process. - if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles); err != nil { - return nil, err + // Create a socket pair to synchronize runsc and sandbox processes. + // It is used for the following: + // * to notify the sandbox process when it has been moved into cgroups. + // * to wait for the controller socket. + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0) + if err != nil { + return nil, fmt.Errorf("error creating a start-sync socket pair %q: %v", s.ID, err) } + startSyncFile := os.NewFile(uintptr(fds[0]), "start-sync socket") + defer startSyncFile.Close() - // Wait for the control server to come up (or timeout). - if err := s.waitForCreated(20 * time.Second); err != nil { + sandboxSyncFile := os.NewFile(uintptr(fds[1]), "sandbox start-sync socket") + + // Create the sandbox process. 
+ err = s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, sandboxSyncFile) + // sandboxSyncFile has to be closed to be able to detect + // when the sandbox process exits unexpectedly. + sandboxSyncFile.Close() + if err != nil { return nil, err } @@ -98,6 +110,24 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } } + b := make([]byte, 1) + // Notify the sandbox process it has been moved into cgroups. + if l, err := startSyncFile.Write(b); err != nil || l != 1 { + return nil, fmt.Errorf("error writing into the start-sync descriptor: %v", err) + } + // Wait until the sandbox process has initialized the controller socket. + if l, err := startSyncFile.Read(b); err != nil || l != 1 { + return nil, fmt.Errorf("error reading from the start-sync descriptor: %v", err) + } + // startSyncFile is closed here to be sure that starting with this point + // the sandbox process will not write anything into it. + startSyncFile.Close() + + // Wait for the control server to come up. + if err := s.waitForCreated(); err != nil { + return nil, err + } + c.Release() return s, nil } @@ -282,7 +312,7 @@ func (s *Sandbox) connError(err error) error { // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, startSyncFile *os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -346,6 +376,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD)) nextFD++ + cmd.ExtraFiles = append(cmd.ExtraFiles, startSyncFile) + cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD)) + nextFD++ + // If there is a gofer, sends all socket ends to the sandbox. for _, f := range ioFiles { defer f.Close() @@ -581,21 +615,8 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // waitForCreated waits for the sandbox subprocess control server to be // running and for the loader to have been created, at which point the sandbox // is in Created state. -func (s *Sandbox) waitForCreated(timeout time.Duration) error { +func (s *Sandbox) waitForCreated() error { log.Debugf("Waiting for sandbox %q creation", s.ID) - - ready := func() (bool, error) { - c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) - if err != nil { - return false, nil - } - // It's alive! - c.Close() - return true, nil - } - if err := specutils.WaitForReady(s.Pid, timeout, ready); err != nil { - return fmt.Errorf("unexpected error waiting for sandbox %q, err: %v", s.ID, err) - } conn, err := s.sandboxConnect() if err != nil { return err -- cgit v1.2.3 From 9984138abee51d6145469f9298bfeb8a98589709 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Fri, 7 Dec 2018 17:03:06 -0800 Subject: sentry: turn "dynamically-created" procfs files into static creation. 
PiperOrigin-RevId: 224600982 Change-Id: I547253528e24fb0bb318fc9d2632cb80504acb34 --- pkg/sentry/fs/proc/proc.go | 33 +++++++++++++-------------------- runsc/boot/controller.go | 4 ++++ 2 files changed, 17 insertions(+), 20 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index b658cd328..70e549c31 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -85,8 +85,6 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { p := &proc{k: k, pidns: pidns} p.InitDir(ctx, map[string]*fs.Inode{ - // Note that these are just the static members. There are - // dynamic members populated in Readdir and Lookup below. "filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), @@ -96,12 +94,23 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { }, fs.RootOwner, fs.FilePermsFromMode(0555)) p.AddChild(ctx, "cpuinfo", p.newCPUInfo(ctx, msrc)) + // If we're using rpcinet we will let it manage /proc/net. + if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { + p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc)) + } else { + p.AddChild(ctx, "net", p.newNetDir(ctx, msrc)) + } + p.AddChild(ctx, "self", p.newSelf(ctx, msrc)) + p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) + p.AddChild(ctx, "thread-self", p.newThreadSelf(ctx, msrc)) p.AddChild(ctx, "uptime", p.newUptime(ctx, msrc)) return newFile(p, msrc, fs.SpecialDirectory, nil), nil } // self is a magical link. +// +// +stateify savable type self struct { ramfs.Symlink @@ -146,6 +155,8 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { } // threadSelf is more magical than "self" link. +// +// +stateify savable type threadSelf struct { ramfs.Symlink @@ -169,29 +180,11 @@ func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, err // Lookup loads an Inode at name into a Dirent. func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - // Is it one of the static ones? dirent, walkErr := p.Dir.Lookup(ctx, dir, name) if walkErr == nil { return dirent, nil } - // Is it a dynamic element? - nfs := map[string]func() *fs.Inode{ - "net": func() *fs.Inode { - // If we're using rpcinet we will let it manage /proc/net. - if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - return newRPCInetProcNet(ctx, dir.MountSource) - } - return p.newNetDir(ctx, dir.MountSource) - }, - "self": func() *fs.Inode { return p.newSelf(ctx, dir.MountSource) }, - "sys": func() *fs.Inode { return p.newSysDir(ctx, dir.MountSource) }, - "thread-self": func() *fs.Inode { return p.newThreadSelf(ctx, dir.MountSource) }, - } - if nf, ok := nfs[name]; ok { - return fs.NewDirent(nf(), name), nil - } - // Try to lookup a corresponding task. 
tid, err := strconv.ParseUint(name, 10, 64) if err != nil { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 7a1f42119..05d4f3a5b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -30,6 +30,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/state" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" "gvisor.googlesource.com/gvisor/pkg/urpc" ) @@ -356,6 +357,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { if err != nil { return fmt.Errorf("failed to create network: %v", err) } + if eps, ok := networkStack.(*epsocket.Stack); ok { + stack.StackFromEnv = eps.Stack // FIXME + } info, err := o.FilePayload.Files[0].Stat() if err != nil { return err -- cgit v1.2.3 From 833edbd10b49db1f934dcb2495dcb41c1310eea4 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 10 Dec 2018 12:51:08 -0800 Subject: Internal change. PiperOrigin-RevId: 224865061 Change-Id: I6aa31f880931980ad2fc4c4b3cc4c532aacb31f4 --- runsc/test/testutil/testutil.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index b8f981053..a84530287 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -172,7 +172,7 @@ func NewSpecWithArgs(args ...string) *specs.Spec { Source: TmpDir(), }, }, - Hostname: "runsc-test-" + strings.Join(args, "_"), + Hostname: "runsc-test-hostname", } } -- cgit v1.2.3 From d3bc79bc8438206ac6a14fde4eaa288fc07eee82 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Mon, 10 Dec 2018 14:41:40 -0800 Subject: Open source system call tests. PiperOrigin-RevId: 224886231 Change-Id: I0fccb4d994601739d8b16b1d4e6b31f40297fb22 --- kokoro/run_tests.sh | 9 +- runsc/boot/BUILD | 1 + runsc/container/BUILD | 1 + runsc/specutils/BUILD | 1 + runsc/test/testutil/BUILD | 1 + test/syscalls/BUILD | 522 ++++ test/syscalls/README.md | 110 + test/syscalls/build_defs.bzl | 54 + test/syscalls/gtest/BUILD | 12 + test/syscalls/gtest/gtest.go | 93 + test/syscalls/linux/32bit.cc | 226 ++ test/syscalls/linux/BUILD | 2951 ++++++++++++++++++++ test/syscalls/linux/accept_bind.cc | 600 ++++ test/syscalls/linux/accept_bind_stream.cc | 93 + test/syscalls/linux/access.cc | 170 ++ test/syscalls/linux/affinity.cc | 241 ++ test/syscalls/linux/aio.cc | 433 +++ test/syscalls/linux/alarm.cc | 193 ++ test/syscalls/linux/arch_prctl.cc | 48 + test/syscalls/linux/bad.cc | 39 + test/syscalls/linux/base_poll_test.cc | 65 + test/syscalls/linux/base_poll_test.h | 101 + test/syscalls/linux/bind.cc | 146 + test/syscalls/linux/brk.cc | 31 + test/syscalls/linux/chdir.cc | 69 + test/syscalls/linux/chmod.cc | 262 ++ test/syscalls/linux/chown.cc | 200 ++ test/syscalls/linux/chroot.cc | 364 +++ test/syscalls/linux/clock_getres.cc | 37 + test/syscalls/linux/clock_gettime.cc | 156 ++ test/syscalls/linux/clock_nanosleep.cc | 153 + test/syscalls/linux/concurrency.cc | 124 + test/syscalls/linux/creat.cc | 57 + test/syscalls/linux/dev.cc | 149 + test/syscalls/linux/dup.cc | 139 + test/syscalls/linux/epoll.cc | 468 ++++ test/syscalls/linux/eventfd.cc | 189 ++ test/syscalls/linux/exceptions.cc | 146 + test/syscalls/linux/exec.cc | 625 +++++ test/syscalls/linux/exec.h | 34 + test/syscalls/linux/exec_assert_closed_workload.cc | 45 + test/syscalls/linux/exec_basic_workload.cc | 31 + test/syscalls/linux/exec_binary.cc | 1367 +++++++++ 
test/syscalls/linux/exec_proc_exe_workload.cc | 35 + test/syscalls/linux/exec_state_workload.cc | 202 ++ test/syscalls/linux/exit.cc | 77 + test/syscalls/linux/exit_script.sh | 22 + test/syscalls/linux/fadvise64.cc | 72 + test/syscalls/linux/fallocate.cc | 57 + test/syscalls/linux/fault.cc | 71 + test/syscalls/linux/fchdir.cc | 77 + test/syscalls/linux/fcntl.cc | 978 +++++++ test/syscalls/linux/file_base.h | 206 ++ test/syscalls/linux/flock.cc | 588 ++++ test/syscalls/linux/fork.cc | 413 +++ test/syscalls/linux/fpsig_fork.cc | 105 + test/syscalls/linux/fpsig_nested.cc | 134 + test/syscalls/linux/fsync.cc | 55 + test/syscalls/linux/futex.cc | 595 ++++ test/syscalls/linux/getcpu.cc | 40 + test/syscalls/linux/getdents.cc | 485 ++++ test/syscalls/linux/getrandom.cc | 61 + test/syscalls/linux/getrusage.cc | 177 ++ test/syscalls/linux/inotify.cc | 1489 ++++++++++ test/syscalls/linux/ioctl.cc | 375 +++ test/syscalls/linux/ip_socket_test_util.cc | 78 + test/syscalls/linux/ip_socket_test_util.h | 57 + test/syscalls/linux/itimer.cc | 342 +++ test/syscalls/linux/kill.cc | 380 +++ test/syscalls/linux/link.cc | 291 ++ test/syscalls/linux/lseek.cc | 202 ++ test/syscalls/linux/madvise.cc | 142 + test/syscalls/linux/memory_accounting.cc | 99 + test/syscalls/linux/mempolicy.cc | 258 ++ test/syscalls/linux/mincore.cc | 96 + test/syscalls/linux/mkdir.cc | 96 + test/syscalls/linux/mknod.cc | 173 ++ test/syscalls/linux/mmap.cc | 1714 ++++++++++++ test/syscalls/linux/mount.cc | 302 ++ test/syscalls/linux/mremap.cc | 514 ++++ test/syscalls/linux/msync.cc | 145 + test/syscalls/linux/munmap.cc | 53 + test/syscalls/linux/open.cc | 340 +++ test/syscalls/linux/open_create.cc | 130 + test/syscalls/linux/partial_bad_buffer.cc | 305 ++ test/syscalls/linux/pause.cc | 88 + test/syscalls/linux/pipe.cc | 480 ++++ test/syscalls/linux/poll.cc | 279 ++ test/syscalls/linux/ppoll.cc | 155 + test/syscalls/linux/prctl.cc | 171 ++ test/syscalls/linux/prctl_setuid.cc | 262 ++ test/syscalls/linux/pread64.cc | 152 + test/syscalls/linux/preadv.cc | 94 + test/syscalls/linux/preadv2.cc | 217 ++ test/syscalls/linux/priority.cc | 215 ++ test/syscalls/linux/priority_execve.cc | 42 + test/syscalls/linux/proc.cc | 1830 ++++++++++++ test/syscalls/linux/proc_net.cc | 59 + test/syscalls/linux/pselect.cc | 190 ++ test/syscalls/linux/ptrace.cc | 948 +++++++ test/syscalls/linux/pty.cc | 1230 ++++++++ test/syscalls/linux/pwrite64.cc | 79 + test/syscalls/linux/read.cc | 117 + test/syscalls/linux/readv.cc | 293 ++ test/syscalls/linux/readv_common.cc | 180 ++ test/syscalls/linux/readv_common.h | 61 + test/syscalls/linux/readv_socket.cc | 182 ++ test/syscalls/linux/rename.cc | 373 +++ test/syscalls/linux/rlimits.cc | 61 + test/syscalls/linux/rtsignal.cc | 172 ++ test/syscalls/linux/sched.cc | 71 + test/syscalls/linux/sched_yield.cc | 33 + test/syscalls/linux/seccomp.cc | 374 +++ test/syscalls/linux/select.cc | 128 + test/syscalls/linux/semaphore.cc | 438 +++ test/syscalls/linux/sendfile.cc | 409 +++ test/syscalls/linux/sendfile_socket.cc | 156 ++ test/syscalls/linux/shm.cc | 445 +++ test/syscalls/linux/sigaction.cc | 70 + test/syscalls/linux/sigaltstack.cc | 274 ++ test/syscalls/linux/sigaltstack_check.cc | 33 + test/syscalls/linux/sigiret.cc | 137 + test/syscalls/linux/sigprocmask.cc | 272 ++ test/syscalls/linux/sigstop.cc | 150 + test/syscalls/linux/sigtimedwait.cc | 248 ++ test/syscalls/linux/socket_abstract.cc | 43 + test/syscalls/linux/socket_filesystem.cc | 43 + test/syscalls/linux/socket_generic.cc | 403 +++ test/syscalls/linux/socket_generic.h | 
30 + test/syscalls/linux/socket_inet_loopback.cc | 812 ++++++ test/syscalls/linux/socket_ip_tcp_generic.cc | 392 +++ test/syscalls/linux/socket_ip_tcp_generic.h | 29 + .../linux/socket_ip_tcp_generic_loopback.cc | 47 + test/syscalls/linux/socket_ip_tcp_loopback.cc | 43 + .../linux/socket_ip_tcp_loopback_blocking.cc | 44 + .../linux/socket_ip_tcp_loopback_nonblock.cc | 46 + test/syscalls/linux/socket_ip_tcp_udp_generic.cc | 78 + test/syscalls/linux/socket_ip_udp_loopback.cc | 48 + .../linux/socket_ip_udp_loopback_blocking.cc | 40 + .../linux/socket_ip_udp_loopback_nonblock.cc | 42 + test/syscalls/linux/socket_netdevice.cc | 182 ++ test/syscalls/linux/socket_netlink_route.cc | 314 +++ test/syscalls/linux/socket_netlink_util.cc | 100 + test/syscalls/linux/socket_netlink_util.h | 42 + test/syscalls/linux/socket_non_blocking.cc | 63 + test/syscalls/linux/socket_non_blocking.h | 29 + test/syscalls/linux/socket_non_stream.cc | 174 ++ test/syscalls/linux/socket_non_stream.h | 29 + test/syscalls/linux/socket_non_stream_blocking.cc | 51 + test/syscalls/linux/socket_non_stream_blocking.h | 30 + test/syscalls/linux/socket_stream.cc | 99 + test/syscalls/linux/socket_stream.h | 30 + test/syscalls/linux/socket_stream_blocking.cc | 131 + test/syscalls/linux/socket_stream_blocking.h | 30 + test/syscalls/linux/socket_stream_nonblock.cc | 50 + test/syscalls/linux/socket_stream_nonblock.h | 30 + test/syscalls/linux/socket_test_util.cc | 660 +++++ test/syscalls/linux/socket_test_util.h | 449 +++ test/syscalls/linux/socket_unix.cc | 1181 ++++++++ test/syscalls/linux/socket_unix.h | 29 + test/syscalls/linux/socket_unix_abstract.cc | 38 + .../linux/socket_unix_abstract_nonblock.cc | 38 + test/syscalls/linux/socket_unix_dgram.cc | 45 + test/syscalls/linux/socket_unix_dgram.h | 29 + test/syscalls/linux/socket_unix_dgram_local.cc | 59 + .../linux/socket_unix_dgram_non_blocking.cc | 68 + test/syscalls/linux/socket_unix_domain.cc | 38 + test/syscalls/linux/socket_unix_filesystem.cc | 38 + .../linux/socket_unix_filesystem_nonblock.cc | 38 + test/syscalls/linux/socket_unix_non_stream.cc | 229 ++ test/syscalls/linux/socket_unix_non_stream.h | 30 + .../linux/socket_unix_non_stream_blocking_local.cc | 47 + test/syscalls/linux/socket_unix_pair.cc | 38 + test/syscalls/linux/socket_unix_pair_nonblock.cc | 38 + test/syscalls/linux/socket_unix_seqpacket.cc | 49 + test/syscalls/linux/socket_unix_seqpacket.h | 30 + test/syscalls/linux/socket_unix_seqpacket_local.cc | 59 + test/syscalls/linux/socket_unix_stream.cc | 69 + .../linux/socket_unix_stream_blocking_local.cc | 47 + test/syscalls/linux/socket_unix_stream_local.cc | 49 + .../linux/socket_unix_stream_nonblock_local.cc | 49 + .../syscalls/linux/socket_unix_unbound_abstract.cc | 116 + test/syscalls/linux/socket_unix_unbound_dgram.cc | 162 ++ .../linux/socket_unix_unbound_filesystem.cc | 84 + .../linux/socket_unix_unbound_seqpacket.cc | 91 + test/syscalls/linux/socket_unix_unbound_stream.cc | 738 +++++ test/syscalls/linux/stat.cc | 410 +++ test/syscalls/linux/stat_times.cc | 220 ++ test/syscalls/linux/statfs.cc | 81 + test/syscalls/linux/sticky.cc | 116 + test/syscalls/linux/symlink.cc | 288 ++ test/syscalls/linux/sync.cc | 60 + test/syscalls/linux/sync_file_range.cc | 111 + test/syscalls/linux/sysinfo.cc | 86 + test/syscalls/linux/syslog.cc | 51 + test/syscalls/linux/sysret.cc | 113 + test/syscalls/linux/tcp_socket.cc | 759 +++++ test/syscalls/linux/temp_umask.h | 39 + test/syscalls/linux/tgkill.cc | 48 + test/syscalls/linux/time.cc | 103 + test/syscalls/linux/timerfd.cc | 238 
++ test/syscalls/linux/timers.cc | 642 +++++ test/syscalls/linux/tkill.cc | 75 + test/syscalls/linux/truncate.cc | 217 ++ test/syscalls/linux/udp_bind.cc | 316 +++ test/syscalls/linux/udp_socket.cc | 941 +++++++ test/syscalls/linux/uidgid.cc | 277 ++ test/syscalls/linux/uname.cc | 99 + .../syscalls/linux/unix_domain_socket_test_util.cc | 346 +++ test/syscalls/linux/unix_domain_socket_test_util.h | 161 ++ test/syscalls/linux/unlink.cc | 211 ++ test/syscalls/linux/unshare.cc | 50 + test/syscalls/linux/utimes.cc | 330 +++ test/syscalls/linux/vdso.cc | 48 + test/syscalls/linux/vdso_clock_gettime.cc | 104 + test/syscalls/linux/vfork.cc | 193 ++ test/syscalls/linux/vsyscall.cc | 44 + test/syscalls/linux/wait.cc | 748 +++++ test/syscalls/linux/write.cc | 134 + test/syscalls/syscall_test.go | 245 ++ test/syscalls/syscall_test_runner.sh | 25 + test/util/BUILD | 239 ++ test/util/capability_util.cc | 79 + test/util/capability_util.h | 101 + test/util/cleanup.h | 61 + test/util/file_descriptor.h | 134 + test/util/fs_util.cc | 585 ++++ test/util/fs_util.h | 182 ++ test/util/fs_util_test.cc | 100 + test/util/logging.cc | 97 + test/util/logging.h | 73 + test/util/memory_util.h | 124 + test/util/mount_util.h | 48 + test/util/multiprocess_util.cc | 139 + test/util/multiprocess_util.h | 113 + test/util/posix_error.cc | 93 + test/util/posix_error.h | 428 +++ test/util/posix_error_test.cc | 45 + test/util/proc_util.cc | 98 + test/util/proc_util.h | 150 + test/util/save_util.cc | 59 + test/util/save_util.h | 47 + test/util/signal_util.cc | 103 + test/util/signal_util.h | 92 + test/util/temp_path.cc | 157 ++ test/util/temp_path.h | 134 + test/util/test_main.cc | 20 + test/util/test_util.cc | 248 ++ test/util/test_util.h | 794 ++++++ test/util/test_util_test.cc | 250 ++ test/util/thread_util.h | 89 + test/util/timer_util.cc | 27 + test/util/timer_util.h | 74 + 253 files changed, 55023 insertions(+), 3 deletions(-) create mode 100644 test/syscalls/BUILD create mode 100644 test/syscalls/README.md create mode 100644 test/syscalls/build_defs.bzl create mode 100644 test/syscalls/gtest/BUILD create mode 100644 test/syscalls/gtest/gtest.go create mode 100644 test/syscalls/linux/32bit.cc create mode 100644 test/syscalls/linux/BUILD create mode 100644 test/syscalls/linux/accept_bind.cc create mode 100644 test/syscalls/linux/accept_bind_stream.cc create mode 100644 test/syscalls/linux/access.cc create mode 100644 test/syscalls/linux/affinity.cc create mode 100644 test/syscalls/linux/aio.cc create mode 100644 test/syscalls/linux/alarm.cc create mode 100644 test/syscalls/linux/arch_prctl.cc create mode 100644 test/syscalls/linux/bad.cc create mode 100644 test/syscalls/linux/base_poll_test.cc create mode 100644 test/syscalls/linux/base_poll_test.h create mode 100644 test/syscalls/linux/bind.cc create mode 100644 test/syscalls/linux/brk.cc create mode 100644 test/syscalls/linux/chdir.cc create mode 100644 test/syscalls/linux/chmod.cc create mode 100644 test/syscalls/linux/chown.cc create mode 100644 test/syscalls/linux/chroot.cc create mode 100644 test/syscalls/linux/clock_getres.cc create mode 100644 test/syscalls/linux/clock_gettime.cc create mode 100644 test/syscalls/linux/clock_nanosleep.cc create mode 100644 test/syscalls/linux/concurrency.cc create mode 100644 test/syscalls/linux/creat.cc create mode 100644 test/syscalls/linux/dev.cc create mode 100644 test/syscalls/linux/dup.cc create mode 100644 test/syscalls/linux/epoll.cc create mode 100644 test/syscalls/linux/eventfd.cc create mode 100644 
test/syscalls/linux/exceptions.cc create mode 100644 test/syscalls/linux/exec.cc create mode 100644 test/syscalls/linux/exec.h create mode 100644 test/syscalls/linux/exec_assert_closed_workload.cc create mode 100644 test/syscalls/linux/exec_basic_workload.cc create mode 100644 test/syscalls/linux/exec_binary.cc create mode 100644 test/syscalls/linux/exec_proc_exe_workload.cc create mode 100644 test/syscalls/linux/exec_state_workload.cc create mode 100644 test/syscalls/linux/exit.cc create mode 100755 test/syscalls/linux/exit_script.sh create mode 100644 test/syscalls/linux/fadvise64.cc create mode 100644 test/syscalls/linux/fallocate.cc create mode 100644 test/syscalls/linux/fault.cc create mode 100644 test/syscalls/linux/fchdir.cc create mode 100644 test/syscalls/linux/fcntl.cc create mode 100644 test/syscalls/linux/file_base.h create mode 100644 test/syscalls/linux/flock.cc create mode 100644 test/syscalls/linux/fork.cc create mode 100644 test/syscalls/linux/fpsig_fork.cc create mode 100644 test/syscalls/linux/fpsig_nested.cc create mode 100644 test/syscalls/linux/fsync.cc create mode 100644 test/syscalls/linux/futex.cc create mode 100644 test/syscalls/linux/getcpu.cc create mode 100644 test/syscalls/linux/getdents.cc create mode 100644 test/syscalls/linux/getrandom.cc create mode 100644 test/syscalls/linux/getrusage.cc create mode 100644 test/syscalls/linux/inotify.cc create mode 100644 test/syscalls/linux/ioctl.cc create mode 100644 test/syscalls/linux/ip_socket_test_util.cc create mode 100644 test/syscalls/linux/ip_socket_test_util.h create mode 100644 test/syscalls/linux/itimer.cc create mode 100644 test/syscalls/linux/kill.cc create mode 100644 test/syscalls/linux/link.cc create mode 100644 test/syscalls/linux/lseek.cc create mode 100644 test/syscalls/linux/madvise.cc create mode 100644 test/syscalls/linux/memory_accounting.cc create mode 100644 test/syscalls/linux/mempolicy.cc create mode 100644 test/syscalls/linux/mincore.cc create mode 100644 test/syscalls/linux/mkdir.cc create mode 100644 test/syscalls/linux/mknod.cc create mode 100644 test/syscalls/linux/mmap.cc create mode 100644 test/syscalls/linux/mount.cc create mode 100644 test/syscalls/linux/mremap.cc create mode 100644 test/syscalls/linux/msync.cc create mode 100644 test/syscalls/linux/munmap.cc create mode 100644 test/syscalls/linux/open.cc create mode 100644 test/syscalls/linux/open_create.cc create mode 100644 test/syscalls/linux/partial_bad_buffer.cc create mode 100644 test/syscalls/linux/pause.cc create mode 100644 test/syscalls/linux/pipe.cc create mode 100644 test/syscalls/linux/poll.cc create mode 100644 test/syscalls/linux/ppoll.cc create mode 100644 test/syscalls/linux/prctl.cc create mode 100644 test/syscalls/linux/prctl_setuid.cc create mode 100644 test/syscalls/linux/pread64.cc create mode 100644 test/syscalls/linux/preadv.cc create mode 100644 test/syscalls/linux/preadv2.cc create mode 100644 test/syscalls/linux/priority.cc create mode 100644 test/syscalls/linux/priority_execve.cc create mode 100644 test/syscalls/linux/proc.cc create mode 100644 test/syscalls/linux/proc_net.cc create mode 100644 test/syscalls/linux/pselect.cc create mode 100644 test/syscalls/linux/ptrace.cc create mode 100644 test/syscalls/linux/pty.cc create mode 100644 test/syscalls/linux/pwrite64.cc create mode 100644 test/syscalls/linux/read.cc create mode 100644 test/syscalls/linux/readv.cc create mode 100644 test/syscalls/linux/readv_common.cc create mode 100644 test/syscalls/linux/readv_common.h create mode 100644 
test/syscalls/linux/readv_socket.cc create mode 100644 test/syscalls/linux/rename.cc create mode 100644 test/syscalls/linux/rlimits.cc create mode 100644 test/syscalls/linux/rtsignal.cc create mode 100644 test/syscalls/linux/sched.cc create mode 100644 test/syscalls/linux/sched_yield.cc create mode 100644 test/syscalls/linux/seccomp.cc create mode 100644 test/syscalls/linux/select.cc create mode 100644 test/syscalls/linux/semaphore.cc create mode 100644 test/syscalls/linux/sendfile.cc create mode 100644 test/syscalls/linux/sendfile_socket.cc create mode 100644 test/syscalls/linux/shm.cc create mode 100644 test/syscalls/linux/sigaction.cc create mode 100644 test/syscalls/linux/sigaltstack.cc create mode 100644 test/syscalls/linux/sigaltstack_check.cc create mode 100644 test/syscalls/linux/sigiret.cc create mode 100644 test/syscalls/linux/sigprocmask.cc create mode 100644 test/syscalls/linux/sigstop.cc create mode 100644 test/syscalls/linux/sigtimedwait.cc create mode 100644 test/syscalls/linux/socket_abstract.cc create mode 100644 test/syscalls/linux/socket_filesystem.cc create mode 100644 test/syscalls/linux/socket_generic.cc create mode 100644 test/syscalls/linux/socket_generic.h create mode 100644 test/syscalls/linux/socket_inet_loopback.cc create mode 100644 test/syscalls/linux/socket_ip_tcp_generic.cc create mode 100644 test/syscalls/linux/socket_ip_tcp_generic.h create mode 100644 test/syscalls/linux/socket_ip_tcp_generic_loopback.cc create mode 100644 test/syscalls/linux/socket_ip_tcp_loopback.cc create mode 100644 test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc create mode 100644 test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc create mode 100644 test/syscalls/linux/socket_ip_tcp_udp_generic.cc create mode 100644 test/syscalls/linux/socket_ip_udp_loopback.cc create mode 100644 test/syscalls/linux/socket_ip_udp_loopback_blocking.cc create mode 100644 test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc create mode 100644 test/syscalls/linux/socket_netdevice.cc create mode 100644 test/syscalls/linux/socket_netlink_route.cc create mode 100644 test/syscalls/linux/socket_netlink_util.cc create mode 100644 test/syscalls/linux/socket_netlink_util.h create mode 100644 test/syscalls/linux/socket_non_blocking.cc create mode 100644 test/syscalls/linux/socket_non_blocking.h create mode 100644 test/syscalls/linux/socket_non_stream.cc create mode 100644 test/syscalls/linux/socket_non_stream.h create mode 100644 test/syscalls/linux/socket_non_stream_blocking.cc create mode 100644 test/syscalls/linux/socket_non_stream_blocking.h create mode 100644 test/syscalls/linux/socket_stream.cc create mode 100644 test/syscalls/linux/socket_stream.h create mode 100644 test/syscalls/linux/socket_stream_blocking.cc create mode 100644 test/syscalls/linux/socket_stream_blocking.h create mode 100644 test/syscalls/linux/socket_stream_nonblock.cc create mode 100644 test/syscalls/linux/socket_stream_nonblock.h create mode 100644 test/syscalls/linux/socket_test_util.cc create mode 100644 test/syscalls/linux/socket_test_util.h create mode 100644 test/syscalls/linux/socket_unix.cc create mode 100644 test/syscalls/linux/socket_unix.h create mode 100644 test/syscalls/linux/socket_unix_abstract.cc create mode 100644 test/syscalls/linux/socket_unix_abstract_nonblock.cc create mode 100644 test/syscalls/linux/socket_unix_dgram.cc create mode 100644 test/syscalls/linux/socket_unix_dgram.h create mode 100644 test/syscalls/linux/socket_unix_dgram_local.cc create mode 100644 
test/syscalls/linux/socket_unix_dgram_non_blocking.cc create mode 100644 test/syscalls/linux/socket_unix_domain.cc create mode 100644 test/syscalls/linux/socket_unix_filesystem.cc create mode 100644 test/syscalls/linux/socket_unix_filesystem_nonblock.cc create mode 100644 test/syscalls/linux/socket_unix_non_stream.cc create mode 100644 test/syscalls/linux/socket_unix_non_stream.h create mode 100644 test/syscalls/linux/socket_unix_non_stream_blocking_local.cc create mode 100644 test/syscalls/linux/socket_unix_pair.cc create mode 100644 test/syscalls/linux/socket_unix_pair_nonblock.cc create mode 100644 test/syscalls/linux/socket_unix_seqpacket.cc create mode 100644 test/syscalls/linux/socket_unix_seqpacket.h create mode 100644 test/syscalls/linux/socket_unix_seqpacket_local.cc create mode 100644 test/syscalls/linux/socket_unix_stream.cc create mode 100644 test/syscalls/linux/socket_unix_stream_blocking_local.cc create mode 100644 test/syscalls/linux/socket_unix_stream_local.cc create mode 100644 test/syscalls/linux/socket_unix_stream_nonblock_local.cc create mode 100644 test/syscalls/linux/socket_unix_unbound_abstract.cc create mode 100644 test/syscalls/linux/socket_unix_unbound_dgram.cc create mode 100644 test/syscalls/linux/socket_unix_unbound_filesystem.cc create mode 100644 test/syscalls/linux/socket_unix_unbound_seqpacket.cc create mode 100644 test/syscalls/linux/socket_unix_unbound_stream.cc create mode 100644 test/syscalls/linux/stat.cc create mode 100644 test/syscalls/linux/stat_times.cc create mode 100644 test/syscalls/linux/statfs.cc create mode 100644 test/syscalls/linux/sticky.cc create mode 100644 test/syscalls/linux/symlink.cc create mode 100644 test/syscalls/linux/sync.cc create mode 100644 test/syscalls/linux/sync_file_range.cc create mode 100644 test/syscalls/linux/sysinfo.cc create mode 100644 test/syscalls/linux/syslog.cc create mode 100644 test/syscalls/linux/sysret.cc create mode 100644 test/syscalls/linux/tcp_socket.cc create mode 100644 test/syscalls/linux/temp_umask.h create mode 100644 test/syscalls/linux/tgkill.cc create mode 100644 test/syscalls/linux/time.cc create mode 100644 test/syscalls/linux/timerfd.cc create mode 100644 test/syscalls/linux/timers.cc create mode 100644 test/syscalls/linux/tkill.cc create mode 100644 test/syscalls/linux/truncate.cc create mode 100644 test/syscalls/linux/udp_bind.cc create mode 100644 test/syscalls/linux/udp_socket.cc create mode 100644 test/syscalls/linux/uidgid.cc create mode 100644 test/syscalls/linux/uname.cc create mode 100644 test/syscalls/linux/unix_domain_socket_test_util.cc create mode 100644 test/syscalls/linux/unix_domain_socket_test_util.h create mode 100644 test/syscalls/linux/unlink.cc create mode 100644 test/syscalls/linux/unshare.cc create mode 100644 test/syscalls/linux/utimes.cc create mode 100644 test/syscalls/linux/vdso.cc create mode 100644 test/syscalls/linux/vdso_clock_gettime.cc create mode 100644 test/syscalls/linux/vfork.cc create mode 100644 test/syscalls/linux/vsyscall.cc create mode 100644 test/syscalls/linux/wait.cc create mode 100644 test/syscalls/linux/write.cc create mode 100644 test/syscalls/syscall_test.go create mode 100755 test/syscalls/syscall_test_runner.sh create mode 100644 test/util/BUILD create mode 100644 test/util/capability_util.cc create mode 100644 test/util/capability_util.h create mode 100644 test/util/cleanup.h create mode 100644 test/util/file_descriptor.h create mode 100644 test/util/fs_util.cc create mode 100644 test/util/fs_util.h create mode 100644 
test/util/fs_util_test.cc create mode 100644 test/util/logging.cc create mode 100644 test/util/logging.h create mode 100644 test/util/memory_util.h create mode 100644 test/util/mount_util.h create mode 100644 test/util/multiprocess_util.cc create mode 100644 test/util/multiprocess_util.h create mode 100644 test/util/posix_error.cc create mode 100644 test/util/posix_error.h create mode 100644 test/util/posix_error_test.cc create mode 100644 test/util/proc_util.cc create mode 100644 test/util/proc_util.h create mode 100644 test/util/save_util.cc create mode 100644 test/util/save_util.h create mode 100644 test/util/signal_util.cc create mode 100644 test/util/signal_util.h create mode 100644 test/util/temp_path.cc create mode 100644 test/util/temp_path.h create mode 100644 test/util/test_main.cc create mode 100644 test/util/test_util.cc create mode 100644 test/util/test_util.h create mode 100644 test/util/test_util_test.cc create mode 100644 test/util/thread_util.h create mode 100644 test/util/timer_util.cc create mode 100644 test/util/timer_util.h (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 927acb6a1..ea6440140 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -28,8 +28,8 @@ bazel version cd git/repo -# Build everything. -bazel build //... +# Build everything except //test. +bazel build //pkg/... //runsc/... //tools/... # Test use this variable to determine what runtime to use. runtime=runsc_test_$((RANDOM)) @@ -45,7 +45,10 @@ uninstallRuntime() { # We turn off "-e" flag because we must move the log files even if the test # fails. set +e -bazel test --test_output=errors //... + +# Note: We do not run the tests in the //test folder as these would take +# too long. +bazel test --test_output=errors //pkg/... //runsc/... //tools/... exit_code=${?} # This function spawns a subshell to install crictl and containerd. 
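[Editorial illustration, not part of the patch above.] The run_tests.sh change relies on a common shell pattern: errexit is disabled only around the test invocation so that log collection still runs when the tests fail, the exit code is saved, and it is propagated at the very end so CI still reports the failure. A minimal sketch of that pattern is below; the log-copy command and paths are placeholders, not the real script.

```bash
#!/bin/bash
set -e

# Disable errexit only around the command whose failure we need to survive,
# and remember its exit code for later.
set +e
bazel test --test_output=errors //pkg/... //runsc/... //tools/...
exit_code=${?}
set -e

# Cleanup that must run even when the tests fail, e.g. copying logs somewhere
# the CI artifact collector can find them (placeholder path).
cp -r bazel-testlogs /tmp/test-logs || true

# Propagate the original result so the job still reports the test failure.
exit ${exit_code}
```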
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 07afce807..15a7cdae1 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -21,6 +21,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/boot", visibility = [ "//runsc:__subpackages__", + "//test:__subpackages__", ], deps = [ "//pkg/abi", diff --git a/runsc/container/BUILD b/runsc/container/BUILD index f57af582a..28ec81d3f 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -13,6 +13,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/container", visibility = [ "//runsc:__subpackages__", + "//test:__subpackages__", ], deps = [ "//pkg/log", diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index a1e5da3f5..77a10e2b6 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -11,6 +11,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", visibility = [ "//runsc:__subpackages__", + "//test:__subpackages__", ], deps = [ "//pkg/abi/linux", diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 3ed235393..826b7bf0b 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -13,6 +13,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", visibility = [ "//runsc:__subpackages__", + "//test:__subpackages__", ], deps = [ "//runsc/boot", diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD new file mode 100644 index 000000000..318d80393 --- /dev/null +++ b/test/syscalls/BUILD @@ -0,0 +1,522 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_test") + +package(licenses = ["notice"]) # Apache 2.0 + +load("//test/syscalls:build_defs.bzl", "syscall_test") + +syscall_test(test = "//test/syscalls/linux:32bit_test") + +syscall_test(test = "//test/syscalls/linux:accept_bind_stream_test") + +syscall_test( + size = "enormous", + test = "//test/syscalls/linux:accept_bind_test", +) + +syscall_test(test = "//test/syscalls/linux:access_test") + +syscall_test(test = "//test/syscalls/linux:affinity_test") + +syscall_test(test = "//test/syscalls/linux:aio_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:alarm_test", +) + +syscall_test(test = "//test/syscalls/linux:arch_prctl_test") + +syscall_test(test = "//test/syscalls/linux:bad_test") + +syscall_test( + size = "large", + test = "//test/syscalls/linux:bind_test", +) + +syscall_test(test = "//test/syscalls/linux:brk_test") + +syscall_test(test = "//test/syscalls/linux:chdir_test") + +syscall_test(test = "//test/syscalls/linux:chmod_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:chown_test", +) + +syscall_test(test = "//test/syscalls/linux:chroot_test") + +syscall_test(test = "//test/syscalls/linux:clock_getres_test") + +syscall_test(test = "//test/syscalls/linux:clock_gettime_test") + +syscall_test(test = "//test/syscalls/linux:clock_nanosleep_test") + +syscall_test(test = "//test/syscalls/linux:concurrency_test") + +syscall_test(test = "//test/syscalls/linux:creat_test") + +syscall_test(test = "//test/syscalls/linux:dev_test") + +syscall_test(test = "//test/syscalls/linux:dup_test") + +syscall_test(test = "//test/syscalls/linux:epoll_test") + +syscall_test(test = "//test/syscalls/linux:eventfd_test") + +syscall_test(test = "//test/syscalls/linux:exceptions_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:exec_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:exec_binary_test", +) + +syscall_test(test = "//test/syscalls/linux:exit_test") + +syscall_test(test = 
"//test/syscalls/linux:fadvise64_test") + +syscall_test(test = "//test/syscalls/linux:fallocate_test") + +syscall_test(test = "//test/syscalls/linux:fault_test") + +syscall_test(test = "//test/syscalls/linux:fchdir_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:fcntl_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:flock_test", +) + +syscall_test(test = "//test/syscalls/linux:fork_test") + +syscall_test(test = "//test/syscalls/linux:fpsig_fork_test") + +syscall_test(test = "//test/syscalls/linux:fpsig_nested_test") + +syscall_test(test = "//test/syscalls/linux:fsync_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:futex_test", +) + +syscall_test(test = "//test/syscalls/linux:getcpu_host_test") + +syscall_test(test = "//test/syscalls/linux:getcpu_test") + +syscall_test(test = "//test/syscalls/linux:getdents_test") + +syscall_test(test = "//test/syscalls/linux:getrandom_test") + +syscall_test(test = "//test/syscalls/linux:getrusage_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:inotify_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:ioctl_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:itimer_test", +) + +syscall_test(test = "//test/syscalls/linux:kill_test") + +syscall_test(test = "//test/syscalls/linux:link_test") + +syscall_test(test = "//test/syscalls/linux:lseek_test") + +syscall_test(test = "//test/syscalls/linux:madvise_test") + +syscall_test(test = "//test/syscalls/linux:memory_accounting_test") + +syscall_test(test = "//test/syscalls/linux:mempolicy_test") + +syscall_test(test = "//test/syscalls/linux:mincore_test") + +syscall_test(test = "//test/syscalls/linux:mkdir_test") + +syscall_test(test = "//test/syscalls/linux:mknod_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:mmap_test", +) + +syscall_test(test = "//test/syscalls/linux:mount_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:mremap_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:msync_test", +) + +syscall_test(test = "//test/syscalls/linux:munmap_test") + +syscall_test(test = "//test/syscalls/linux:open_create_test") + +syscall_test(test = "//test/syscalls/linux:open_test") + +syscall_test(test = "//test/syscalls/linux:partial_bad_buffer_test") + +syscall_test(test = "//test/syscalls/linux:pause_test") + +syscall_test(test = "//test/syscalls/linux:pipe_test") + +syscall_test(test = "//test/syscalls/linux:poll_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:ppoll_test", +) + +syscall_test(test = "//test/syscalls/linux:prctl_setuid_test") + +syscall_test(test = "//test/syscalls/linux:prctl_test") + +syscall_test(test = "//test/syscalls/linux:pread64_test") + +syscall_test(test = "//test/syscalls/linux:preadv_test") + +syscall_test(test = "//test/syscalls/linux:preadv2_test") + +syscall_test(test = "//test/syscalls/linux:priority_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:proc_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:pselect_test", +) + +syscall_test(test = "//test/syscalls/linux:ptrace_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:pty_test", +) + +syscall_test(test = "//test/syscalls/linux:pwrite64_test") + +syscall_test(test = "//test/syscalls/linux:read_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:readv_socket_test", +) 
+ +syscall_test( + size = "medium", + test = "//test/syscalls/linux:readv_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:rename_test", +) + +syscall_test(test = "//test/syscalls/linux:rlimits_test") + +syscall_test(test = "//test/syscalls/linux:rtsignal_test") + +syscall_test(test = "//test/syscalls/linux:sched_test") + +syscall_test(test = "//test/syscalls/linux:sched_yield_test") + +syscall_test(test = "//test/syscalls/linux:seccomp_test") + +syscall_test(test = "//test/syscalls/linux:select_test") + +syscall_test(test = "//test/syscalls/linux:semaphore_test") + +syscall_test(test = "//test/syscalls/linux:sendfile_socket_test") + +syscall_test(test = "//test/syscalls/linux:sendfile_test") + +syscall_test(test = "//test/syscalls/linux:sigaction_test") + +# TODO: Enable once the test passes in runsc. +# syscall_test(test = "//test/syscalls/linux:sigaltstack_test") + +syscall_test(test = "//test/syscalls/linux:sigiret_test") + +syscall_test(test = "//test/syscalls/linux:sigprocmask_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:sigstop_test", +) + +syscall_test(test = "//test/syscalls/linux:sigtimedwait_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:shm_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_abstract_non_blocking_test", +) + +syscall_test( + size = "enormous", + test = "//test/syscalls/linux:socket_abstract_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_domain_non_blocking_test", +) + +syscall_test( + size = "enormous", + test = "//test/syscalls/linux:socket_domain_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_filesystem_non_blocking_test", +) + +syscall_test( + size = "enormous", + test = "//test/syscalls/linux:socket_filesystem_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_inet_loopback_test", +) + +syscall_test( + size = "large", + test = "//test/syscalls/linux:socket_ip_tcp_generic_loopback_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_ip_tcp_loopback_non_blocking_test", +) + +syscall_test( + size = "large", + test = "//test/syscalls/linux:socket_ip_tcp_loopback_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_ip_tcp_udp_generic_loopback_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_ip_udp_loopback_non_blocking_test", +) + +syscall_test( + size = "large", + test = "//test/syscalls/linux:socket_ip_udp_loopback_test", +) + +syscall_test(test = "//test/syscalls/linux:socket_netdevice_test") + +syscall_test(test = "//test/syscalls/linux:socket_netlink_route_test") + +syscall_test(test = "//test/syscalls/linux:socket_non_stream_blocking_local_test") + +syscall_test(test = "//test/syscalls/linux:socket_non_stream_blocking_udp_test") + +syscall_test( + size = "large", + test = "//test/syscalls/linux:socket_stream_blocking_local_test", +) + +syscall_test( + size = "large", + test = "//test/syscalls/linux:socket_stream_blocking_tcp_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_stream_local_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_stream_nonblock_local_test", +) + +syscall_test( + size = "enormous", + test = "//test/syscalls/linux:socket_unix_abstract_test", +) + +syscall_test( + # NOTE: Large sendmsg may stall a long time. 
+ size = "enormous", + test = "//test/syscalls/linux:socket_unix_dgram_local_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_unix_dgram_non_blocking_test", +) + +syscall_test( + size = "enormous", + test = "//test/syscalls/linux:socket_unix_filesystem_test", +) + +syscall_test( + size = "enormous", + test = "//test/syscalls/linux:socket_unix_pair_test", +) + +syscall_test( + # NOTE: Large sendmsg may stall a long time. + size = "enormous", + test = "//test/syscalls/linux:socket_unix_seqpacket_local_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_unix_stream_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_unix_unbound_abstract_test", +) + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_unix_unbound_dgram_test", +) + +syscall_test(test = "//test/syscalls/linux:socket_unix_unbound_filesystem_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:socket_unix_unbound_seqpacket_test", +) + +syscall_test( + size = "large", + test = "//test/syscalls/linux:socket_unix_unbound_stream_test", +) + +syscall_test(test = "//test/syscalls/linux:statfs_test") + +syscall_test(test = "//test/syscalls/linux:stat_test") + +syscall_test(test = "//test/syscalls/linux:stat_times_test") + +syscall_test(test = "//test/syscalls/linux:sticky_test") + +syscall_test(test = "//test/syscalls/linux:symlink_test") + +syscall_test(test = "//test/syscalls/linux:sync_test") + +syscall_test(test = "//test/syscalls/linux:sync_file_range_test") + +syscall_test(test = "//test/syscalls/linux:sysinfo_test") + +syscall_test(test = "//test/syscalls/linux:syslog_test") + +syscall_test(test = "//test/syscalls/linux:sysret_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:tcp_socket_test", +) + +syscall_test(test = "//test/syscalls/linux:tgkill_test") + +syscall_test(test = "//test/syscalls/linux:timerfd_test") + +syscall_test(test = "//test/syscalls/linux:timers_test") + +syscall_test(test = "//test/syscalls/linux:time_test") + +syscall_test(test = "//test/syscalls/linux:tkill_test") + +syscall_test(test = "//test/syscalls/linux:truncate_test") + +syscall_test(test = "//test/syscalls/linux:udp_bind_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:udp_socket_test", +) + +syscall_test(test = "//test/syscalls/linux:uidgid_test") + +syscall_test(test = "//test/syscalls/linux:uname_test") + +syscall_test(test = "//test/syscalls/linux:unlink_test") + +syscall_test(test = "//test/syscalls/linux:unshare_test") + +syscall_test(test = "//test/syscalls/linux:utimes_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:vdso_clock_gettime_test", +) + +syscall_test(test = "//test/syscalls/linux:vdso_test") + +syscall_test(test = "//test/syscalls/linux:vsyscall_test") + +syscall_test(test = "//test/syscalls/linux:vfork_test") + +syscall_test( + size = "medium", + test = "//test/syscalls/linux:wait_test", +) + +syscall_test(test = "//test/syscalls/linux:write_test") + +go_test( + name = "syscall_test", + srcs = ["syscall_test.go"], + data = [ + "//runsc", + ], + # Running this test by itself does not make sense. It should only be run + # via the syscall_test macro. 
+ tags = [ + "manual", + ], + deps = [ + "//pkg/log", + "//runsc/boot", + "//runsc/container", + "//runsc/specutils", + "//runsc/test/testutil", + "//test/syscalls/gtest", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/test/syscalls/README.md b/test/syscalls/README.md new file mode 100644 index 000000000..69712663c --- /dev/null +++ b/test/syscalls/README.md @@ -0,0 +1,110 @@ +# gVisor system call test suite + +This is a test suite for Linux system calls. It runs under both gVisor and +Linux, and ensures compatibility between the two. + +When adding support for a new syscall (or syscall argument) to gVisor, a +corresponding syscall test should be added. It's usually recommended to write +the test first and make sure that it passes on Linux before making changes to +gVisor. + +This document outlines the general guidelines for tests and specific rules that +must be followed for new tests. + +## Running the tests + +Each test file generates three different test targets that run in different +environments: + +* a `native` target that runs directly on the host machine, +* a `runsc_ptrace` target that runs inside runsc using the ptrace platform, and +* a `runsc_kvm` target that runs inside runsc using the KVM platform. + +For example, the test in `access_test.cc` generates the following targets: + +* `//test/syscalls:access_test_native` +* `//test/syscalls:access_test_runsc_ptrace` +* `//test/syscalls:access_test_runsc_kvm` + +Any of these targets can be run directly via `bazel test`. + +```bash +$ bazel test //test/syscalls:access_test_native +$ bazel test //test/syscalls:access_test_runsc_ptrace +$ bazel test //test/syscalls:access_test_runsc_kvm +``` + +To run all the tests on a particular platform, you can filter by the platform +tag: + +```bash +# Run all tests in native environment: +$ bazel test --test_tag_filter=native //test/syscalls:* + +# Run all tests in runsc with ptrace: +$ bazel test --test_tag_filter=runsc_ptrace //test/syscalls:* + +# Run all tests in runsc with kvm: +$ bazel test --test_tag_filter=runsc_kvm //test/syscalls:* +``` + +You can also run all the tests on every platform. (Warning, this may take a +while to run.) + +```bash +# Run all tests on every platform: +$ bazel test //test/syscalls:* +``` + +## Writing new tests + +Whenever we add support for a new syscall, or add support for a new argument or +option for a syscall, we should always add a new test (perhaps many new tests). + +In general, it is best to write the test first and make sure it passes on Linux +by running the test on the `native` platform on a Linux machine. This ensures +that the gVisor implementation matches actual Linux behavior. Sometimes man +pages contain errors, so always check the actual Linux behavior. + +gVisor uses the [Google Test][googletest] test framework, with a few custom +matchers and guidelines, described below. + +### Syscall matchers + +When testing an individual system call, use the following syscall matchers, +which will match the value returned by the syscall and the errno. + +```cc +SyscallSucceeds() +SyscallSucceedsWithValue(...) +SyscallFails() +SyscallFailsWithErrno(...) +``` + +### Use test utilities (RAII classes) + +The test utilities are written as RAII classes. These utilities should be +preferred over custom test harnesses. + +Local class instances should be preferred, wherever possible, over full test +fixtures.
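+
+For illustration only, here is a minimal sketch of a test that uses these
+matchers without a test fixture. The test names are hypothetical and the sketch
+is not part of the suite; it assumes the matchers come from
+`test/util/test_util.h` and integrate with Google Test's `EXPECT_THAT` and
+`ASSERT_THAT`.
+
+```cc
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "test/util/test_util.h"
+#include "gtest/gtest.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// Plain TESTs with only local state; no test fixture is needed.
+TEST(ExampleTest, OpenMissingFileFails) {
+  // The matcher checks both the -1 return value and the errno.
+  EXPECT_THAT(open("/does/not/exist", O_RDONLY),
+              SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(ExampleTest, WriteReturnsByteCount) {
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  char c = 'x';
+  EXPECT_THAT(write(fds[1], &c, 1), SyscallSucceedsWithValue(1));
+  EXPECT_THAT(close(fds[0]), SyscallSucceeds());
+  EXPECT_THAT(close(fds[1]), SyscallSucceeds());
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace gvisor
+```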
+ +A test utility should be created when there is more than one test that requires +the same functionality; otherwise, the class should be local to the test. + + +## Save/Restore support in tests +gVisor supports save/restore, and our syscall tests are written in a way to +enable saving/restoring at certain points. Hence, there are calls to +`MaybeSave`, and certain tests that should not trigger saves are named with +`NoSave`. + +However, the current open-source test runner does not yet support triggering +save/restore, so these functions and annotations have no effect on the +open-source tests. + +We plan on extending our open-source test runner to trigger save/restore. Until +then, these functions and annotations should be ignored. + + +[googletest]: https://github.com/abseil/googletest diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl new file mode 100644 index 000000000..31b311f63 --- /dev/null +++ b/test/syscalls/build_defs.bzl @@ -0,0 +1,54 @@ +"""Defines a rule for syscall test targets.""" + +# syscall_test is a macro that will create targets to run the given test target +# on the host (native) and runsc. +def syscall_test(test, size = "small"): + _syscall_test(test, size, "native") + _syscall_test(test, size, "kvm") + _syscall_test(test, size, "ptrace") + +def _syscall_test(test, size, platform): + test_name = test.split(":")[1] + + # Prepend "runsc" to non-native platform names. + full_platform = platform if platform == "native" else "runsc_" + platform + + # Add the full_platform in a tag to make it easier to run all the tests on + # a specific platform. + tags = [full_platform] + + # Add tag to prevent the tests from running in a Bazel sandbox. + # TODO: Make the tests run without this tag. + tags.append("no-sandbox") + + # TODO: KVM tests are tagged "manual" until the platform is + # more stable. + if platform == "kvm": + tags += ["manual"] + + sh_test( + srcs = ["syscall_test_runner.sh"], + name = test_name + "_" + full_platform, + data = [ + ":syscall_test", + test, + ], + args = [ + # The first argument is the location of the syscall_test binary. + "$(location :syscall_test)", + # The remaining arguments are passed directly to the syscall_test binary. + "--test-name=" + test_name, + "--platform=" + platform, + "--debug=false", + "--strace=false", + "--parallel=true", + ], + size = size, + tags = tags, + ) + +def sh_test(**kwargs): + """Wraps the standard sh_test.""" + native.sh_test( + **kwargs + ) diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD new file mode 100644 index 000000000..d078fd3d5 --- /dev/null +++ b/test/syscalls/gtest/BUILD @@ -0,0 +1,12 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +package(licenses = ["notice"]) # Apache 2.0 + +go_library( + name = "gtest", + srcs = ["gtest.go"], + importpath = "gvisor.googlesource.com/gvisor/test/syscalls/gtest", + visibility = [ + "//test:__subpackages__", + ], +) diff --git a/test/syscalls/gtest/gtest.go b/test/syscalls/gtest/gtest.go new file mode 100644 index 000000000..dfe5037cd --- /dev/null +++ b/test/syscalls/gtest/gtest.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gtest contains helpers for running google-test tests from Go. +package gtest + +import ( + "fmt" + "os/exec" + "strings" +) + +var ( + // ListTestFlag is the flag that will list tests in gtest binaries. + ListTestFlag = "--gtest_list_tests" + + // FilterTestFlag is the flag that will filter tests in gtest binaries. + FilterTestFlag = "--gtest_filter" +) + +// TestCase is a single gtest test case. +type TestCase struct { + // Suite is the suite for this test. + Suite string + + // Name is the name of this individual test. + Name string +} + +// FullName returns the name of the test including the suite. It is suitable to +// pass to "-gtest_filter". +func (tc TestCase) FullName() string { + return fmt.Sprintf("%s.%s", tc.Suite, tc.Name) +} + +// ParseTestCases calls a gtest test binary to list its test and returns a +// slice with the name and suite of each test. +func ParseTestCases(testBin string, extraArgs ...string) ([]TestCase, error) { + args := append([]string{ListTestFlag}, extraArgs...) + cmd := exec.Command(testBin, args...) + out, err := cmd.Output() + if err != nil { + exitErr, ok := err.(*exec.ExitError) + if !ok { + return nil, fmt.Errorf("could not enumerate gtest tests: %v", err) + } + return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr) + } + + var t []TestCase + var suite string + for _, line := range strings.Split(string(out), "\n") { + // Strip comments. + line = strings.Split(line, "#")[0] + + // New suite? + if !strings.HasPrefix(line, " ") { + suite = strings.TrimSuffix(strings.TrimSpace(line), ".") + continue + } + + // Individual test. + name := strings.TrimSpace(line) + + // Do we have a suite yet? + if suite == "" { + return nil, fmt.Errorf("test without a suite: %v", name) + } + + // Add this individual test. + t = append(t, TestCase{ + Suite: suite, + Name: name, + }) + + } + + if len(t) == 0 { + return nil, fmt.Errorf("no tests parsed from %v", testBin) + } + return t, nil +} diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc new file mode 100644 index 000000000..b8d5f0355 --- /dev/null +++ b/test/syscalls/linux/32bit.cc @@ -0,0 +1,226 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "test/util/memory_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "gtest/gtest.h" + +#ifndef __x86_64__ +#error "This test is x86-64 specific." 
+#endif + +namespace gvisor { +namespace testing { + +namespace { + +constexpr char kInt3 = '\xcc'; + +constexpr char kInt80[2] = {'\xcd', '\x80'}; +constexpr char kSyscall[2] = {'\x0f', '\x05'}; +constexpr char kSysenter[2] = {'\x0f', '\x34'}; + +void ExitGroup32(const char instruction[2], int code) { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0)); + + // Fill with INT 3 in case we execute too far. + memset(m.ptr(), kInt3, m.len()); + + memcpy(m.ptr(), instruction, 2); + + // We're playing *extremely* fast-and-loose with the various syscall ABIs + // here, which we can more-or-less get away with since exit_group doesn't + // return. + // + // SYSENTER expects the user stack in (%ebp) and arg6 in 0(%ebp). The kernel + // will unconditionally dereference %ebp for arg6, so we must pass a valid + // address or it will return EFAULT. + // + // SYSENTER also unconditionally returns to thread_info->sysenter_return which + // is ostensibly a stub in the 32-bit VDSO. But a 64-bit binary doesn't have + // the 32-bit VDSO mapped, so sysenter_return will simply be the value + // inherited from the most recent 32-bit ancestor, or NULL if there is none. + // As a result, return would not return from SYSENTER. + asm volatile( + "movl $252, %%eax\n" // exit_group + "movl %[code], %%ebx\n" // code + "movl %%edx, %%ebp\n" // SYSENTER: user stack (use IP as a valid addr) + "leaq -20(%%rsp), %%rsp\n" + "movl $0x2b, 16(%%rsp)\n" // SS = CPL3 data segment + "movl $0,12(%%rsp)\n" // ESP = nullptr (unused) + "movl $0, 8(%%rsp)\n" // EFLAGS + "movl $0x23, 4(%%rsp)\n" // CS = CPL3 32-bit code segment + "movl %%edx, 0(%%rsp)\n" // EIP + "iretl\n" + "int $3\n" + : + : [code] "m"(code), [ip] "d"(m.ptr()) + : "rax", "rbx", "rsp"); +} + +constexpr int kExitCode = 42; + +TEST(Syscall32Bit, Int80) { + switch (GvisorPlatform()) { + case Platform::kKVM: + // TODO: 32-bit segments are broken (but not explictly + // disabled). + return; + case Platform::kPtrace: + // TODO: The ptrace platform does not have a consistent story + // here. + return; + case Platform::kNative: + break; + } + + // Upstream Linux. 32-bit syscalls allowed. + EXPECT_EXIT(ExitGroup32(kInt80, kExitCode), ::testing::ExitedWithCode(42), + ""); +} + +TEST(Syscall32Bit, Sysenter) { + switch (GvisorPlatform()) { + case Platform::kKVM: + // TODO: See above. + return; + case Platform::kPtrace: + // TODO: See above. + return; + case Platform::kNative: + break; + } + + if (GetCPUVendor() == CPUVendor::kAMD) { + // SYSENTER is an illegal instruction in compatibility mode on AMD. + EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode), + ::testing::KilledBySignal(SIGILL), ""); + return; + } + + // Upstream Linux on !AMD, 32-bit syscalls allowed. + EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode), ::testing::ExitedWithCode(42), + ""); +} + +TEST(Syscall32Bit, Syscall) { + switch (GvisorPlatform()) { + case Platform::kKVM: + // TODO: See above. + return; + case Platform::kPtrace: + // TODO: See above. + return; + case Platform::kNative: + break; + } + + if (GetCPUVendor() == CPUVendor::kIntel) { + // SYSCALL is an illegal instruction in compatibility mode on Intel. + EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode), + ::testing::KilledBySignal(SIGILL), ""); + return; + } + + // Upstream Linux on !Intel, 32-bit syscalls allowed. + EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode), ::testing::ExitedWithCode(42), + ""); +} + +// Far call code called below. 
+// +// Input stack layout: +// +// %esp+12 lcall segment +// %esp+8 lcall address offset +// %esp+0 return address +// +// The lcall will enter compatibility mode and jump to the call address (the +// address of the lret). The lret will return to 64-bit mode at the retq, which +// will return to the external caller of this function. +// +// Since this enters compatibility mode, it must be mapped in a 32-bit region of +// address space and have a 32-bit stack pointer. +constexpr char kFarCall[] = { + '\x67', '\xff', '\x5c', '\x24', '\x08', // lcall *8(%esp) + '\xc3', // retq + '\xcb', // lret +}; + +void FarCall32() { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0)); + + // Fill with INT 3 in case we execute too far. + memset(m.ptr(), kInt3, m.len()); + + // 32-bit code. + memcpy(m.ptr(), kFarCall, sizeof(kFarCall)); + + // Use the end of the code page as its stack. + uintptr_t stack = m.endaddr(); + + uintptr_t lcall = m.addr(); + uintptr_t lret = m.addr() + sizeof(kFarCall) - 1; + + // N.B. We must save and restore RSP manually. GCC can do so automatically + // with an "rsp" clobber, but clang cannot. + asm volatile( + // Place the address of lret (%edx) and the 32-bit code segment (0x23) on + // the 32-bit stack for lcall. + "subl $0x8, %%ecx\n" + "movl $0x23, 4(%%ecx)\n" + "movl %%edx, 0(%%ecx)\n" + + // Save the current stack and switch to 32-bit stack. + "pushq %%rbp\n" + "movq %%rsp, %%rbp\n" + "movq %%rcx, %%rsp\n" + + // Run the lcall code. + "callq *%%rbx\n" + + // Restore the old stack. + "leaveq\n" + : "+c"(stack) + : "b"(lcall), "d"(lret)); +} + +TEST(Call32Bit, Disallowed) { + switch (GvisorPlatform()) { + case Platform::kKVM: + // TODO: See above. + return; + case Platform::kPtrace: + // The ptrace platform cannot prevent switching to compatibility mode. + ABSL_FALLTHROUGH_INTENDED; + case Platform::kNative: + break; + } + + // Shouldn't crash. 
+ FarCall32(); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD new file mode 100644 index 000000000..1c48a2a43 --- /dev/null +++ b/test/syscalls/linux/BUILD @@ -0,0 +1,2951 @@ +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], # Apache 2.0 +) + +cc_binary( + name = "sigaltstack_check", + testonly = 1, + srcs = ["sigaltstack_check.cc"], + deps = ["//test/util:logging"], +) + +cc_binary( + name = "exec_assert_closed_workload", + testonly = 1, + srcs = ["exec_assert_closed_workload.cc"], + deps = [ + "@com_google_absl//absl/strings", + ], +) + +cc_binary( + name = "exec_basic_workload", + testonly = 1, + srcs = [ + "exec.h", + "exec_basic_workload.cc", + ], +) + +cc_binary( + name = "exec_proc_exe_workload", + testonly = 1, + srcs = ["exec_proc_exe_workload.cc"], + deps = [ + "//test/util:fs_util", + "//test/util:posix_error", + ], +) + +cc_binary( + name = "exec_state_workload", + testonly = 1, + srcs = ["exec_state_workload.cc"], +) + +sh_binary( + name = "exit_script", + testonly = 1, + srcs = [ + "exit_script.sh", + ], +) + +cc_binary( + name = "priority_execve", + testonly = 1, + srcs = [ + "priority_execve.cc", + ], +) + +cc_library( + name = "base_poll_test", + testonly = 1, + srcs = ["base_poll_test.cc"], + hdrs = ["base_poll_test.h"], + deps = [ + "//test/util:logging", + "//test/util:signal_util", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "file_base", + testonly = 1, + hdrs = ["file_base.h"], + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "socket_netlink_util", + testonly = 1, + srcs = ["socket_netlink_util.cc"], + hdrs = ["socket_netlink_util.h"], + deps = [ + ":socket_test_util", + "//test/util:file_descriptor", + "//test/util:posix_error", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "socket_test_util", + testonly = 1, + srcs = ["socket_test_util.cc"], + hdrs = ["socket_test_util.h"], + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "temp_umask", + hdrs = ["temp_umask.h"], +) + +cc_library( + name = "unix_domain_socket_test_util", + testonly = 1, + srcs = ["unix_domain_socket_test_util.cc"], + hdrs = ["unix_domain_socket_test_util.h"], + deps = [ + ":socket_test_util", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "ip_socket_test_util", + testonly = 1, + srcs = ["ip_socket_test_util.cc"], + hdrs = ["ip_socket_test_util.h"], + deps = [ + ":socket_test_util", + ], +) + +cc_binary( + name = "clock_nanosleep_test", + testonly = 1, + srcs = ["clock_nanosleep.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:posix_error", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + 
"//test/util:timer_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "32bit_test", + testonly = 1, + srcs = ["32bit.cc"], + linkstatic = 1, + deps = [ + "//test/util:memory_util", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "accept_bind_test", + testonly = 1, + srcs = ["accept_bind.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "accept_bind_stream_test", + testonly = 1, + srcs = ["accept_bind_stream.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "access_test", + testonly = 1, + srcs = ["access.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "affinity_test", + testonly = 1, + srcs = ["affinity.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "aio_test", + testonly = 1, + srcs = [ + "aio.cc", + "file_base.h", + ], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "alarm_test", + testonly = 1, + srcs = ["alarm.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:logging", + "//test/util:signal_util", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "bad_test", + testonly = 1, + srcs = ["bad.cc"], + linkstatic = 1, + visibility = [ + "//:sandbox", + ], + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "bind_test", + testonly = 1, + srcs = ["bind.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "brk_test", + testonly = 1, + srcs = ["brk.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "chdir_test", + testonly = 1, + srcs = ["chdir.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "chmod_test", + testonly = 1, + srcs = ["chmod.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + 
"@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "chown_test", + testonly = 1, + srcs = ["chown.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/synchronization", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sticky_test", + testonly = 1, + srcs = ["sticky.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "chroot_test", + testonly = 1, + srcs = ["chroot.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:mount_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "clock_getres_test", + testonly = 1, + srcs = ["clock_getres.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "clock_gettime_test", + testonly = 1, + srcs = ["clock_gettime.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "concurrency_test", + testonly = 1, + srcs = ["concurrency.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "creat_test", + testonly = 1, + srcs = ["creat.cc"], + linkstatic = 1, + deps = [ + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "dev_test", + testonly = 1, + srcs = ["dev.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "dup_test", + testonly = 1, + srcs = ["dup.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "epoll_test", + testonly = 1, + srcs = ["epoll.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "eventfd_test", + testonly = 1, + srcs = ["eventfd.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "exceptions_test", + testonly = 1, + srcs = ["exceptions.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + 
"@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "getcpu_test", + testonly = 1, + srcs = ["getcpu.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "getcpu_host_test", + testonly = 1, + srcs = ["getcpu.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "getrusage_test", + testonly = 1, + srcs = ["getrusage.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:memory_util", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "exec_binary_test", + testonly = 1, + srcs = ["exec_binary.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:proc_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "exec_test", + testonly = 1, + srcs = [ + "exec.cc", + "exec.h", + ], + data = [ + ":exec_assert_closed_workload", + ":exec_basic_workload", + ":exec_proc_exe_workload", + ":exec_state_workload", + ":exit_script", + ":priority_execve", + ], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "exit_test", + testonly = 1, + srcs = ["exit.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fallocate_test", + testonly = 1, + srcs = ["fallocate.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fault_test", + testonly = 1, + srcs = ["fault.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fchdir_test", + testonly = 1, + srcs = ["fchdir.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fcntl_test", + testonly = 1, + srcs = ["fcntl.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:cleanup", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_util", + "//test/util:timer_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "flock_test", + testonly = 1, + srcs = [ + "file_base.h", + "flock.cc", + ], + linkstatic = 1, + deps = [ + 
"//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fork_test", + testonly = 1, + srcs = ["fork.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fpsig_fork_test", + testonly = 1, + srcs = ["fpsig_fork.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fpsig_nested_test", + testonly = 1, + srcs = ["fpsig_nested.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sync_file_range_test", + testonly = 1, + srcs = ["sync_file_range.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "fsync_test", + testonly = 1, + srcs = ["fsync.cc"], + linkstatic = 1, + deps = [ + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "futex_test", + testonly = 1, + srcs = ["futex.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:memory_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "getdents_test", + testonly = 1, + srcs = ["getdents.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "getrandom_test", + testonly = 1, + srcs = ["getrandom.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "inotify_test", + testonly = 1, + srcs = ["inotify.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_binary( + name = "ioctl_test", + testonly = 1, + srcs = ["ioctl.cc"], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:file_descriptor", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "itimer_test", + testonly = 1, + srcs = ["itimer.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:logging", + "//test/util:multiprocess_util", 
+ "//test/util:posix_error", + "//test/util:signal_util", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "kill_test", + testonly = 1, + srcs = ["kill.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:logging", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "link_test", + testonly = 1, + srcs = ["link.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "lseek_test", + testonly = 1, + srcs = ["lseek.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "madvise_test", + testonly = 1, + srcs = ["madvise.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:logging", + "//test/util:memory_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "mempolicy_test", + testonly = 1, + srcs = ["mempolicy.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/memory", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "mincore_test", + testonly = 1, + srcs = ["mincore.cc"], + linkstatic = 1, + deps = [ + "//test/util:memory_util", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "mkdir_test", + testonly = 1, + srcs = ["mkdir.cc"], + linkstatic = 1, + deps = [ + ":temp_umask", + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "mknod_test", + testonly = 1, + srcs = ["mknod.cc"], + linkstatic = 1, + deps = [ + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "mmap_test", + testonly = 1, + srcs = ["mmap.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "mount_test", + testonly = 1, + srcs = ["mount.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:mount_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", 
+ "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "mremap_test", + testonly = 1, + srcs = ["mremap.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:logging", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "msync_test", + testonly = 1, + srcs = ["msync.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:memory_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "munmap_test", + testonly = 1, + srcs = ["munmap.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "open_test", + testonly = 1, + srcs = [ + "file_base.h", + "open.cc", + ], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "open_create_test", + testonly = 1, + srcs = ["open_create.cc"], + linkstatic = 1, + deps = [ + ":temp_umask", + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "pty_test", + testonly = 1, + srcs = ["pty.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "partial_bad_buffer_test", + testonly = 1, + srcs = ["partial_bad_buffer.cc"], + linkstatic = 1, + deps = [ + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "pause_test", + testonly = 1, + srcs = ["pause.cc"], + linkstatic = 1, + deps = [ + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "pipe_test", + testonly = 1, + srcs = ["pipe.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "poll_test", + testonly = 1, + srcs = ["poll.cc"], + linkstatic = 1, + deps = [ + ":base_poll_test", + "//test/util:file_descriptor", + "//test/util:logging", + "//test/util:test_main", + "//test/util:test_util", + 
"//test/util:thread_util", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "ppoll_test", + testonly = 1, + srcs = ["ppoll.cc"], + linkstatic = 1, + deps = [ + ":base_poll_test", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "arch_prctl_test", + testonly = 1, + srcs = ["arch_prctl.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "prctl_test", + testonly = 1, + srcs = ["prctl.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "prctl_setuid_test", + testonly = 1, + srcs = ["prctl_setuid.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:logging", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "pread64_test", + testonly = 1, + srcs = ["pread64.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "preadv_test", + testonly = 1, + srcs = ["preadv.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:logging", + "//test/util:memory_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "preadv2_test", + testonly = 1, + srcs = [ + "preadv2.cc", + "readv_common.cc", + "readv_common.h", + ], + linkstatic = 1, + deps = [ + ":file_base", + "//test/util:file_descriptor", + "//test/util:memory_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "priority_test", + testonly = 1, + srcs = ["priority.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "proc_test", + testonly = 1, + srcs = ["proc.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:memory_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "proc_net_test", + testonly = 1, + srcs = ["proc_net.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "pselect_test", + testonly = 1, + srcs = ["pselect.cc"], + 
linkstatic = 1, + deps = [ + ":base_poll_test", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "ptrace_test", + testonly = 1, + srcs = ["ptrace.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:multiprocess_util", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "pwrite64_test", + testonly = 1, + srcs = ["pwrite64.cc"], + linkstatic = 1, + deps = [ + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "read_test", + testonly = 1, + srcs = ["read.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "readv_test", + testonly = 1, + srcs = [ + "file_base.h", + "readv.cc", + "readv_common.cc", + "readv_common.h", + ], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:timer_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "readv_socket_test", + testonly = 1, + srcs = [ + "file_base.h", + "readv_common.cc", + "readv_common.h", + "readv_socket.cc", + ], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "rename_test", + testonly = 1, + srcs = ["rename.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "rlimits_test", + testonly = 1, + srcs = ["rlimits.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "rtsignal_test", + testonly = 1, + srcs = ["rtsignal.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:logging", + "//test/util:posix_error", + "//test/util:signal_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sched_test", + testonly = 1, + srcs = ["sched.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sched_yield_test", + testonly = 1, + srcs = ["sched_yield.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "seccomp_test", + testonly = 1, + srcs = ["seccomp.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:proc_util", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/base:core_headers", + 
"@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "select_test", + testonly = 1, + srcs = ["select.cc"], + linkstatic = 1, + deps = [ + ":base_poll_test", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sendfile_test", + testonly = 1, + srcs = ["sendfile.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sendfile_socket_test", + testonly = 1, + srcs = ["sendfile_socket.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sigaction_test", + testonly = 1, + srcs = ["sigaction.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sigaltstack_test", + testonly = 1, + srcs = ["sigaltstack.cc"], + data = [ + ":sigaltstack_check", + ], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:fs_util", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sigiret_test", + testonly = 1, + srcs = ["sigiret.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:signal_util", + "//test/util:test_util", + "//test/util:timer_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sigprocmask_test", + testonly = 1, + srcs = ["sigprocmask.cc"], + linkstatic = 1, + deps = [ + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sigstop_test", + testonly = 1, + srcs = ["sigstop.cc"], + linkstatic = 1, + deps = [ + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sigtimedwait_test", + testonly = 1, + srcs = ["sigtimedwait.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:signal_util", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "socket_generic_test_cases", + testonly = 1, + srcs = [ + "socket_generic.cc", + ], + hdrs = [ + "socket_generic.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_unix_dgram_test_cases", + testonly = 1, + srcs = ["socket_unix_dgram.cc"], + hdrs = ["socket_unix_dgram.h"], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_unix_seqpacket_test_cases", + testonly = 1, + srcs = ["socket_unix_seqpacket.cc"], + hdrs = ["socket_unix_seqpacket.h"], + 
deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_ip_tcp_generic_test_cases", + testonly = 1, + srcs = [ + "socket_ip_tcp_generic.cc", + ], + hdrs = [ + "socket_ip_tcp_generic.h", + ], + deps = [ + ":socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_non_blocking_test_cases", + testonly = 1, + srcs = [ + "socket_non_blocking.cc", + ], + hdrs = [ + "socket_non_blocking.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_unix_non_stream_test_cases", + testonly = 1, + srcs = [ + "socket_unix_non_stream.cc", + ], + hdrs = [ + "socket_unix_non_stream.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:memory_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_non_stream_test_cases", + testonly = 1, + srcs = [ + "socket_non_stream.cc", + ], + hdrs = [ + "socket_non_stream.h", + ], + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_binary( + name = "socket_abstract_test", + testonly = 1, + srcs = [ + "socket_abstract.cc", + ], + linkstatic = 1, + deps = [ + ":socket_generic_test_cases", + ":socket_test_util", + ":socket_unix_test_cases", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_abstract_non_blocking_test", + testonly = 1, + srcs = [ + "socket_unix_abstract_nonblock.cc", + ], + linkstatic = 1, + deps = [ + ":socket_non_blocking_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_unix_dgram_local_test", + testonly = 1, + srcs = ["socket_unix_dgram_local.cc"], + linkstatic = 1, + deps = [ + ":socket_non_stream_test_cases", + ":socket_test_util", + ":socket_unix_dgram_test_cases", + ":socket_unix_non_stream_test_cases", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_unix_dgram_non_blocking_test", + testonly = 1, + srcs = ["socket_unix_dgram_non_blocking.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_unix_seqpacket_local_test", + testonly = 1, + srcs = [ + "socket_unix_seqpacket_local.cc", + ], + linkstatic = 1, + deps = [ + ":socket_non_stream_test_cases", + ":socket_test_util", + ":socket_unix_non_stream_test_cases", + ":socket_unix_seqpacket_test_cases", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_unix_stream_test", + testonly = 1, + srcs = ["socket_unix_stream.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_ip_tcp_generic_loopback_test", + testonly = 1, + srcs = [ + 
"socket_ip_tcp_generic_loopback.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_ip_tcp_generic_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_ip_tcp_udp_generic_loopback_test", + testonly = 1, + srcs = [ + "socket_ip_tcp_udp_generic.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_ip_tcp_loopback_test", + testonly = 1, + srcs = [ + "socket_ip_tcp_loopback.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_generic_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_ip_tcp_loopback_non_blocking_test", + testonly = 1, + srcs = [ + "socket_ip_tcp_loopback_nonblock.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_non_blocking_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_ip_udp_loopback_test", + testonly = 1, + srcs = [ + "socket_ip_udp_loopback.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_generic_test_cases", + ":socket_non_stream_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_ip_udp_loopback_non_blocking_test", + testonly = 1, + srcs = [ + "socket_ip_udp_loopback_nonblock.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_non_blocking_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_domain_test", + testonly = 1, + srcs = [ + "socket_unix_domain.cc", + ], + linkstatic = 1, + deps = [ + ":socket_generic_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_domain_non_blocking_test", + testonly = 1, + srcs = [ + "socket_unix_pair_nonblock.cc", + ], + linkstatic = 1, + deps = [ + ":socket_non_blocking_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_filesystem_test", + testonly = 1, + srcs = [ + "socket_filesystem.cc", + ], + linkstatic = 1, + deps = [ + ":socket_generic_test_cases", + ":socket_test_util", + ":socket_unix_test_cases", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_filesystem_non_blocking_test", + testonly = 1, + srcs = [ + "socket_unix_filesystem_nonblock.cc", + ], + linkstatic = 1, + deps = [ + ":socket_non_blocking_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_inet_loopback_test", + testonly = 1, + srcs = ["socket_inet_loopback.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_netlink_route_test", + testonly = 1, + srcs = ["socket_netlink_route.cc"], + linkstatic = 1, + deps = [ + ":socket_netlink_util", + ":socket_test_util", + 
"//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +# These socket tests are in a library because the test cases are shared +# across several test build targets. +cc_library( + name = "socket_stream_test_cases", + testonly = 1, + srcs = [ + "socket_stream.cc", + ], + hdrs = [ + "socket_stream.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_unix_test_cases", + testonly = 1, + srcs = [ + "socket_unix.cc", + ], + hdrs = [ + "socket_unix.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_stream_blocking_test_cases", + testonly = 1, + srcs = [ + "socket_stream_blocking.cc", + ], + hdrs = [ + "socket_stream_blocking.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_stream_nonblocking_test_cases", + testonly = 1, + srcs = [ + "socket_stream_nonblock.cc", + ], + hdrs = [ + "socket_stream_nonblock.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_binary( + name = "socket_stream_local_test", + testonly = 1, + srcs = [ + "socket_unix_stream_local.cc", + ], + linkstatic = 1, + deps = [ + ":socket_stream_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_stream_blocking_local_test", + testonly = 1, + srcs = [ + "socket_unix_stream_blocking_local.cc", + ], + linkstatic = 1, + deps = [ + ":socket_stream_blocking_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_stream_blocking_tcp_test", + testonly = 1, + srcs = [ + "socket_ip_tcp_loopback_blocking.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_stream_blocking_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_stream_nonblock_local_test", + testonly = 1, + srcs = [ + "socket_unix_stream_nonblock_local.cc", + ], + linkstatic = 1, + deps = [ + ":socket_stream_nonblocking_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_unix_abstract_test", + testonly = 1, + srcs = [ + "socket_unix_abstract.cc", + ], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":socket_unix_test_cases", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_unix_unbound_dgram_test", + testonly = 1, + srcs = ["socket_unix_unbound_dgram.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) 
+ +cc_binary( + name = "socket_unix_unbound_abstract_test", + testonly = 1, + srcs = ["socket_unix_unbound_abstract.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_unix_unbound_filesystem_test", + testonly = 1, + srcs = ["socket_unix_unbound_filesystem.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_unix_filesystem_test", + testonly = 1, + srcs = [ + "socket_unix_filesystem.cc", + ], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":socket_unix_test_cases", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "socket_non_stream_blocking_test_cases", + testonly = 1, + srcs = [ + "socket_non_stream_blocking.cc", + ], + hdrs = [ + "socket_non_stream_blocking.h", + ], + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_util", + "//test/util:thread_util", + "//test/util:timer_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_binary( + name = "socket_non_stream_blocking_local_test", + testonly = 1, + srcs = [ + "socket_unix_non_stream_blocking_local.cc", + ], + linkstatic = 1, + deps = [ + ":socket_non_stream_blocking_test_cases", + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_non_stream_blocking_udp_test", + testonly = 1, + srcs = [ + "socket_ip_udp_loopback_blocking.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_non_stream_blocking_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_unix_pair_test", + testonly = 1, + srcs = [ + "socket_unix_pair.cc", + ], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":socket_unix_test_cases", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_unix_unbound_seqpacket_test", + testonly = 1, + srcs = ["socket_unix_unbound_seqpacket.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_unix_unbound_stream_test", + testonly = 1, + srcs = ["socket_unix_unbound_stream.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "socket_netdevice_test", + testonly = 1, + srcs = ["socket_netdevice.cc"], + linkstatic = 1, + deps = [ + ":socket_netlink_util", + ":socket_test_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/base:endian", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "stat_test", + testonly = 1, + srcs = [ + "file_base.h", + "stat.cc", + ], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + 
"//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "stat_times_test", + testonly = 1, + srcs = ["stat_times.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "statfs_test", + testonly = 1, + srcs = [ + "file_base.h", + "statfs.cc", + ], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "symlink_test", + testonly = 1, + srcs = ["symlink.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sync_test", + testonly = 1, + srcs = ["sync.cc"], + linkstatic = 1, + deps = [ + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sysinfo_test", + testonly = 1, + srcs = ["sysinfo.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "syslog_test", + testonly = 1, + srcs = ["syslog.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "sysret_test", + testonly = 1, + srcs = ["sysret.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "tcp_socket_test", + testonly = 1, + srcs = ["tcp_socket.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "tgkill_test", + testonly = 1, + srcs = ["tgkill.cc"], + linkstatic = 1, + deps = [ + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "time_test", + testonly = 1, + srcs = ["time.cc"], + linkstatic = 1, + deps = [ + "//test/util:proc_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "timerfd_test", + testonly = 1, + srcs = ["timerfd.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + ], +) + +cc_binary( + name = "timers_test", + testonly = 1, + srcs = ["timers.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:logging", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:signal_util", + "//test/util:test_util", + 
"//test/util:thread_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "tkill_test", + testonly = 1, + srcs = ["tkill.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "truncate_test", + testonly = 1, + srcs = ["truncate.cc"], + linkstatic = 1, + deps = [ + ":file_base", + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "udp_socket_test", + testonly = 1, + srcs = ["udp_socket.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "udp_bind_test", + testonly = 1, + srcs = ["udp_bind.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "uidgid_test", + testonly = 1, + srcs = ["uidgid.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "uname_test", + testonly = 1, + srcs = ["uname.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "unlink_test", + testonly = 1, + srcs = ["unlink.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "unshare_test", + testonly = 1, + srcs = ["unshare.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/synchronization", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "utimes_test", + testonly = 1, + srcs = ["utimes.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + ], +) + +cc_binary( + name = "vdso_test", + testonly = 1, + srcs = ["vdso.cc"], + linkstatic = 1, + deps = [ + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:proc_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "vfork_test", + testonly = 1, + srcs = ["vfork.cc"], + linkstatic = 1, + deps = [ + "//test/util:logging", + "//test/util:multiprocess_util", + "//test/util:test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "wait_test", + testonly = 1, + srcs = ["wait.cc"], + linkstatic = 1, + 
deps = [ + "//test/util:cleanup", + "//test/util:logging", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:signal_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "write_test", + testonly = 1, + srcs = ["write.cc"], + linkstatic = 1, + deps = [ + "//test/util:cleanup", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "memory_accounting_test", + testonly = 1, + srcs = ["memory_accounting.cc"], + linkstatic = 1, + deps = [ + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "semaphore_test", + testonly = 1, + srcs = ["semaphore.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "shm_test", + testonly = 1, + srcs = ["shm.cc"], + linkstatic = 1, + deps = [ + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", + ], +) + +cc_binary( + name = "fadvise64_test", + testonly = 1, + srcs = ["fadvise64.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "vdso_clock_gettime_test", + testonly = 1, + srcs = ["vdso_clock_gettime.cc"], + linkstatic = 1, + deps = [ + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "vsyscall_test", + testonly = 1, + srcs = ["vsyscall.cc"], + linkstatic = 1, + deps = [ + "//test/util:proc_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc new file mode 100644 index 000000000..7c6e92317 --- /dev/null +++ b/test/syscalls/linux/accept_bind.cc @@ -0,0 +1,600 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
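Annotation (not part of the patch): the accept_bind.cc tests that follow are parameterized over unbound Unix-domain socket pairs and walk the bind/listen/connect/accept state machine, checking the errno for each out-of-order step (EINVAL for listen without bind, ECONNREFUSED for connect without listen, and so on). For reference, this is the plain libc sequence they exercise; the helper below is an illustrative sketch, not code from the patch:

#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

// Returns a listening AF_UNIX socket bound to `path`, or -1 on error.
int ExampleListener(const char* path) {
  int fd = socket(AF_UNIX, SOCK_STREAM, 0);
  if (fd < 0) return -1;
  struct sockaddr_un addr = {};
  addr.sun_family = AF_UNIX;
  strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
  // bind() must come before listen(); calling listen() again with a different
  // backlog is allowed, which ListenIncreaseBacklog/ListenDecreaseBacklog cover.
  if (bind(fd, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)) < 0 ||
      listen(fd, /*backlog=*/5) < 0) {
    close(fd);
    return -1;
  }
  return fd;  // A peer can now connect(); accept() yields the connected fd.
}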
+ +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST_P(AllSocketPairTest, Listen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, ListenIncreaseBacklog) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5), + SyscallSucceeds()); + ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 10), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, ListenDecreaseBacklog) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5), + SyscallSucceeds()); + ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 1), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, ListenWithoutBind) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(listen(sockets->first_fd(), 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, DoubleBind) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, BindListenBind) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, DoubleListen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, DoubleConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallFailsWithErrno(EISCONN)); +} + +TEST_P(AllSocketPairTest, Connect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + 
SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, ConnectToFilePath) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct sockaddr_un addr = {}; + addr.sun_family = AF_UNIX; + constexpr char kPath[] = "/tmp"; + memcpy(addr.sun_path, kPath, sizeof(kPath)); + + ASSERT_THAT( + connect(sockets->second_fd(), + reinterpret_cast(&addr), sizeof(addr)), + SyscallFailsWithErrno(ECONNREFUSED)); +} + +TEST_P(AllSocketPairTest, ConnectToInvalidAbstractPath) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct sockaddr_un addr = {}; + addr.sun_family = AF_UNIX; + constexpr char kPath[] = "\0nonexistent"; + memcpy(addr.sun_path, kPath, sizeof(kPath)); + + ASSERT_THAT( + connect(sockets->second_fd(), + reinterpret_cast(&addr), sizeof(addr)), + SyscallFailsWithErrno(ECONNREFUSED)); +} + +TEST_P(AllSocketPairTest, SelfConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, ConnectWithoutListen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallFailsWithErrno(ECONNREFUSED)); +} + +TEST_P(AllSocketPairTest, Accept) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + int accepted = -1; + ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr), + SyscallSucceeds()); + ASSERT_THAT(close(accepted), SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, AcceptValidAddrLen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + int accepted = -1; + struct sockaddr_un addr = {}; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT( + accepted = accept(sockets->first_fd(), + reinterpret_cast(&addr), &addr_len), + SyscallSucceeds()); + ASSERT_THAT(close(accepted), SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, AcceptNegativeAddrLen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + // With a negative addr_len, accept returns 
EINVAL, + struct sockaddr_un addr = {}; + socklen_t addr_len = -1; + ASSERT_THAT(accept(sockets->first_fd(), + reinterpret_cast(&addr), &addr_len), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, AcceptLargePositiveAddrLen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + // With a large (positive) addr_len, accept does not return EINVAL. + int accepted = -1; + char addr_buf[200]; + socklen_t addr_len = sizeof(addr_buf); + ASSERT_THAT(accepted = accept(sockets->first_fd(), + reinterpret_cast(addr_buf), + &addr_len), + SyscallSucceeds()); + // addr_len should have been updated by accept(). + EXPECT_LT(addr_len, sizeof(addr_buf)); + ASSERT_THAT(close(accepted), SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, AcceptVeryLargePositiveAddrLen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + // With a large (positive) addr_len, accept does not return EINVAL. + int accepted = -1; + char addr_buf[2000]; + socklen_t addr_len = sizeof(addr_buf); + ASSERT_THAT(accepted = accept(sockets->first_fd(), + reinterpret_cast(addr_buf), + &addr_len), + SyscallSucceeds()); + // addr_len should have been updated by accept(). 
+ EXPECT_LT(addr_len, sizeof(addr_buf)); + ASSERT_THAT(close(accepted), SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, AcceptWithoutBind) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(accept(sockets->first_fd(), nullptr, nullptr), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, AcceptWithoutListen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(accept(sockets->first_fd(), nullptr, nullptr), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, GetRemoteAddress) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + socklen_t addr_len = sockets->first_addr_size(); + struct sockaddr_storage addr = {}; + ASSERT_THAT( + getpeername(sockets->second_fd(), (struct sockaddr*)(&addr), &addr_len), + SyscallSucceeds()); + EXPECT_EQ(addr_len, sockets->first_addr_len()); + EXPECT_EQ(0, memcmp(&addr, sockets->first_addr(), sockets->first_addr_len())); +} + +TEST_P(AllSocketPairTest, UnboundGetLocalAddress) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + socklen_t addr_len = sockets->first_addr_size(); + struct sockaddr_storage addr = {}; + ASSERT_THAT( + getsockname(sockets->second_fd(), (struct sockaddr*)(&addr), &addr_len), + SyscallSucceeds()); + EXPECT_EQ(addr_len, 2); + EXPECT_EQ( + memcmp(&addr, sockets->second_addr(), + std::min((size_t)addr_len, (size_t)sockets->second_addr_len())), + 0); +} + +TEST_P(AllSocketPairTest, BoundGetLocalAddress) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + socklen_t addr_len = sockets->first_addr_size(); + struct sockaddr_storage addr = {}; + ASSERT_THAT( + getsockname(sockets->second_fd(), (struct sockaddr*)(&addr), &addr_len), + SyscallSucceeds()); + EXPECT_EQ(addr_len, sockets->second_addr_len()); + EXPECT_EQ( + memcmp(&addr, sockets->second_addr(), + std::min((size_t)addr_len, (size_t)sockets->second_addr_len())), + 0); +} + +TEST_P(AllSocketPairTest, BoundConnector) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallSucceeds()); + + 
ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, UnboundSenderAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + int accepted = -1; + ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr), + SyscallSucceeds()); + FileDescriptor accepted_fd(accepted); + + int i = 0; + ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0), + SyscallSucceedsWithValue(sizeof(i))); + + struct sockaddr_storage addr; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT( + RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0, + reinterpret_cast(&addr), &addr_len), + SyscallSucceedsWithValue(sizeof(i))); + if (!IsRunningOnGvisor()) { + // Linux returns a zero length for addresses from recvfrom(2) and + // recvmsg(2). This differs from the behavior of getpeername(2) and + // getsockname(2). For simplicity, we use the getpeername(2) and + // getsockname(2) behavior for recvfrom(2) and recvmsg(2). + EXPECT_EQ(addr_len, 0); + return; + } + EXPECT_EQ(addr_len, 2); + EXPECT_EQ( + memcmp(&addr, sockets->second_addr(), + std::min((size_t)addr_len, (size_t)sockets->second_addr_len())), + 0); +} + +TEST_P(AllSocketPairTest, BoundSenderAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + int accepted = -1; + ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr), + SyscallSucceeds()); + FileDescriptor accepted_fd(accepted); + + int i = 0; + ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0), + SyscallSucceedsWithValue(sizeof(i))); + + struct sockaddr_storage addr; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT( + RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0, + reinterpret_cast(&addr), &addr_len), + SyscallSucceedsWithValue(sizeof(i))); + EXPECT_EQ(addr_len, sockets->second_addr_len()); + EXPECT_EQ( + memcmp(&addr, sockets->second_addr(), + std::min((size_t)addr_len, (size_t)sockets->second_addr_len())), + 0); +} + +TEST_P(AllSocketPairTest, BindAfterConnectSenderAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallSucceeds()); + + int accepted = -1; + ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr), + SyscallSucceeds()); + FileDescriptor accepted_fd(accepted); + + int i = 0; + ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, 
sizeof(i), 0), + SyscallSucceedsWithValue(sizeof(i))); + + struct sockaddr_storage addr; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT( + RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0, + reinterpret_cast(&addr), &addr_len), + SyscallSucceedsWithValue(sizeof(i))); + EXPECT_EQ(addr_len, sockets->second_addr_len()); + EXPECT_EQ( + memcmp(&addr, sockets->second_addr(), + std::min((size_t)addr_len, (size_t)sockets->second_addr_len())), + 0); +} + +TEST_P(AllSocketPairTest, BindAfterAcceptSenderAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + int accepted = -1; + ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr), + SyscallSucceeds()); + FileDescriptor accepted_fd(accepted); + + ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallSucceeds()); + + int i = 0; + ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0), + SyscallSucceedsWithValue(sizeof(i))); + + struct sockaddr_storage addr; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT( + RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0, + reinterpret_cast(&addr), &addr_len), + SyscallSucceedsWithValue(sizeof(i))); + EXPECT_EQ(addr_len, sockets->second_addr_len()); + EXPECT_EQ( + memcmp(&addr, sockets->second_addr(), + std::min((size_t)addr_len, (size_t)sockets->second_addr_len())), + 0); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(VecCat( + ApplyVec( + FilesystemUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc new file mode 100644 index 000000000..f7113a6fc --- /dev/null +++ b/test/syscalls/linux/accept_bind_stream.cc @@ -0,0 +1,93 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
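Annotation (not part of the patch): accept_bind_stream.cc below restricts the same socket-pair matrix to SOCK_STREAM, where payloads carry no message boundaries, so the BoundSenderAddrCoalesced case can send twice and read both payloads back with a single recvfrom(). An illustrative sketch of that coalescing property (not from the patch; a stream recv() is also permitted to return fewer bytes, so robust code loops):

#include <sys/socket.h>
#include <unistd.h>

bool ExampleCoalesce() {
  int fds[2];
  if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) != 0) return false;
  int one = 1, two = 2;
  send(fds[0], &one, sizeof(one), 0);
  send(fds[0], &two, sizeof(two), 0);
  int buf[2] = {0, 0};
  ssize_t n = recv(fds[1], buf, sizeof(buf), 0);  // May return both at once.
  close(fds[0]);
  close(fds[1]);
  return n == static_cast<ssize_t>(sizeof(buf));
}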
+ +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST_P(AllSocketPairTest, BoundSenderAddrCoalesced) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + int accepted = -1; + ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr), + SyscallSucceeds()); + FileDescriptor closer(accepted); + + int i = 0; + ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0), + SyscallSucceedsWithValue(sizeof(i))); + + ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(), + sockets->second_addr_size()), + SyscallSucceeds()); + + i = 0; + ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0), + SyscallSucceedsWithValue(sizeof(i))); + + int ri[2] = {0, 0}; + struct sockaddr_storage addr; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT( + RetryEINTR(recvfrom)(accepted, ri, sizeof(ri), 0, + reinterpret_cast(&addr), &addr_len), + SyscallSucceedsWithValue(sizeof(ri))); + EXPECT_EQ(addr_len, sockets->second_addr_len()); + + EXPECT_EQ( + memcmp(&addr, sockets->second_addr(), + std::min((size_t)addr_len, (size_t)sockets->second_addr_len())), + 0); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(VecCat( + ApplyVec( + FilesystemUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/access.cc b/test/syscalls/linux/access.cc new file mode 100644 index 000000000..6ea070a5d --- /dev/null +++ b/test/syscalls/linux/access.cc @@ -0,0 +1,170 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
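Annotation (not part of the patch): access.cc below exercises access(2) on files created with specific modes after dropping CAP_DAC_OVERRIDE and CAP_DAC_READ_SEARCH, since those capabilities would otherwise mask permission failures. A minimal reference sketch of the call itself (illustrative only):

#include <unistd.h>
#include <cstdio>

void ExampleAccess(const char* path) {
  // access(2) checks the real (not effective) uid/gid against the file mode,
  // so a 0400 file is readable but neither writable nor executable by its owner.
  if (access(path, R_OK) == 0) std::printf("%s is readable\n", path);
  if (access(path, W_OK) != 0) std::printf("%s is not writable\n", path);
  if (access(path, R_OK | X_OK) != 0) std::printf("%s is not read+exec\n", path);
}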
+ +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +using ::testing::Ge; + +namespace gvisor { +namespace testing { + +namespace { + +class AccessTest : public ::testing::Test { + public: + std::string CreateTempFile(int perm) { + const std::string path = NewTempAbsPath(); + const int fd = open(path.c_str(), O_CREAT | O_RDONLY, perm); + TEST_PCHECK(fd > 0); + TEST_PCHECK(close(fd) == 0); + return path; + } + + protected: + // SetUp creates various configurations of files. + void SetUp() override { + // Move to the temporary directory. This allows us to reason more easily + // about absolute and relative paths. + ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); + + // Create an empty file, standard permissions. + relfile_ = NewTempRelPath(); + int fd; + ASSERT_THAT(fd = open(relfile_.c_str(), O_CREAT | O_TRUNC, 0644), + SyscallSucceedsWithValue(Ge(0))); + ASSERT_THAT(close(fd), SyscallSucceeds()); + absfile_ = GetAbsoluteTestTmpdir() + "/" + relfile_; + + // Create an empty directory, no writable permissions. + absdir_ = NewTempAbsPath(); + reldir_ = JoinPath(Basename(absdir_), ""); + ASSERT_THAT(mkdir(reldir_.c_str(), 0555), SyscallSucceeds()); + + // This file doesn't exist. + relnone_ = NewTempRelPath(); + absnone_ = GetAbsoluteTestTmpdir() + "/" + relnone_; + } + + // TearDown unlinks created files. + void TearDown() override { + ASSERT_THAT(unlink(absfile_.c_str()), SyscallSucceeds()); + ASSERT_THAT(rmdir(absdir_.c_str()), SyscallSucceeds()); + } + + std::string relfile_; + std::string reldir_; + + std::string absfile_; + std::string absdir_; + + std::string relnone_; + std::string absnone_; +}; + +TEST_F(AccessTest, RelativeFile) { + EXPECT_THAT(access(relfile_.c_str(), R_OK), SyscallSucceeds()); +} + +TEST_F(AccessTest, RelativeDir) { + EXPECT_THAT(access(reldir_.c_str(), R_OK | X_OK), SyscallSucceeds()); +} + +TEST_F(AccessTest, AbsFile) { + EXPECT_THAT(access(absfile_.c_str(), R_OK), SyscallSucceeds()); +} + +TEST_F(AccessTest, AbsDir) { + EXPECT_THAT(access(absdir_.c_str(), R_OK | X_OK), SyscallSucceeds()); +} + +TEST_F(AccessTest, RelDoesNotExist) { + EXPECT_THAT(access(relnone_.c_str(), R_OK), SyscallFailsWithErrno(ENOENT)); +} + +TEST_F(AccessTest, AbsDoesNotExist) { + EXPECT_THAT(access(absnone_.c_str(), R_OK), SyscallFailsWithErrno(ENOENT)); +} + +TEST_F(AccessTest, InvalidMode) { + EXPECT_THAT(access(relfile_.c_str(), 0xffffffff), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(AccessTest, NoPerms) { + // Drop capabilities that allow us to override permissions. We must drop + // PERMITTED because access() checks those instead of EFFECTIVE. + ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_OVERRIDE)); + ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_READ_SEARCH)); + + EXPECT_THAT(access(absdir_.c_str(), W_OK), SyscallFailsWithErrno(EACCES)); +} + +TEST_F(AccessTest, InvalidName) { + EXPECT_THAT(access(reinterpret_cast(0x1234), W_OK), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(AccessTest, UsrReadOnly) { + // Drop capabilities that allow us to override permissions. We must drop + // PERMITTED because access() checks those instead of EFFECTIVE. 
+ ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_OVERRIDE)); + ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_READ_SEARCH)); + + const std::string filename = CreateTempFile(0400); + EXPECT_THAT(access(filename.c_str(), R_OK), SyscallSucceeds()); + EXPECT_THAT(access(filename.c_str(), W_OK), SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(access(filename.c_str(), X_OK), SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds()); +} + +TEST_F(AccessTest, UsrReadExec) { + // Drop capabilities that allow us to override permissions. We must drop + // PERMITTED because access() checks those instead of EFFECTIVE. + ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_OVERRIDE)); + ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_READ_SEARCH)); + + const std::string filename = CreateTempFile(0500); + EXPECT_THAT(access(filename.c_str(), R_OK | X_OK), SyscallSucceeds()); + EXPECT_THAT(access(filename.c_str(), W_OK), SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds()); +} + +TEST_F(AccessTest, UsrReadWrite) { + const std::string filename = CreateTempFile(0600); + EXPECT_THAT(access(filename.c_str(), R_OK | W_OK), SyscallSucceeds()); + EXPECT_THAT(access(filename.c_str(), X_OK), SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds()); +} + +TEST_F(AccessTest, UsrReadWriteExec) { + const std::string filename = CreateTempFile(0700); + EXPECT_THAT(access(filename.c_str(), R_OK | W_OK | X_OK), SyscallSucceeds()); + EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/affinity.cc b/test/syscalls/linux/affinity.cc new file mode 100644 index 000000000..8a16343d5 --- /dev/null +++ b/test/syscalls/linux/affinity.cc @@ -0,0 +1,241 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_split.h" +#include "test/util/cleanup.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { +namespace { + +// These tests are for both the sched_getaffinity(2) and sched_setaffinity(2) +// syscalls. +class AffinityTest : public ::testing::Test { + protected: + void SetUp() override { + EXPECT_THAT( + // Needs use the raw syscall to get the actual size. + cpuset_size_ = syscall(SYS_sched_getaffinity, /*pid=*/0, + sizeof(cpu_set_t), &mask_), + SyscallSucceeds()); + // Lots of tests rely on having more than 1 logical processor available. 
+ EXPECT_GT(CPU_COUNT(&mask_), 1); + } + + static PosixError ClearLowestBit(cpu_set_t* mask, size_t cpus) { + const size_t mask_size = CPU_ALLOC_SIZE(cpus); + for (size_t n = 0; n < cpus; ++n) { + if (CPU_ISSET_S(n, mask_size, mask)) { + CPU_CLR_S(n, mask_size, mask); + return NoError(); + } + } + return PosixError(EINVAL, "No bit to clear, mask is empty"); + } + + PosixError ClearLowestBit() { return ClearLowestBit(&mask_, CPU_SETSIZE); } + + // Stores the initial cpu mask for this process. + cpu_set_t mask_ = {}; + int cpuset_size_ = 0; +}; + +// sched_getaffinity(2) is implemented. +TEST_F(AffinityTest, SchedGetAffinityImplemented) { + EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_), + SyscallSucceeds()); +} + +// PID is not found. +TEST_F(AffinityTest, SchedGetAffinityInvalidPID) { + // Flaky, but it's tough to avoid a race condition when finding an unused pid + EXPECT_THAT(sched_getaffinity(/*pid=*/INT_MAX - 1, sizeof(cpu_set_t), &mask_), + SyscallFailsWithErrno(ESRCH)); +} + +// PID is not found. +TEST_F(AffinityTest, SchedSetAffinityInvalidPID) { + // Flaky, but it's tough to avoid a race condition when finding an unused pid + EXPECT_THAT(sched_setaffinity(/*pid=*/INT_MAX - 1, sizeof(cpu_set_t), &mask_), + SyscallFailsWithErrno(ESRCH)); +} + +TEST_F(AffinityTest, SchedSetAffinityZeroMask) { + CPU_ZERO(&mask_); + EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_), + SyscallFailsWithErrno(EINVAL)); +} + +// N.B. This test case relies on cpuset_size_ larger than the actual number of +// of all existing CPUs. Check your machine if the test fails. +TEST_F(AffinityTest, SchedSetAffinityNonexistentCPUDropped) { + cpu_set_t mask = mask_; + // Add a nonexistent CPU. + // + // The number needs to be larger than the possible number of CPU available, + // but smaller than the number of the CPU that the kernel claims to support -- + // it's implicitly returned by raw sched_getaffinity syscall. + CPU_SET(cpuset_size_ * 8 - 1, &mask); + EXPECT_THAT( + // Use raw syscall because it will be rejected by the libc wrapper + // otherwise. + syscall(SYS_sched_setaffinity, /*pid=*/0, sizeof(cpu_set_t), &mask), + SyscallSucceeds()) + << "failed with cpumask : " << CPUSetToString(mask) + << ", cpuset_size_ : " << cpuset_size_; + cpu_set_t newmask; + EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &newmask), + SyscallSucceeds()); + EXPECT_TRUE(CPU_EQUAL(&mask_, &newmask)) + << "got: " << CPUSetToString(newmask) + << " != expected: " << CPUSetToString(mask_); +} + +TEST_F(AffinityTest, SchedSetAffinityOnlyNonexistentCPUFails) { + // Make an empty cpu set. + CPU_ZERO(&mask_); + // Add a nonexistent CPU. + // + // The number needs to be larger than the possible number of CPU available, + // but smaller than the number of the CPU that the kernel claims to support -- + // it's implicitly returned by raw sched_getaffinity syscall. + int cpu = cpuset_size_ * 8 - 1; + if (cpu <= NumCPUs()) { + LOG(INFO) << "Skipping test: cpu " << cpu << " exists"; + return; + } + CPU_SET(cpu, &mask_); + EXPECT_THAT( + // Use raw syscall because it will be rejected by the libc wrapper + // otherwise. + syscall(SYS_sched_setaffinity, /*pid=*/0, sizeof(cpu_set_t), &mask_), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(AffinityTest, SchedSetAffinityInvalidSize) { + EXPECT_GT(cpuset_size_, 0); + // Not big enough. + EXPECT_THAT(sched_getaffinity(/*pid=*/0, cpuset_size_ - 1, &mask_), + SyscallFailsWithErrno(EINVAL)); + // Not a multiple of word size. 
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, cpuset_size_ + 1, &mask_), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(AffinityTest, Sanity) { + ASSERT_NO_ERRNO(ClearLowestBit()); + EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_), + SyscallSucceeds()); + cpu_set_t newmask; + EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &newmask), + SyscallSucceeds()); + EXPECT_TRUE(CPU_EQUAL(&mask_, &newmask)) + << "got: " << CPUSetToString(newmask) + << " != expected: " << CPUSetToString(mask_); +} + +TEST_F(AffinityTest, NewThread) { + ASSERT_NO_ERRNO(ClearLowestBit()); + ASSERT_NO_ERRNO(ClearLowestBit()); + EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_), + SyscallSucceeds()); + ScopedThread([this]() { + cpu_set_t child_mask; + ASSERT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &child_mask), + SyscallSucceeds()); + ASSERT_TRUE(CPU_EQUAL(&child_mask, &mask_)) + << "child cpu mask: " << CPUSetToString(child_mask) + << " != parent cpu mask: " << CPUSetToString(mask_); + }); +} + +TEST_F(AffinityTest, ConsistentWithProcCpuInfo) { + // Count how many cpus are shown in /proc/cpuinfo. + std::string cpuinfo = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/cpuinfo")); + int count = 0; + for (auto const& line : absl::StrSplit(cpuinfo, '\n')) { + if (absl::StartsWith(line, "processor")) { + count++; + } + } + EXPECT_GE(count, CPU_COUNT(&mask_)); +} + +TEST_F(AffinityTest, ConsistentWithProcStat) { + // Count how many cpus are shown in /proc/stat. + std::string stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat")); + int count = 0; + for (auto const& line : absl::StrSplit(stat, '\n')) { + if (absl::StartsWith(line, "cpu") && !absl::StartsWith(line, "cpu ")) { + count++; + } + } + EXPECT_GE(count, CPU_COUNT(&mask_)); +} + +TEST_F(AffinityTest, SmallCpuMask) { + const int num_cpus = NumCPUs(); + const size_t mask_size = CPU_ALLOC_SIZE(num_cpus); + cpu_set_t* mask = CPU_ALLOC(num_cpus); + ASSERT_NE(mask, nullptr); + const auto free_mask = Cleanup([&] { CPU_FREE(mask); }); + + CPU_ZERO_S(mask_size, mask); + ASSERT_THAT(sched_getaffinity(0, mask_size, mask), SyscallSucceeds()); +} + +TEST_F(AffinityTest, LargeCpuMask) { + // Allocate mask bigger than cpu_set_t normally allocates. + const size_t cpus = CPU_SETSIZE * 8; + const size_t mask_size = CPU_ALLOC_SIZE(cpus); + + cpu_set_t* large_mask = CPU_ALLOC(cpus); + auto free_mask = Cleanup([large_mask] { CPU_FREE(large_mask); }); + CPU_ZERO_S(mask_size, large_mask); + + // Check that get affinity with large mask works as expected. + ASSERT_THAT(sched_getaffinity(/*pid=*/0, mask_size, large_mask), + SyscallSucceeds()); + EXPECT_TRUE(CPU_EQUAL(&mask_, large_mask)) + << "got: " << CPUSetToString(*large_mask, cpus) + << " != expected: " << CPUSetToString(mask_); + + // Check that set affinity with large mask works as expected. 
+ ASSERT_NO_ERRNO(ClearLowestBit(large_mask, cpus)); + EXPECT_THAT(sched_setaffinity(/*pid=*/0, mask_size, large_mask), + SyscallSucceeds()); + + cpu_set_t* new_mask = CPU_ALLOC(cpus); + auto free_new_mask = Cleanup([new_mask] { CPU_FREE(new_mask); }); + CPU_ZERO_S(mask_size, new_mask); + EXPECT_THAT(sched_getaffinity(/*pid=*/0, mask_size, new_mask), + SyscallSucceeds()); + + EXPECT_TRUE(CPU_EQUAL_S(mask_size, large_mask, new_mask)) + << "got: " << CPUSetToString(*new_mask, cpus) + << " != expected: " << CPUSetToString(*large_mask, cpus); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc new file mode 100644 index 000000000..cc5392223 --- /dev/null +++ b/test/syscalls/linux/aio.cc @@ -0,0 +1,433 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +constexpr char kData[] = "hello world!"; + +int SubmitCtx(aio_context_t ctx, long nr, struct iocb** iocbpp) { + return syscall(__NR_io_submit, ctx, nr, iocbpp); +} + +} // namespace + +class AIOTest : public FileTest { + public: + AIOTest() : ctx_(0) {} + + int SetupContext(unsigned int nr) { + return syscall(__NR_io_setup, nr, &ctx_); + } + + int Submit(long nr, struct iocb** iocbpp) { + return SubmitCtx(ctx_, nr, iocbpp); + } + + int GetEvents(long min, long max, struct io_event* events, + struct timespec* timeout) { + return RetryEINTR(syscall)(__NR_io_getevents, ctx_, min, max, events, + timeout); + } + + int DestroyContext() { return syscall(__NR_io_destroy, ctx_); } + + void TearDown() override { + FileTest::TearDown(); + if (ctx_ != 0) { + ASSERT_THAT(DestroyContext(), SyscallSucceeds()); + } + } + + struct iocb CreateCallback() { + struct iocb cb = {}; + cb.aio_data = 0x123; + cb.aio_fildes = test_file_fd_.get(); + cb.aio_lio_opcode = IOCB_CMD_PWRITE; + cb.aio_buf = reinterpret_cast(kData); + cb.aio_offset = 0; + cb.aio_nbytes = strlen(kData); + return cb; + } + + protected: + aio_context_t ctx_; +}; + +TEST_F(AIOTest, BasicWrite) { + // Copied from fs/aio.c. + constexpr unsigned AIO_RING_MAGIC = 0xa10a10a1; + struct aio_ring { + unsigned id; + unsigned nr; + unsigned head; + unsigned tail; + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; + struct io_event io_events[0]; + }; + + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + // Check that 'ctx_' points to a valid address. libaio uses it to check if + // aio implementation uses aio_ring. gVisor doesn't and returns all zeroes. + // Linux implements aio_ring, so skip the zeroes check. 
+ // + // TODO: Remove when gVisor implements aio_ring. + auto ring = reinterpret_cast(ctx_); + auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC; + EXPECT_EQ(ring->magic, magic); + + struct iocb cb = CreateCallback(); + struct iocb* cbs[1] = {&cb}; + + // Submit the request. + ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1)); + + // Get the reply. + struct io_event events[1]; + ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1)); + + // Verify that it is as expected. + EXPECT_EQ(events[0].data, 0x123); + EXPECT_EQ(events[0].obj, reinterpret_cast(&cb)); + EXPECT_EQ(events[0].res, strlen(kData)); + + // Verify that the file contains the contents. + char verify_buf[32] = {}; + ASSERT_THAT(read(test_file_fd_.get(), &verify_buf[0], strlen(kData)), + SyscallSucceeds()); + EXPECT_EQ(strcmp(kData, &verify_buf[0]), 0); +} + +TEST_F(AIOTest, BadWrite) { + // Create a pipe and immediately close the read end. + int pipefd[2]; + ASSERT_THAT(pipe(pipefd), SyscallSucceeds()); + + FileDescriptor rfd(pipefd[0]); + FileDescriptor wfd(pipefd[1]); + + rfd.reset(); // Close the read end. + + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + struct iocb cb = CreateCallback(); + // Try to write to the read end. + cb.aio_fildes = wfd.get(); + struct iocb* cbs[1] = {&cb}; + + // Submit the request. + ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1)); + + // Get the reply. + struct io_event events[1]; + ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1)); + + // Verify that it fails with the right error code. + EXPECT_EQ(events[0].data, 0x123); + EXPECT_EQ(events[0].obj, reinterpret_cast(&cb)); + EXPECT_LT(events[0].res, 0); +} + +TEST_F(AIOTest, ExitWithPendingIo) { + // Setup a context that is 5 entries deep. + ASSERT_THAT(SetupContext(5), SyscallSucceeds()); + + struct iocb cb = CreateCallback(); + struct iocb* cbs[] = {&cb}; + + // Submit a request but don't complete it to make it pending. + EXPECT_THAT(Submit(1, cbs), SyscallSucceeds()); +} + +int Submitter(void* arg) { + auto test = reinterpret_cast(arg); + + struct iocb cb = test->CreateCallback(); + struct iocb* cbs[1] = {&cb}; + + // Submit the request. + TEST_CHECK(test->Submit(1, cbs) == 1); + return 0; +} + +TEST_F(AIOTest, CloneVm) { + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + const size_t kStackSize = 5 * kPageSize; + std::unique_ptr stack(new char[kStackSize]); + char* bp = stack.get() + kStackSize; + pid_t child; + ASSERT_THAT(child = clone(Submitter, bp, CLONE_VM | SIGCHLD, + reinterpret_cast(this)), + SyscallSucceeds()); + + // Get the reply. + struct io_event events[1]; + ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1)); + + // Verify that it is as expected. + EXPECT_EQ(events[0].data, 0x123); + EXPECT_EQ(events[0].res, strlen(kData)); + + // Verify that the file contains the contents. + char verify_buf[32] = {}; + ASSERT_THAT(read(test_file_fd_.get(), &verify_buf[0], strlen(kData)), + SyscallSucceeds()); + EXPECT_EQ(strcmp(kData, &verify_buf[0]), 0); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +// Tests that AIO context can be remapped to a different address. +TEST_F(AIOTest, Mremap) { + // Setup a context that is 128 entries deep. 
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + struct iocb cb = CreateCallback(); + struct iocb* cbs[1] = {&cb}; + + // Reserve address space for the mremap target so we have something safe to + // map over. + // + // N.B. We reserve 2 pages because we'll attempt to remap to 2 pages below. + // That should fail with EFAULT, but will fail with EINVAL if this mmap + // returns the page immediately below ctx_, as + // [new_address, new_address+2*kPageSize) overlaps [ctx_, ctx_+kPageSize). + void* new_address = mmap(nullptr, 2 * kPageSize, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_THAT(reinterpret_cast(new_address), SyscallSucceeds()); + auto mmap_cleanup = Cleanup([new_address] { + EXPECT_THAT(munmap(new_address, 2 * kPageSize), SyscallSucceeds()); + }); + + // Test that remapping to a larger address fails. + void* res = mremap(reinterpret_cast(ctx_), kPageSize, 2 * kPageSize, + MREMAP_FIXED | MREMAP_MAYMOVE, new_address); + ASSERT_THAT(reinterpret_cast(res), SyscallFailsWithErrno(EFAULT)); + + // Remap context 'handle' to a different address. + res = mremap(reinterpret_cast(ctx_), kPageSize, kPageSize, + MREMAP_FIXED | MREMAP_MAYMOVE, new_address); + ASSERT_THAT( + reinterpret_cast(res), + SyscallSucceedsWithValue(reinterpret_cast(new_address))); + mmap_cleanup.Release(); + aio_context_t old_ctx = ctx_; + ctx_ = reinterpret_cast(new_address); + + // Check that submitting the request with the old 'ctx_' fails. + ASSERT_THAT(SubmitCtx(old_ctx, 1, cbs), SyscallFailsWithErrno(EINVAL)); + + // Submit the request with the new 'ctx_'. + ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1)); + + // Remap again. + new_address = + mmap(nullptr, kPageSize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_THAT(reinterpret_cast(new_address), SyscallSucceeds()); + auto mmap_cleanup2 = Cleanup([new_address] { + EXPECT_THAT(munmap(new_address, kPageSize), SyscallSucceeds()); + }); + res = mremap(reinterpret_cast(ctx_), kPageSize, kPageSize, + MREMAP_FIXED | MREMAP_MAYMOVE, new_address); + ASSERT_THAT(reinterpret_cast(res), + SyscallSucceedsWithValue(reinterpret_cast(new_address))); + mmap_cleanup2.Release(); + ctx_ = reinterpret_cast(new_address); + + // Get the reply with yet another 'ctx_' and verify it. + struct io_event events[1]; + ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1)); + EXPECT_EQ(events[0].data, 0x123); + EXPECT_EQ(events[0].obj, reinterpret_cast(&cb)); + EXPECT_EQ(events[0].res, strlen(kData)); + + // Verify that the file contains the contents. + char verify_buf[32] = {}; + ASSERT_THAT(read(test_file_fd_.get(), &verify_buf[0], strlen(kData)), + SyscallSucceeds()); + EXPECT_EQ(strcmp(kData, &verify_buf[0]), 0); +} + +// Tests that AIO context can be replaced with a different mapping at the same +// address and continue working. Don't ask why, but Linux allows it. +TEST_F(AIOTest, MremapOver) { + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + struct iocb cb = CreateCallback(); + struct iocb* cbs[1] = {&cb}; + + ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1)); + + // Allocate a new VMA, copy 'ctx_' content over, and remap it on top + // of 'ctx_'. 
+ void* new_address = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_THAT(reinterpret_cast(new_address), SyscallSucceeds()); + auto mmap_cleanup = Cleanup([new_address] { + EXPECT_THAT(munmap(new_address, kPageSize), SyscallSucceeds()); + }); + + memcpy(new_address, reinterpret_cast(ctx_), kPageSize); + void* res = + mremap(new_address, kPageSize, kPageSize, MREMAP_FIXED | MREMAP_MAYMOVE, + reinterpret_cast(ctx_)); + ASSERT_THAT(reinterpret_cast(res), SyscallSucceedsWithValue(ctx_)); + mmap_cleanup.Release(); + + // Everything continues to work just fine. + struct io_event events[1]; + ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1)); + EXPECT_EQ(events[0].data, 0x123); + EXPECT_EQ(events[0].obj, reinterpret_cast(&cb)); + EXPECT_EQ(events[0].res, strlen(kData)); + + // Verify that the file contains the contents. + char verify_buf[32] = {}; + ASSERT_THAT(read(test_file_fd_.get(), &verify_buf[0], strlen(kData)), + SyscallSucceeds()); + EXPECT_EQ(strcmp(kData, &verify_buf[0]), 0); +} + +// Tests that AIO calls fail if context's address is inaccessible. +TEST_F(AIOTest, Mprotect) { + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + struct iocb cb = CreateCallback(); + struct iocb* cbs[1] = {&cb}; + + ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1)); + + // Makes the context 'handle' inaccessible and check that all subsequent + // calls fail. + ASSERT_THAT(mprotect(reinterpret_cast(ctx_), kPageSize, PROT_NONE), + SyscallSucceeds()); + struct io_event events[1]; + EXPECT_THAT(GetEvents(1, 1, events, nullptr), SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(DestroyContext(), SyscallFailsWithErrno(EINVAL)); + + // Prevent TearDown from attempting to destroy the context and fail. + ctx_ = 0; +} + +TEST_F(AIOTest, Timeout) { + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + struct timespec timeout; + timeout.tv_sec = 0; + timeout.tv_nsec = 10; + struct io_event events[1]; + ASSERT_THAT(GetEvents(1, 1, events, &timeout), SyscallSucceedsWithValue(0)); +} + +class AIOReadWriteParamTest : public AIOTest, + public ::testing::WithParamInterface {}; + +TEST_P(AIOReadWriteParamTest, BadOffset) { + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + struct iocb cb = CreateCallback(); + struct iocb* cbs[1] = {&cb}; + + // Create a buffer that we can write to. + char buf[] = "hello world!"; + cb.aio_buf = reinterpret_cast(buf); + + // Set the operation on the callback and give a negative offset. + const int opcode = GetParam(); + cb.aio_lio_opcode = opcode; + + iovec iov = {}; + if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) { + // Create a valid iovec and set it in the callback. + iov.iov_base = reinterpret_cast(buf); + iov.iov_len = 1; + cb.aio_buf = reinterpret_cast(&iov); + // aio_nbytes is the number of iovecs. + cb.aio_nbytes = 1; + } + + // Pass a negative offset. + cb.aio_offset = -1; + + // Should get error on submission. 
+ ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EINVAL)); +} + +INSTANTIATE_TEST_CASE_P(BadOffset, AIOReadWriteParamTest, + ::testing::Values(IOCB_CMD_PREAD, IOCB_CMD_PWRITE, + IOCB_CMD_PREADV, IOCB_CMD_PWRITEV)); + +class AIOVectorizedParamTest : public AIOTest, + public ::testing::WithParamInterface {}; + +TEST_P(AIOVectorizedParamTest, BadIOVecs) { + // Setup a context that is 128 entries deep. + ASSERT_THAT(SetupContext(128), SyscallSucceeds()); + + struct iocb cb = CreateCallback(); + struct iocb* cbs[1] = {&cb}; + + // Modify the callback to use the operation from the param. + cb.aio_lio_opcode = GetParam(); + + // Create an iovec with address in kernel range, and pass that as the buffer. + iovec iov = {}; + iov.iov_base = reinterpret_cast(0xFFFFFFFF00000000); + iov.iov_len = 1; + cb.aio_buf = reinterpret_cast(&iov); + // aio_nbytes is the number of iovecs. + cb.aio_nbytes = 1; + + // Should get error on submission. + ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EFAULT)); +} + +INSTANTIATE_TEST_CASE_P(BadIOVecs, AIOVectorizedParamTest, + ::testing::Values(IOCB_CMD_PREADV, IOCB_CMD_PWRITEV)); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc new file mode 100644 index 000000000..e0ddbb415 --- /dev/null +++ b/test/syscalls/linux/alarm.cc @@ -0,0 +1,193 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/logging.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// N.B. Below, main blocks SIGALRM. Test cases must unblock it if they want +// delivery. + +void do_nothing_handler(int sig, siginfo_t* siginfo, void* arg) {} + +// No random save as the test relies on alarm timing. Cooperative save tests +// already cover the save between alarm and read. +TEST(AlarmTest, Interrupt_NoRandomSave) { + int pipe_fds[2]; + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + + FileDescriptor read_fd(pipe_fds[0]); + FileDescriptor write_fd(pipe_fds[1]); + + // Use a signal handler that interrupts but does nothing rather than using the + // default terminate action. + struct sigaction sa; + sa.sa_sigaction = do_nothing_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = 0; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + // Actually allow SIGALRM delivery. + auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM)); + + // Alarm in 20 second, which should be well after read blocks below. + ASSERT_THAT(alarm(20), SyscallSucceeds()); + + char buf; + ASSERT_THAT(read(read_fd.get(), &buf, 1), SyscallFailsWithErrno(EINTR)); +} + +/* Count of the number of SIGALARMS handled. 
*/ +static volatile int alarms_received = 0; + +void inc_alarms_handler(int sig, siginfo_t* siginfo, void* arg) { + alarms_received++; +} + +// No random save as the test relies on alarm timing. Cooperative save tests +// already cover the save between alarm and read. +TEST(AlarmTest, Restart_NoRandomSave) { + alarms_received = 0; + + int pipe_fds[2]; + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + + FileDescriptor read_fd(pipe_fds[0]); + // Write end closed by thread below. + + struct sigaction sa; + sa.sa_sigaction = inc_alarms_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + // Spawn a thread to eventually unblock the read below. + ScopedThread t([pipe_fds] { + absl::SleepFor(absl::Seconds(30)); + EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds()); + }); + + // Actually allow SIGALRM delivery. + auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM)); + + // Alarm in 20 second, which should be well after read blocks below, but + // before it returns. + ASSERT_THAT(alarm(20), SyscallSucceeds()); + + // Read and eventually get an EOF from the writer closing. If SA_RESTART + // didn't work, then the alarm would not have fired and we wouldn't increment + // our alarms_received count in our signal handler, or we would have not + // restarted the syscall gracefully, which we expect below in order to be + // able to get the final EOF on the pipe. + char buf; + ASSERT_THAT(read(read_fd.get(), &buf, 1), SyscallSucceeds()); + EXPECT_EQ(alarms_received, 1); + + t.Join(); +} + +// No random save as the test relies on alarm timing. Cooperative save tests +// already cover the save between alarm and pause. +TEST(AlarmTest, SaSiginfo_NoRandomSave) { + // Use a signal handler that interrupts but does nothing rather than using the + // default terminate action. + struct sigaction sa; + sa.sa_sigaction = do_nothing_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + // Actually allow SIGALRM delivery. + auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM)); + + // Alarm in 20 second, which should be well after pause blocks below. + ASSERT_THAT(alarm(20), SyscallSucceeds()); + ASSERT_THAT(pause(), SyscallFailsWithErrno(EINTR)); +} + +// No random save as the test relies on alarm timing. Cooperative save tests +// already cover the save between alarm and pause. +TEST(AlarmTest, SaInterrupt_NoRandomSave) { + // Use a signal handler that interrupts but does nothing rather than using the + // default terminate action. + struct sigaction sa; + sa.sa_sigaction = do_nothing_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_INTERRUPT; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + // Actually allow SIGALRM delivery. + auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM)); + + // Alarm in 20 second, which should be well after pause blocks below. + ASSERT_THAT(alarm(20), SyscallSucceeds()); + ASSERT_THAT(pause(), SyscallFailsWithErrno(EINTR)); +} + +TEST(AlarmTest, UserModeSpinning) { + alarms_received = 0; + + struct sigaction sa = {}; + sa.sa_sigaction = inc_alarms_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + // Actually allow SIGALRM delivery. 
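The Restart_NoRandomSave case above hinges on SA_RESTART: with the flag, the interrupted read(2) is transparently restarted after the handler returns; without it, read(2) fails with EINTR. A condensed sketch of the two registrations, separate from the test code:

#include <signal.h>
#include <unistd.h>
#include <cerrno>

static void Handler(int) {}  // Interrupts the syscall, does nothing else.

// With restart == true, blocking reads resume after the handler returns and
// the caller never observes EINTR; with restart == false, they fail with
// errno == EINTR.
void InstallAlarmHandler(bool restart) {
  struct sigaction sa = {};
  sa.sa_handler = Handler;
  sigemptyset(&sa.sa_mask);
  sa.sa_flags = restart ? SA_RESTART : 0;
  sigaction(SIGALRM, &sa, nullptr);
}

ssize_t BlockingRead(int fd, char* buf, size_t len) {
  ssize_t n = read(fd, buf, len);
  if (n < 0 && errno == EINTR) {
    // Only reachable when the handler was installed without SA_RESTART.
  }
  return n;
}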
+ auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM)); + + // Alarm in 20 second, which should be well into the loop below. + ASSERT_THAT(alarm(20), SyscallSucceeds()); + // Make sure that the signal gets delivered even if we are spinning in user + // mode when it arrives. + while (!alarms_received) { + } +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + // These tests depend on delivering SIGALRM to the main thread. Block SIGALRM + // so that any other threads created by TestInit will also have SIGALRM + // blocked. + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGALRM); + TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0); + + gvisor::testing::TestInit(&argc, &argv); + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc new file mode 100644 index 000000000..5687ceb86 --- /dev/null +++ b/test/syscalls/linux/arch_prctl.cc @@ -0,0 +1,48 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +// glibc does not provide a prototype for arch_prctl() so declare it here. +extern "C" int arch_prctl(int code, uintptr_t addr); + +namespace gvisor { +namespace testing { + +namespace { + +TEST(ArchPrctlTest, GetSetFS) { + uintptr_t orig; + const uintptr_t kNonCanonicalFsbase = 0x4141414142424242; + + // Get the original FS.base and then set it to the same value (this is + // intentional because FS.base is the TLS pointer so we cannot change it + // arbitrarily). + ASSERT_THAT(arch_prctl(ARCH_GET_FS, reinterpret_cast(&orig)), + SyscallSucceeds()); + ASSERT_THAT(arch_prctl(ARCH_SET_FS, orig), SyscallSucceeds()); + + // Trying to set FS.base to a non-canonical value should return an error. + ASSERT_THAT(arch_prctl(ARCH_SET_FS, kNonCanonicalFsbase), + SyscallFailsWithErrno(EPERM)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc new file mode 100644 index 000000000..a2634a8bf --- /dev/null +++ b/test/syscalls/linux/bad.cc @@ -0,0 +1,39 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
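arch_prctl() has to be declared by hand in arch_prctl.cc because glibc exports no prototype for it. An equivalent way to reach it is the raw syscall, sketched here for x86_64 only; the output formatting is illustrative:

#include <asm/prctl.h>     // ARCH_GET_FS
#include <sys/syscall.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t fsbase = 0;
  // ARCH_GET_FS writes the current FS segment base to the given address.
  if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) != 0) {
    perror("arch_prctl");
    return 1;
  }
  // FS.base is the TLS pointer, so it is always nonzero in a glibc program.
  printf("FS.base = %#lx\n", static_cast<unsigned long>(fsbase));
  return 0;
}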
+ +#include +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(BadSyscallTest, NotImplemented) { + // get_kernel_syms is not supported in Linux > 2.6, and not implemented in + // gVisor. + EXPECT_THAT(syscall(SYS_get_kernel_syms), SyscallFailsWithErrno(ENOSYS)); +} + +TEST(BadSyscallTest, NegativeOne) { + EXPECT_THAT(syscall(-1), SyscallFailsWithErrno(ENOSYS)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/base_poll_test.cc b/test/syscalls/linux/base_poll_test.cc new file mode 100644 index 000000000..bba0108ea --- /dev/null +++ b/test/syscalls/linux/base_poll_test.cc @@ -0,0 +1,65 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/base_poll_test.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +static volatile int timer_fired = 0; +static void SigAlarmHandler(int, siginfo_t*, void*) { timer_fired = 1; } + +BasePollTest::BasePollTest() { + // Register our SIGALRM handler, but save the original so we can restore in + // the destructor. + struct sigaction sa = {}; + sa.sa_sigaction = SigAlarmHandler; + sigfillset(&sa.sa_mask); + TEST_PCHECK(sigaction(SIGALRM, &sa, &original_alarm_sa_) == 0); +} + +BasePollTest::~BasePollTest() { + ClearTimer(); + TEST_PCHECK(sigaction(SIGALRM, &original_alarm_sa_, nullptr) == 0); +} + +void BasePollTest::SetTimer(absl::Duration duration) { + pid_t tgid = getpid(); + pid_t tid = gettid(); + ClearTimer(); + + // Create a new timer thread. + timer_ = absl::make_unique(absl::Now() + duration, tgid, tid); +} + +bool BasePollTest::TimerFired() const { return timer_fired; } + +void BasePollTest::ClearTimer() { + timer_.reset(); + timer_fired = 0; +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/base_poll_test.h b/test/syscalls/linux/base_poll_test.h new file mode 100644 index 000000000..9b9b81933 --- /dev/null +++ b/test/syscalls/linux/base_poll_test.h @@ -0,0 +1,101 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
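The timer in base_poll_test.cc signals one specific thread, which requires tgkill(2); glibc did not wrap tgkill() or gettid() until version 2.30, so raw syscalls are the portable route, as in this sketch (helper names are placeholders):

#include <signal.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

// Raw wrappers for syscalls that older glibc does not expose.
static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); }

// Sends `sig` to thread `tid` of process `tgid`. Unlike kill(2), which is
// process-directed, this targets exactly one thread; unlike tkill(2), the
// tgid check avoids signaling a recycled thread id in another process.
static int TgKill(pid_t tgid, pid_t tid, int sig) {
  return static_cast<int>(syscall(SYS_tgkill, tgid, tid, sig));
}

// Example: interrupt the calling thread itself.
// TgKill(getpid(), GetTid(), SIGALRM);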
+ +#ifndef GVISOR_TEST_SYSCALLS_BASE_POLL_TEST_H_ +#define GVISOR_TEST_SYSCALLS_BASE_POLL_TEST_H_ + +#include +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" +#include "test/util/logging.h" +#include "test/util/signal_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +// TimerThread is a cancelable timer. +class TimerThread { + public: + TimerThread(absl::Time deadline, pid_t tgid, pid_t tid) + : thread_([=] { + mu_.Lock(); + mu_.AwaitWithDeadline(absl::Condition(&cancel_), deadline); + if (!cancel_) { + TEST_PCHECK(tgkill(tgid, tid, SIGALRM) == 0); + } + mu_.Unlock(); + }) {} + + ~TimerThread() { Cancel(); } + + void Cancel() { + absl::MutexLock ml(&mu_); + cancel_ = true; + } + + private: + mutable absl::Mutex mu_; + bool cancel_ GUARDED_BY(mu_) = false; + + // Must be last to ensure that the destructor for the thread is run before + // any other member of the object is destroyed. + ScopedThread thread_; +}; + +// Base test fixture for poll, select, ppoll, and pselect tests. +// +// This fixture makes use of SIGALRM. The handler is saved in SetUp() and +// restored in TearDown(). +class BasePollTest : public ::testing::Test { + protected: + BasePollTest(); + ~BasePollTest() override; + + // Sets a timer that will send a signal to the calling thread after + // `duration`. + void SetTimer(absl::Duration duration); + + // Returns true if the timer has fired. + bool TimerFired() const; + + // Stops the pending timer (if any) and clear the "fired" state. + void ClearTimer(); + + private: + // Thread that implements the timer. If the timer is stopped, timer_ is null. + // + // We have to use a thread for this purpose because tests using this fixture + // expect to be interrupted by the timer signal, but itimers/alarm(2) send + // thread-group-directed signals, which may be handled by any thread in the + // test process. + std::unique_ptr timer_; + + // The original SIGALRM handler, to restore in destructor. + struct sigaction original_alarm_sa_; +}; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_BASE_POLL_TEST_H_ diff --git a/test/syscalls/linux/bind.cc b/test/syscalls/linux/bind.cc new file mode 100644 index 000000000..354e8e53c --- /dev/null +++ b/test/syscalls/linux/bind.cc @@ -0,0 +1,146 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
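A hypothetical use of the BasePollTest fixture, to make the intended flow concrete; the test name and the bare poll(2) call are assumptions for illustration, not part of this change. SetTimer() arms the background TimerThread, the blocking call is interrupted by SIGALRM, and TimerFired() confirms the wakeup came from the timer rather than from ready fds:

#include <poll.h>
#include <cerrno>

#include "test/syscalls/linux/base_poll_test.h"

namespace gvisor {
namespace testing {
namespace {

TEST_F(BasePollTest, HypotheticalBlocksUntilTimer) {
  SetTimer(absl::Milliseconds(100));

  // No fds and an infinite timeout: poll() can only return via a signal.
  EXPECT_EQ(poll(nullptr, 0, /*timeout=*/-1), -1);
  EXPECT_EQ(errno, EINTR);
  EXPECT_TRUE(TimerFired());
}

}  // namespace
}  // namespace testing
}  // namespace gvisor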
+ +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST_P(AllSocketPairTest, Bind) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, BindTooLong) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + // first_addr is a sockaddr_storage being used as a sockaddr_un. Use the full + // length which is longer than expected for a Unix socket. + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sizeof(sockaddr_storage)), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, DoubleBindSocket) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + EXPECT_THAT( + bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched + // to EADDRINUSE. + AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL))); +} + +TEST_P(AllSocketPairTest, GetLocalAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + socklen_t addressLength = sockets->first_addr_size(); + struct sockaddr_storage address = {}; + ASSERT_THAT(getsockname(sockets->first_fd(), (struct sockaddr*)(&address), + &addressLength), + SyscallSucceeds()); + EXPECT_EQ( + 0, memcmp(&address, sockets->first_addr(), sockets->first_addr_size())); +} + +TEST_P(AllSocketPairTest, GetLocalAddrWithoutBind) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + socklen_t addressLength = sockets->first_addr_size(); + struct sockaddr_storage received_address = {}; + ASSERT_THAT( + getsockname(sockets->first_fd(), (struct sockaddr*)(&received_address), + &addressLength), + SyscallSucceeds()); + struct sockaddr_storage want_address = {}; + want_address.ss_family = sockets->first_addr()->sa_family; + EXPECT_EQ(0, memcmp(&received_address, &want_address, addressLength)); +} + +TEST_P(AllSocketPairTest, GetRemoteAddressWithoutConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + socklen_t addressLength = sockets->first_addr_size(); + struct sockaddr_storage address = {}; + ASSERT_THAT(getpeername(sockets->second_fd(), (struct sockaddr*)(&address), + &addressLength), + SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(AllSocketPairTest, DoubleBindAddress) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + EXPECT_THAT(bind(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallFailsWithErrno(EADDRINUSE)); +} + +TEST_P(AllSocketPairTest, Unbind) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + + // Filesystem Unix sockets do not release their address when closed. 
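The Unbind case that follows branches on the first byte of the bound address: Linux abstract-namespace Unix sockets carry a leading NUL in sun_path and are released automatically on close(2), while filesystem sockets keep their name until unlink(2). A sketch of building both address forms, with placeholder helper names:

#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

// Filesystem-bound address: a NUL-terminated path; the socket file persists
// (and keeps the name busy) until unlink(2), even after close(2).
sockaddr_un FilesystemAddr(const char* path) {
  sockaddr_un addr = {};
  addr.sun_family = AF_UNIX;
  strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
  return addr;
}

// Abstract-namespace address: sun_path[0] == '\0' and the name is the
// remaining bytes; the kernel releases it automatically on close(2).
sockaddr_un AbstractAddr(const char* name) {
  sockaddr_un addr = {};
  addr.sun_family = AF_UNIX;
  addr.sun_path[0] = '\0';
  strncpy(addr.sun_path + 1, name, sizeof(addr.sun_path) - 2);
  return addr;
}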
+ if (sockets->first_addr()->sa_data[0] != 0) { + ASSERT_THAT(bind(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallFailsWithErrno(EADDRINUSE)); + return; + } + + ASSERT_THAT(bind(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(VecCat( + ApplyVec( + FilesystemUnboundUnixDomainSocketPair, + AllBitwiseCombinations( + List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractUnboundUnixDomainSocketPair, + AllBitwiseCombinations( + List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, List{0, SOCK_CLOEXEC}))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/brk.cc b/test/syscalls/linux/brk.cc new file mode 100644 index 000000000..33d353959 --- /dev/null +++ b/test/syscalls/linux/brk.cc @@ -0,0 +1,31 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST(BrkTest, BrkSyscallReturnsOldBrkOnFailure) { + auto old_brk = sbrk(0); + EXPECT_THAT(syscall(SYS_brk, reinterpret_cast(-1)), + SyscallSucceedsWithValue(reinterpret_cast(old_brk))); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/chdir.cc b/test/syscalls/linux/chdir.cc new file mode 100644 index 000000000..4905ffb23 --- /dev/null +++ b/test/syscalls/linux/chdir.cc @@ -0,0 +1,69 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(ChdirTest, Success) { + auto old_dir = GetAbsoluteTestTmpdir(); + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chdir(temp_dir.path().c_str()), SyscallSucceeds()); + // Temp path destructor deletes the newly created tmp dir and Sentry rejects + // saving when its current dir is still pointing to the path. Switch to a + // permanent path here. 
+ EXPECT_THAT(chdir(old_dir.c_str()), SyscallSucceeds()); +} + +TEST(ChdirTest, PermissionDenied) { + // Drop capabilities that allow us to override directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0666 /* mode */)); + EXPECT_THAT(chdir(temp_dir.path().c_str()), SyscallFailsWithErrno(EACCES)); +} + +TEST(ChdirTest, NotDir) { + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + EXPECT_THAT(chdir(temp_file.path().c_str()), SyscallFailsWithErrno(ENOTDIR)); +} + +TEST(ChdirTest, NameTooLong) { + std::string name(NAME_MAX + 1, 'a'); + ASSERT_THAT(chdir(name.c_str()), SyscallFailsWithErrno(ENAMETOOLONG)); +} + +TEST(ChdirTest, NotExist) { + EXPECT_THAT(chdir("/foo/bar"), SyscallFailsWithErrno(ENOENT)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc new file mode 100644 index 000000000..b7fc17946 --- /dev/null +++ b/test/syscalls/linux/chmod.cc @@ -0,0 +1,262 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(ChmodTest, ChmodFileSucceeds) { + // Drop capabilities that allow us to override file permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + ASSERT_THAT(chmod(file.path().c_str(), 0466), SyscallSucceeds()); + EXPECT_THAT(open(file.path().c_str(), O_RDWR), SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, ChmodDirSucceeds) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string fileInDir = NewTempAbsPathInDir(dir.path()); + + ASSERT_THAT(chmod(dir.path().c_str(), 0466), SyscallSucceeds()); + EXPECT_THAT(open(fileInDir.c_str(), O_RDONLY), SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, FchmodFileSucceeds_NoRandomSave) { + // Drop capabilities that allow us to file directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666)); + int fd; + ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR), SyscallSucceeds()); + + { + const DisableSave ds; // File permissions are reduced. 
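Several of the chmod cases drop CAP_DAC_OVERRIDE (and CAP_DAC_READ_SEARCH) first so that mode bits are actually enforced even when the tests run as root; they do this through the SetCapability() helper. An equivalent raw libcap sequence, shown only for illustration:

#include <sys/capability.h>  // libcap; link with -lcap.

// Clears CAP_DAC_OVERRIDE from the effective set so that permission bits are
// checked even for a root-owned process.
bool DropDacOverride() {
  cap_t caps = cap_get_proc();
  if (caps == nullptr) return false;
  cap_value_t cap = CAP_DAC_OVERRIDE;
  bool ok = cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap, CAP_CLEAR) == 0 &&
            cap_set_proc(caps) == 0;
  cap_free(caps);
  return ok;
}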
+ ASSERT_THAT(fchmod(fd, 0444), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + } + + EXPECT_THAT(open(file.path().c_str(), O_RDWR), SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, FchmodDirSucceeds_NoRandomSave) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + int fd; + ASSERT_THAT(fd = open(dir.path().c_str(), O_RDONLY | O_DIRECTORY), + SyscallSucceeds()); + + { + const DisableSave ds; // File permissions are reduced. + ASSERT_THAT(fchmod(fd, 0), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + } + + EXPECT_THAT(open(dir.path().c_str(), O_RDONLY), + SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, FchmodBadF) { + ASSERT_THAT(fchmod(-1, 0444), SyscallFailsWithErrno(EBADF)); +} + +TEST(ChmodTest, FchmodatBadF) { + ASSERT_THAT(fchmodat(-1, "foo", 0444, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST(ChmodTest, FchmodatNotDir) { + ASSERT_THAT(fchmodat(-1, "", 0444, 0), SyscallFailsWithErrno(ENOENT)); +} + +TEST(ChmodTest, FchmodatFileAbsolutePath) { + // Drop capabilities that allow us to override file permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + ASSERT_THAT(fchmodat(-1, file.path().c_str(), 0444, 0), SyscallSucceeds()); + EXPECT_THAT(open(file.path().c_str(), O_RDWR), SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, FchmodatDirAbsolutePath) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + int fd; + ASSERT_THAT(fd = open(dir.path().c_str(), O_RDONLY | O_DIRECTORY), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + ASSERT_THAT(fchmodat(-1, dir.path().c_str(), 0, 0), SyscallSucceeds()); + EXPECT_THAT(open(dir.path().c_str(), O_RDONLY), + SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, FchmodatFile) { + // Drop capabilities that allow us to override file permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + int parent_fd; + ASSERT_THAT( + parent_fd = open(GetAbsoluteTestTmpdir().c_str(), O_RDONLY | O_DIRECTORY), + SyscallSucceeds()); + + ASSERT_THAT( + fchmodat(parent_fd, std::string(Basename(temp_file.path())).c_str(), 0444, 0), + SyscallSucceeds()); + EXPECT_THAT(close(parent_fd), SyscallSucceeds()); + + EXPECT_THAT(open(temp_file.path().c_str(), O_RDWR), + SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, FchmodatDir) { + // Drop capabilities that allow us to override file and directory permissions. 
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + int parent_fd; + ASSERT_THAT( + parent_fd = open(GetAbsoluteTestTmpdir().c_str(), O_RDONLY | O_DIRECTORY), + SyscallSucceeds()); + + int fd; + ASSERT_THAT(fd = open(dir.path().c_str(), O_RDONLY | O_DIRECTORY), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + ASSERT_THAT(fchmodat(parent_fd, std::string(Basename(dir.path())).c_str(), 0, 0), + SyscallSucceeds()); + EXPECT_THAT(close(parent_fd), SyscallSucceeds()); + + EXPECT_THAT(open(dir.path().c_str(), O_RDONLY | O_DIRECTORY), + SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, ChmodDowngradeWritability_NoRandomSave) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666)); + + int fd; + ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR), SyscallSucceeds()); + + const DisableSave ds; // Permissions are dropped. + ASSERT_THAT(chmod(file.path().c_str(), 0444), SyscallSucceeds()); + EXPECT_THAT(write(fd, "hello", 5), SyscallSucceedsWithValue(5)); + + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(ChmodTest, ChmodFileToNoPermissionsSucceeds) { + // Drop capabilities that allow us to override file permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666)); + + ASSERT_THAT(chmod(file.path().c_str(), 0), SyscallSucceeds()); + + EXPECT_THAT(open(file.path().c_str(), O_RDONLY), + SyscallFailsWithErrno(EACCES)); +} + +TEST(ChmodTest, FchmodDowngradeWritability_NoRandomSave) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + int fd; + ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + + const DisableSave ds; // Permissions are dropped. + ASSERT_THAT(fchmod(fd, 0444), SyscallSucceeds()); + EXPECT_THAT(write(fd, "hello", 5), SyscallSucceedsWithValue(5)); + + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(ChmodTest, FchmodFileToNoPermissionsSucceeds_NoRandomSave) { + // Drop capabilities that allow us to override file permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666)); + + int fd; + ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR), SyscallSucceeds()); + + { + const DisableSave ds; // Permissions are dropped. + ASSERT_THAT(fchmod(fd, 0), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + } + + EXPECT_THAT(open(file.path().c_str(), O_RDONLY), + SyscallFailsWithErrno(EACCES)); +} + +// Verify that we can get a RW FD after chmod, even if a RO fd is left open. +TEST(ChmodTest, ChmodWritableWithOpenFD) { + // FIXME: broken on hostfs. + if (IsRunningOnGvisor()) { + return; + } + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0444)); + + FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + ASSERT_THAT(fchmod(fd1.get(), 0644), SyscallSucceeds()); + + // This FD is writable, even though fd1 has a read-only reference to the file. + FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + // fd1 is not writable, but fd2 is. 
+ char c = 'a'; + EXPECT_THAT(WriteFd(fd1.get(), &c, 1), SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(WriteFd(fd2.get(), &c, 1), SyscallSucceedsWithValue(1)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc new file mode 100644 index 000000000..aa1df05b1 --- /dev/null +++ b/test/syscalls/linux/chown.cc @@ -0,0 +1,200 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/synchronization/notification.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_int32(scratch_uid1, 65534, "first scratch UID"); +DEFINE_int32(scratch_uid2, 65533, "second scratch UID"); +DEFINE_int32(scratch_gid, 65534, "first scratch GID"); + +namespace gvisor { +namespace testing { + +namespace { + +TEST(ChownTest, FchownBadF) { + ASSERT_THAT(fchown(-1, 0, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST(ChownTest, FchownatBadF) { + ASSERT_THAT(fchownat(-1, "fff", 0, 0, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST(ChownTest, FchownatEmptyPath) { + const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY | O_RDONLY)); + ASSERT_THAT(fchownat(fd.get(), "", 0, 0, 0), SyscallFailsWithErrno(ENOENT)); +} + +using Chown = + std::function; + +class ChownParamTest : public ::testing::TestWithParam {}; + +TEST_P(ChownParamTest, ChownFileSucceeds) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_CHOWN))) { + ASSERT_NO_ERRNO(SetCapability(CAP_CHOWN, false)); + } + + const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // At least *try* setting to a group other than the EGID. + gid_t gid; + EXPECT_THAT(gid = getegid(), SyscallSucceeds()); + int num_groups; + EXPECT_THAT(num_groups = getgroups(0, nullptr), SyscallSucceeds()); + if (num_groups > 0) { + std::vector list(num_groups); + EXPECT_THAT(getgroups(list.size(), list.data()), SyscallSucceeds()); + gid = list[0]; + } + + EXPECT_NO_ERRNO(GetParam()(file.path(), geteuid(), gid)); + + struct stat s = {}; + ASSERT_THAT(stat(file.path().c_str(), &s), SyscallSucceeds()); + EXPECT_EQ(s.st_uid, geteuid()); + EXPECT_EQ(s.st_gid, gid); +} + +TEST_P(ChownParamTest, ChownFilePermissionDenied) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); + + const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0777)); + + // Drop privileges and change IDs only in child thread, or else this parent + // thread won't be able to open some log files after the test ends. + ScopedThread([&] { + // Drop privileges. 
+ if (HaveCapability(CAP_CHOWN).ValueOrDie()) { + EXPECT_NO_ERRNO(SetCapability(CAP_CHOWN, false)); + } + + // Change EUID and EGID. + // + // See note about POSIX below. + EXPECT_THAT(syscall(SYS_setresgid, -1, FLAGS_scratch_gid, -1), + SyscallSucceeds()); + EXPECT_THAT(syscall(SYS_setresuid, -1, FLAGS_scratch_uid1, -1), + SyscallSucceeds()); + + EXPECT_THAT(GetParam()(file.path(), geteuid(), getegid()), + PosixErrorIs(EPERM, ::testing::ContainsRegex("chown"))); + }); +} + +TEST_P(ChownParamTest, ChownFileSucceedsAsRoot) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_CHOWN)))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_SETUID)))); + + const std::string filename = NewTempAbsPath(); + + absl::Notification fileCreated, fileChowned; + // Change UID only in child thread, or else this parent thread won't be able + // to open some log files after the test ends. + ScopedThread t([&] { + // POSIX requires that all threads in a process share the same UIDs, so + // the NPTL setresuid wrappers use signals to make all threads execute the + // setresuid syscall. However, we want this thread to have its own set of + // credentials different from the parent process, so we use the raw + // syscall. + EXPECT_THAT(syscall(SYS_setresuid, -1, FLAGS_scratch_uid2, -1), + SyscallSucceeds()); + + // Create file and immediately close it. + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT | O_RDWR, 0644)); + fd.reset(); // Close the fd. + + fileCreated.Notify(); + fileChowned.WaitForNotification(); + + EXPECT_THAT(open(filename.c_str(), O_RDWR), SyscallFailsWithErrno(EACCES)); + FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_RDONLY)); + }); + + fileCreated.WaitForNotification(); + + // Set file's owners to someone different. 
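The raw SYS_setresgid/SYS_setresuid calls in the threads above are deliberate: the glibc wrappers broadcast the credential change to every thread to satisfy POSIX's process-wide rule, while the bare syscall affects only the calling task. A sketch of the distinction, with placeholder helper names:

#include <sys/syscall.h>
#include <unistd.h>

// Process-wide: glibc's setresuid() uses a signal-based broadcast so that
// every thread in the process ends up with the new credentials.
int SetEuidAllThreads(uid_t euid) {
  return setresuid(/*ruid=*/-1, euid, /*suid=*/-1);
}

// This thread only: the raw syscall changes the kernel credentials of the
// calling task and leaves sibling threads untouched.
int SetEuidThisThreadOnly(uid_t euid) {
  return static_cast<int>(syscall(SYS_setresuid, -1, euid, -1));
}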
+ EXPECT_NO_ERRNO(GetParam()(filename, FLAGS_scratch_uid1, FLAGS_scratch_gid)); + + struct stat s; + EXPECT_THAT(stat(filename.c_str(), &s), SyscallSucceeds()); + EXPECT_EQ(s.st_uid, FLAGS_scratch_uid1); + EXPECT_EQ(s.st_gid, FLAGS_scratch_gid); + + fileChowned.Notify(); +} + +PosixError errorFromReturn(const std::string& name, int ret) { + if (ret == -1) { + return PosixError(errno, absl::StrCat(name, " failed")); + } + return NoError(); +} + +INSTANTIATE_TEST_CASE_P( + ChownKinds, ChownParamTest, + ::testing::Values( + [](const std::string& path, uid_t owner, gid_t group) -> PosixError { + int rc = chown(path.c_str(), owner, group); + MaybeSave(); + return errorFromReturn("chown", rc); + }, + [](const std::string& path, uid_t owner, gid_t group) -> PosixError { + int rc = lchown(path.c_str(), owner, group); + MaybeSave(); + return errorFromReturn("lchown", rc); + }, + [](const std::string& path, uid_t owner, gid_t group) -> PosixError { + ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, O_RDWR)); + int rc = fchown(fd.get(), owner, group); + MaybeSave(); + return errorFromReturn("fchown", rc); + }, + [](const std::string& path, uid_t owner, gid_t group) -> PosixError { + ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, O_RDWR)); + int rc = fchownat(fd.get(), "", owner, group, AT_EMPTY_PATH); + MaybeSave(); + return errorFromReturn("fchownat-fd", rc); + }, + [](const std::string& path, uid_t owner, gid_t group) -> PosixError { + ASSIGN_OR_RETURN_ERRNO( + auto dirfd, Open(std::string(Dirname(path)), O_DIRECTORY | O_RDONLY)); + int rc = fchownat(dirfd.get(), std::string(Basename(path)).c_str(), owner, + group, 0); + MaybeSave(); + return errorFromReturn("fchownat-dirfd", rc); + })); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc new file mode 100644 index 000000000..f921f9025 --- /dev/null +++ b/test/syscalls/linux/chroot.cc @@ -0,0 +1,364 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/mount_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +using ::testing::HasSubstr; +using ::testing::Not; + +namespace gvisor { +namespace testing { + +namespace { + +TEST(ChrootTest, Success) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds()); +} + +TEST(ChrootTest, PermissionDenied) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + // CAP_DAC_READ_SEARCH and CAP_DAC_OVERRIDE may override Execute permission on + // directories. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0666 /* mode */)); + EXPECT_THAT(chroot(temp_dir.path().c_str()), SyscallFailsWithErrno(EACCES)); +} + +TEST(ChrootTest, NotDir) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + EXPECT_THAT(chroot(temp_file.path().c_str()), SyscallFailsWithErrno(ENOTDIR)); +} + +TEST(ChrootTest, NotExist) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + EXPECT_THAT(chroot("/foo/bar"), SyscallFailsWithErrno(ENOENT)); +} + +TEST(ChrootTest, WithoutCapability) { + // Unset CAP_SYS_CHROOT. + ASSERT_NO_ERRNO(SetCapability(CAP_SYS_CHROOT, false)); + + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chroot(temp_dir.path().c_str()), SyscallFailsWithErrno(EPERM)); +} + +TEST(ChrootTest, CreatesNewRoot) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + // Grab the initial cwd. + char initial_cwd[1024]; + ASSERT_THAT(syscall(__NR_getcwd, initial_cwd, sizeof(initial_cwd)), + SyscallSucceeds()); + + auto new_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto file_in_new_root = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(new_root.path())); + + // chroot into new_root. + ASSERT_THAT(chroot(new_root.path().c_str()), SyscallSucceeds()); + + // getcwd should return "(unreachable)" followed by the initial_cwd. + char cwd[1024]; + ASSERT_THAT(syscall(__NR_getcwd, cwd, sizeof(cwd)), SyscallSucceeds()); + std::string expected_cwd = "(unreachable)"; + expected_cwd += initial_cwd; + EXPECT_STREQ(cwd, expected_cwd.c_str()); + + // Should not be able to stat file by its full path. + struct stat statbuf; + EXPECT_THAT(stat(file_in_new_root.path().c_str(), &statbuf), + SyscallFailsWithErrno(ENOENT)); + + // Should be able to stat file at new rooted path. + auto basename = std::string(Basename(file_in_new_root.path())); + auto rootedFile = "/" + basename; + ASSERT_THAT(stat(rootedFile.c_str(), &statbuf), SyscallSucceeds()); + + // Should be able to stat cwd at '.' even though it's outside root. + ASSERT_THAT(stat(".", &statbuf), SyscallSucceeds()); + + // chdir into new root. 
+ ASSERT_THAT(chdir("/"), SyscallSucceeds()); + + // getcwd should return "/". + EXPECT_THAT(syscall(__NR_getcwd, cwd, sizeof(cwd)), SyscallSucceeds()); + EXPECT_STREQ(cwd, "/"); + + // Statting '.', '..', '/', and '/..' all return the same dev and inode. + struct stat statbuf_dot; + ASSERT_THAT(stat(".", &statbuf_dot), SyscallSucceeds()); + struct stat statbuf_dotdot; + ASSERT_THAT(stat("..", &statbuf_dotdot), SyscallSucceeds()); + EXPECT_EQ(statbuf_dot.st_dev, statbuf_dotdot.st_dev); + EXPECT_EQ(statbuf_dot.st_ino, statbuf_dotdot.st_ino); + struct stat statbuf_slash; + ASSERT_THAT(stat("/", &statbuf_slash), SyscallSucceeds()); + EXPECT_EQ(statbuf_dot.st_dev, statbuf_slash.st_dev); + EXPECT_EQ(statbuf_dot.st_ino, statbuf_slash.st_ino); + struct stat statbuf_slashdotdot; + ASSERT_THAT(stat("/..", &statbuf_slashdotdot), SyscallSucceeds()); + EXPECT_EQ(statbuf_dot.st_dev, statbuf_slashdotdot.st_dev); + EXPECT_EQ(statbuf_dot.st_ino, statbuf_slashdotdot.st_ino); +} + +TEST(ChrootTest, DotDotFromOpenFD) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + auto dir_outside_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(dir_outside_root.path(), O_RDONLY | O_DIRECTORY)); + auto new_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // chroot into new_root. + ASSERT_THAT(chroot(new_root.path().c_str()), SyscallSucceeds()); + + // openat on fd with path .. will succeed. + int other_fd; + ASSERT_THAT(other_fd = openat(fd.get(), "..", O_RDONLY), SyscallSucceeds()); + EXPECT_THAT(close(other_fd), SyscallSucceeds()); + + // getdents on fd should not error. + char buf[1024]; + ASSERT_THAT(syscall(SYS_getdents, fd.get(), buf, sizeof(buf)), + SyscallSucceeds()); +} + +// Test that link resolution in a chroot can escape the root by following an +// open proc fd. +TEST(ChrootTest, ProcFdLinkResolutionInChroot) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + const TempPath file_outside_chroot = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file_outside_chroot.path(), O_RDONLY)); + + const FileDescriptor proc_fd = ASSERT_NO_ERRNO_AND_VALUE( + Open("/proc", O_DIRECTORY | O_RDONLY | O_CLOEXEC)); + + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds()); + + // Opening relative to an already open fd to a node outside the chroot works. + const FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE( + OpenAt(proc_fd.get(), "self/fd", O_DIRECTORY | O_RDONLY | O_CLOEXEC)); + + // Proc fd symlinks can escape the chroot if the fd the symlink refers to + // refers to an object outside the chroot. + struct stat s = {}; + EXPECT_THAT( + fstatat(proc_self_fd.get(), absl::StrCat(fd.get()).c_str(), &s, 0), + SyscallSucceeds()); + + // Try to stat the stdin fd. Internally, this is handled differently from a + // proc fd entry pointing to a file, since stdin is backed by a host fd, and + // isn't a walkable path on the filesystem inside the sandbox. + EXPECT_THAT(fstatat(proc_self_fd.get(), "0", &s, 0), SyscallSucceeds()); +} + +// This test will verify that when you hold a fd to proc before entering +// a chroot that any files inside the chroot will appear rooted to the +// base chroot when examining /proc/self/fd/{num}. 
+TEST(ChrootTest, ProcMemSelfFdsNoEscapeProcOpen) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + // Get a FD to /proc before we enter the chroot. + const FileDescriptor proc = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY)); + + // Create and enter a chroot directory. + const auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds()); + + // Open a file inside the chroot at /foo. + const FileDescriptor foo = + ASSERT_NO_ERRNO_AND_VALUE(Open("/foo", O_CREAT | O_RDONLY, 0644)); + + // Examine /proc/self/fd/{foo_fd} to see if it exposes the fact that we're + // inside a chroot, the path should be /foo and NOT {chroot_dir}/foo. + const std::string fd_path = absl::StrCat("self/fd/", foo.get()); + char buf[1024] = {}; + size_t bytes_read = 0; + ASSERT_THAT(bytes_read = + readlinkat(proc.get(), fd_path.c_str(), buf, sizeof(buf) - 1), + SyscallSucceeds()); + + // The link should resolve to something. + ASSERT_GT(bytes_read, 0); + + // Assert that the link doesn't contain the chroot path and is only /foo. + EXPECT_STREQ(buf, "/foo"); +} + +// This test will verify that a file inside a chroot when mmapped will not +// expose the full file path via /proc/self/maps and instead honor the chroot. +TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + + // Get a FD to /proc before we enter the chroot. + const FileDescriptor proc = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY)); + + // Create and enter a chroot directory. + const auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds()); + + // Open a file inside the chroot at /foo. + const FileDescriptor foo = + ASSERT_NO_ERRNO_AND_VALUE(Open("/foo", O_CREAT | O_RDONLY, 0644)); + + // Mmap the newly created file. + void* foo_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + foo.get(), 0); + ASSERT_THAT(reinterpret_cast(foo_map), SyscallSucceeds()); + + // Always unmap. + auto cleanup_map = Cleanup( + [&] { EXPECT_THAT(munmap(foo_map, kPageSize), SyscallSucceeds()); }); + + // Examine /proc/self/maps to be sure that /foo doesn't appear to be + // mapped with the full chroot path. + const FileDescriptor maps = + ASSERT_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), "self/maps", O_RDONLY)); + + size_t bytes_read = 0; + char buf[8 * 1024] = {}; + ASSERT_THAT(bytes_read = ReadFd(maps.get(), buf, sizeof(buf)), + SyscallSucceeds()); + + // The maps file should have something. + ASSERT_GT(bytes_read, 0); + + // Finally we want to make sure the maps don't contain the chroot path + ASSERT_EQ(std::string(buf, bytes_read).find(temp_dir.path()), std::string::npos); +} + +// Test that mounts outside the chroot will not appear in /proc/self/mounts or +// /proc/self/mountinfo. +TEST(ChrootTest, ProcMountsMountinfoNoEscape) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // We are going to create some mounts and then chroot. In order to be able to + // unmount the mounts after the test run, we must chdir to the root and use + // relative paths for all mounts. That way, as long as we never chdir into + // the new root, we can access the mounts via relative paths and unmount them. + ASSERT_THAT(chdir("/"), SyscallSucceeds()); + + // Create nested tmpfs mounts. Note the use of relative paths in Mount calls. 
+ auto const outer_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const outer_mount = ASSERT_NO_ERRNO_AND_VALUE(Mount( + "none", JoinPath(".", outer_dir.path()), "tmpfs", 0, "mode=0700", 0)); + + auto const inner_dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(outer_dir.path())); + auto const inner_mount = ASSERT_NO_ERRNO_AND_VALUE(Mount( + "none", JoinPath(".", inner_dir.path()), "tmpfs", 0, "mode=0700", 0)); + + // Filenames that will be checked for mounts, all relative to /proc dir. + std::string paths[3] = {"mounts", "self/mounts", "self/mountinfo"}; + + for (const std::string& path : paths) { + // We should have both inner and outer mounts. + const std::string contents = + ASSERT_NO_ERRNO_AND_VALUE(GetContents(JoinPath("/proc", path))); + EXPECT_THAT(contents, AllOf(HasSubstr(outer_dir.path()), + HasSubstr(inner_dir.path()))); + // We better have at least two mounts: the mounts we created plus the root. + std::vector submounts = + absl::StrSplit(contents, '\n', absl::SkipWhitespace()); + EXPECT_GT(submounts.size(), 2); + } + + // Get a FD to /proc before we enter the chroot. + const FileDescriptor proc = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY)); + + // Chroot to outer mount. + ASSERT_THAT(chroot(outer_dir.path().c_str()), SyscallSucceeds()); + + for (const std::string& path : paths) { + const FileDescriptor proc_file = + ASSERT_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), path, O_RDONLY)); + + // Only two mounts visible from this chroot: the inner and outer. Both + // paths should be relative to the new chroot. + const std::string contents = + ASSERT_NO_ERRNO_AND_VALUE(GetContentsFD(proc_file.get())); + EXPECT_THAT(contents, + AllOf(HasSubstr(absl::StrCat(Basename(inner_dir.path()))), + Not(HasSubstr(outer_dir.path())), + Not(HasSubstr(inner_dir.path())))); + std::vector submounts = + absl::StrSplit(contents, '\n', absl::SkipWhitespace()); + EXPECT_EQ(submounts.size(), 2); + } + + // Chroot to inner mount. We must use an absolute path accessible to our + // chroot. + const std::string inner_dir_basename = + absl::StrCat("/", Basename(inner_dir.path())); + ASSERT_THAT(chroot(inner_dir_basename.c_str()), SyscallSucceeds()); + + for (const std::string& path : paths) { + const FileDescriptor proc_file = + ASSERT_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), path, O_RDONLY)); + const std::string contents = + ASSERT_NO_ERRNO_AND_VALUE(GetContentsFD(proc_file.get())); + + // Only the inner mount visible from this chroot. + std::vector submounts = + absl::StrSplit(contents, '\n', absl::SkipWhitespace()); + EXPECT_EQ(submounts.size(), 1); + } + + // Chroot back to ".". + ASSERT_THAT(chroot("."), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/clock_getres.cc b/test/syscalls/linux/clock_getres.cc new file mode 100644 index 000000000..8f8842299 --- /dev/null +++ b/test/syscalls/linux/clock_getres.cc @@ -0,0 +1,37 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <sys/time.h>
+#include <time.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// clock_getres works regardless of whether or not a timespec is passed.
+TEST(ClockGetres, Timespec) {
+  struct timespec ts;
+  EXPECT_THAT(clock_getres(CLOCK_MONOTONIC, &ts), SyscallSucceeds());
+  EXPECT_THAT(clock_getres(CLOCK_MONOTONIC, nullptr), SyscallSucceeds());
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
new file mode 100644
index 000000000..5003928be
--- /dev/null
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -0,0 +1,156 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pthread.h>
+#include <time.h>
+
+#include <cstdint>
+#include <list>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+int64_t clock_gettime_nsecs(clockid_t id) {
+  struct timespec ts;
+  TEST_PCHECK(clock_gettime(id, &ts) == 0);
+  return (ts.tv_sec * 1000000000 + ts.tv_nsec);
+}
+
+// Spin on the CPU for at least ns nanoseconds, based on
+// CLOCK_THREAD_CPUTIME_ID.
+void spin_ns(int64_t ns) {
+  int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
+  int64_t end = start + ns;
+
+  do {
+    constexpr int kLoopCount = 1000000;  // large and arbitrary
+    // volatile to prevent the compiler from skipping this loop.
+    for (volatile int i = 0; i < kLoopCount; i++) {
+    }
+  } while (clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID) < end);
+}
+
+// Test that CLOCK_PROCESS_CPUTIME_ID is a superset of CLOCK_THREAD_CPUTIME_ID.
+TEST(ClockGettime, CputimeId) {
+  constexpr int kNumThreads = 13;  // arbitrary
+
+  absl::Duration spin_time = absl::Seconds(1);
+
+  // Start off the worker threads and compute the aggregate time spent by
+  // the workers. Note that we test CLOCK_PROCESS_CPUTIME_ID by having the
+  // workers execute in parallel and verifying that CLOCK_PROCESS_CPUTIME_ID
+  // accumulates the runtime of all threads.
+  int64_t start = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+
+  // Create kNumThreads threads.
+  std::list<ScopedThread> threads;
+  for (int i = 0; i < kNumThreads; i++) {
+    threads.emplace_back(
+        [spin_time] { spin_ns(absl::ToInt64Nanoseconds(spin_time)); });
+  }
+  for (auto& t : threads) {
+    t.Join();
+  }
+
+  int64_t end = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+
+  // The aggregate time spent in the worker threads must be at least
+  // 'kNumThreads' times the time each thread spun.
+  ASSERT_GE(end - start, kNumThreads * absl::ToInt64Nanoseconds(spin_time));
+}
+
+TEST(ClockGettime, JavaThreadTime) {
+  clockid_t clockid;
+  ASSERT_EQ(0, pthread_getcpuclockid(pthread_self(), &clockid));
+  struct timespec tp;
+  ASSERT_THAT(clock_getres(clockid, &tp), SyscallSucceeds());
+  ASSERT_THAT(clock_gettime(clockid, &tp), SyscallSucceeds());
+  EXPECT_TRUE(tp.tv_sec > 0 || tp.tv_nsec > 0);
+}
+
+// There is not much to test here, since CLOCK_REALTIME may be discontiguous.
+TEST(ClockGettime, RealtimeWorks) {
+  struct timespec tp;
+  EXPECT_THAT(clock_gettime(CLOCK_REALTIME, &tp), SyscallSucceeds());
+}
+
+class MonotonicClockTest : public ::testing::TestWithParam<clockid_t> {};
+
+TEST_P(MonotonicClockTest, IsMonotonic) {
+  auto end = absl::Now() + absl::Seconds(5);
+
+  struct timespec tp;
+  EXPECT_THAT(clock_gettime(GetParam(), &tp), SyscallSucceeds());
+
+  auto prev = absl::TimeFromTimespec(tp);
+  while (absl::Now() < end) {
+    EXPECT_THAT(clock_gettime(GetParam(), &tp), SyscallSucceeds());
+    auto now = absl::TimeFromTimespec(tp);
+    EXPECT_GE(now, prev);
+    prev = now;
+  }
+}
+
+std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) {
+  switch (info.param) {
+    case CLOCK_MONOTONIC:
+      return "CLOCK_MONOTONIC";
+    case CLOCK_MONOTONIC_COARSE:
+      return "CLOCK_MONOTONIC_COARSE";
+    case CLOCK_MONOTONIC_RAW:
+      return "CLOCK_MONOTONIC_RAW";
+    default:
+      return absl::StrCat(info.param);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(ClockGettime, MonotonicClockTest,
+                        ::testing::Values(CLOCK_MONOTONIC,
+                                          CLOCK_MONOTONIC_COARSE,
+                                          CLOCK_MONOTONIC_RAW),
+                        PrintClockId);
+
+TEST(ClockGettime, UnimplementedReturnsEINVAL) {
+  SKIP_IF(!IsRunningOnGvisor());
+
+  struct timespec tp;
+  EXPECT_THAT(clock_gettime(CLOCK_BOOTTIME, &tp),
+              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(clock_gettime(CLOCK_REALTIME_ALARM, &tp),
+              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(clock_gettime(CLOCK_BOOTTIME_ALARM, &tp),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(ClockGettime, InvalidClockIDReturnsEINVAL) {
+  struct timespec tp;
+  EXPECT_THAT(clock_gettime(-1, &tp), SyscallFailsWithErrno(EINVAL));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/clock_nanosleep.cc b/test/syscalls/linux/clock_nanosleep.cc
new file mode 100644
index 000000000..96bb961b4
--- /dev/null
+++ b/test/syscalls/linux/clock_nanosleep.cc
@@ -0,0 +1,153 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cerrno>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "test/util/cleanup.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// sys_clock_nanosleep is defined because the glibc clock_nanosleep returns
+// error numbers directly and does not set errno.
This makes our Syscall +// matchers look a little weird when expecting failure: +// "SyscallSucceedsWithValue(ERRNO)". +int sys_clock_nanosleep(clockid_t clkid, int flags, + const struct timespec* request, + struct timespec* remain) { + return syscall(SYS_clock_nanosleep, clkid, flags, request, remain); +} + +PosixErrorOr GetTime(clockid_t clk) { + struct timespec ts = {}; + int rc = clock_gettime(clk, &ts); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "clock_gettime"); + } + return absl::TimeFromTimespec(ts); +} + +class WallClockNanosleepTest : public ::testing::TestWithParam {}; + +TEST_P(WallClockNanosleepTest, InvalidValues) { + const struct timespec invalid[] = { + {.tv_sec = -1, .tv_nsec = -1}, {.tv_sec = 0, .tv_nsec = INT32_MIN}, + {.tv_sec = 0, .tv_nsec = INT32_MAX}, {.tv_sec = 0, .tv_nsec = -1}, + {.tv_sec = -1, .tv_nsec = 0}, + }; + + for (auto const ts : invalid) { + EXPECT_THAT(sys_clock_nanosleep(GetParam(), 0, &ts, nullptr), + SyscallFailsWithErrno(EINVAL)); + } +} + +TEST_P(WallClockNanosleepTest, SleepOneSecond) { + absl::Duration const duration = absl::Seconds(1); + struct timespec dur = absl::ToTimespec(duration); + + absl::Time const before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam())); + EXPECT_THAT(RetryEINTR(sys_clock_nanosleep)(GetParam(), 0, &dur, &dur), + SyscallSucceeds()); + absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam())); + + EXPECT_GE(after - before, duration); +} + +TEST_P(WallClockNanosleepTest, InterruptedNanosleep) { + absl::Duration const duration = absl::Seconds(60); + struct timespec dur = absl::ToTimespec(duration); + + // Install no-op signal handler for SIGALRM. + struct sigaction sa = {}; + sigfillset(&sa.sa_mask); + sa.sa_handler = +[](int signo) {}; + auto const cleanup_sa = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + // Measure time since setting the alarm, since the alarm will interrupt the + // sleep and hence determine how long we sleep. + absl::Time const before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam())); + + // Set an alarm to go off while sleeping. + struct itimerval timer = {}; + timer.it_value.tv_sec = 1; + timer.it_value.tv_usec = 0; + timer.it_interval.tv_sec = 1; + timer.it_interval.tv_usec = 0; + auto const cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, timer)); + + EXPECT_THAT(sys_clock_nanosleep(GetParam(), 0, &dur, &dur), + SyscallFailsWithErrno(EINTR)); + absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam())); + + absl::Duration const remaining = absl::DurationFromTimespec(dur); + EXPECT_GE(after - before + remaining, duration); +} + +TEST_P(WallClockNanosleepTest, SleepUntil) { + absl::Time const now = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam())); + absl::Time const until = now + absl::Seconds(2); + struct timespec ts = absl::ToTimespec(until); + + EXPECT_THAT( + RetryEINTR(sys_clock_nanosleep)(GetParam(), TIMER_ABSTIME, &ts, nullptr), + SyscallSucceeds()); + absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam())); + + EXPECT_GE(after, until); +} + +INSTANTIATE_TEST_CASE_P(Sleepers, WallClockNanosleepTest, + ::testing::Values(CLOCK_REALTIME, CLOCK_MONOTONIC)); + +TEST(ClockNanosleepProcessTest, SleepFiveSeconds) { + absl::Duration const kDuration = absl::Seconds(5); + struct timespec dur = absl::ToTimespec(kDuration); + + // Ensure that CLOCK_PROCESS_CPUTIME_ID advances. 
+  std::atomic<bool> done(false);
+  ScopedThread t([&] {
+    while (!done.load()) {
+    }
+  });
+  auto const cleanup_done = Cleanup([&] { done.store(true); });
+
+  absl::Time const before =
+      ASSERT_NO_ERRNO_AND_VALUE(GetTime(CLOCK_PROCESS_CPUTIME_ID));
+  EXPECT_THAT(
+      RetryEINTR(sys_clock_nanosleep)(CLOCK_PROCESS_CPUTIME_ID, 0, &dur, &dur),
+      SyscallSucceeds());
+  absl::Time const after =
+      ASSERT_NO_ERRNO_AND_VALUE(GetTime(CLOCK_PROCESS_CPUTIME_ID));
+  EXPECT_GE(after - before, kDuration);
+}
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
new file mode 100644
index 000000000..2c13b315c
--- /dev/null
+++ b/test/syscalls/linux/concurrency.cc
@@ -0,0 +1,124 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <atomic>
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// Test that a thread that never yields to the OS does not prevent other
+// threads from running.
+TEST(ConcurrencyTest, SingleProcessMultithreaded) {
+  std::atomic<int> a(0);
+
+  ScopedThread t([&a]() {
+    while (!a.load()) {
+    }
+  });
+
+  absl::SleepFor(absl::Seconds(1));
+
+  // We are still able to execute code in this thread. The other hasn't
+  // permanently hung execution in both threads.
+  a.store(1);
+}
+
+// Test that multiple threads in this process continue to execute in parallel,
+// even if an unrelated second process is spawned.
+TEST(ConcurrencyTest, MultiProcessMultithreaded) {
+  // In PID 1, start TIDs 1 and 2, and put both to sleep.
+  //
+  // Start PID 3, which spins for 5 seconds, then exits.
+  //
+  // TIDs 1 and 2 wake and attempt to Activate, which cannot occur until PID 3
+  // exits.
+  //
+  // Both TIDs 1 and 2 should be woken. If they are not both woken, the test
+  // hangs.
+  //
+  // This is all fundamentally racy. If we are failing to wake all threads, the
+  // expectation is that this test becomes flaky, rather than consistently
+  // failing.
+  //
+  // If additional background threads fail to block, we may never schedule the
+  // child, at which point this test effectively becomes
+  // MultiProcessConcurrency. That's not expected to occur.
+
+  std::atomic<int> a(0);
+  ScopedThread t([&a]() {
+    // Block so that PID 3 can execute and we can wait on its exit.
+    absl::SleepFor(absl::Seconds(1));
+    while (!a.load()) {
+    }
+  });
+
+  pid_t child_pid;
+  ASSERT_THAT(child_pid = fork(), SyscallSucceeds());
+  if (child_pid == 0) {
+    // Busy wait without making any blocking syscalls.
+    auto end = absl::Now() + absl::Seconds(5);
+    while (absl::Now() < end) {
+    }
+    _exit(0);
+  }
+
+  absl::SleepFor(absl::Seconds(1));
+
+  // If only TID 1 is woken, thread.Join will hang.
+  // If only TID 2 is woken, both will hang.
+ a.store(1); + t.Join(); + + int status = 0; + EXPECT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status)); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +// Test that multiple processes can execute concurrently, even if one process +// never yields. +TEST(ConcurrencyTest, MultiProcessConcurrency) { + + pid_t child_pid; + ASSERT_THAT(child_pid = fork(), SyscallSucceeds()); + if (child_pid == 0) { + while (true) { + } + __builtin_unreachable(); + } + + absl::SleepFor(absl::Seconds(5)); + + // We are still able to execute code in this process. The other hasn't + // permanently hung execution in both processes. + ASSERT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); + int status = 0; + + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/creat.cc b/test/syscalls/linux/creat.cc new file mode 100644 index 000000000..72a016b4c --- /dev/null +++ b/test/syscalls/linux/creat.cc @@ -0,0 +1,57 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr int kMode = 0666; + +TEST(CreatTest, CreatCreatesNewFile) { + std::string const path = NewTempAbsPath(); + struct stat buf; + int fd; + ASSERT_THAT(stat(path.c_str(), &buf), SyscallFailsWithErrno(ENOENT)); + ASSERT_THAT(fd = creat(path.c_str(), kMode), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + EXPECT_THAT(stat(path.c_str(), &buf), SyscallSucceeds()); +} + +TEST(CreatTest, CreatTruncatesExistingFile) { + auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + int fd; + ASSERT_NO_ERRNO(SetContents(temp_path.path(), "non-empty")); + ASSERT_THAT(fd = creat(temp_path.path().c_str(), kMode), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + std::string new_contents; + ASSERT_NO_ERRNO(GetContents(temp_path.path(), &new_contents)); + EXPECT_EQ("", new_contents); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc new file mode 100644 index 000000000..a140d3b30 --- /dev/null +++ b/test/syscalls/linux/dev.cc @@ -0,0 +1,149 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(DevTest, LseekDevUrandom) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/urandom", O_RDONLY)); + EXPECT_THAT(lseek(fd.get(), -10, SEEK_CUR), SyscallSucceeds()); + EXPECT_THAT(lseek(fd.get(), -10, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); +} + +TEST(DevTest, LseekDevNull) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY)); + EXPECT_THAT(lseek(fd.get(), -10, SEEK_CUR), SyscallSucceeds()); + EXPECT_THAT(lseek(fd.get(), -10, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds()); +} + +TEST(DevTest, LseekDevZero) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY)); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds()); +} + +TEST(DevTest, LseekDevFull) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/full", O_RDONLY)); + EXPECT_THAT(lseek(fd.get(), 123, SEEK_SET), SyscallSucceedsWithValue(0)); + EXPECT_THAT(lseek(fd.get(), 123, SEEK_CUR), SyscallSucceedsWithValue(0)); + EXPECT_THAT(lseek(fd.get(), 123, SEEK_END), SyscallSucceedsWithValue(0)); +} + +TEST(DevTest, LseekDevNullFreshFile) { + // Seeks to /dev/null always return 0. + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY)); + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY)); + + EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + EXPECT_THAT(lseek(fd1.get(), 1000, SEEK_CUR), SyscallSucceedsWithValue(0)); + EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + + const FileDescriptor fd3 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY)); + EXPECT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); +} + +TEST(DevTest, OpenTruncate) { + // Truncation is ignored on linux and gvisor for device files. + ASSERT_NO_ERRNO_AND_VALUE( + Open("/dev/null", O_CREAT | O_TRUNC | O_WRONLY, 0644)); + ASSERT_NO_ERRNO_AND_VALUE( + Open("/dev/zero", O_CREAT | O_TRUNC | O_WRONLY, 0644)); + ASSERT_NO_ERRNO_AND_VALUE( + Open("/dev/full", O_CREAT | O_TRUNC | O_WRONLY, 0644)); +} + +TEST(DevTest, Pread64DevNull) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY)); + char buf[1]; + EXPECT_THAT(pread64(fd.get(), buf, 1, 0), SyscallSucceedsWithValue(0)); +} + +TEST(DevTest, Pread64DevZero) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY)); + char buf[1]; + EXPECT_THAT(pread64(fd.get(), buf, 1, 0), SyscallSucceedsWithValue(1)); +} + +TEST(DevTest, Pread64DevFull) { + // /dev/full behaves like /dev/zero with respect to reads. 
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/full", O_RDONLY));
+  char buf[1];
+  EXPECT_THAT(pread64(fd.get(), buf, 1, 0), SyscallSucceedsWithValue(1));
+}
+
+TEST(DevTest, ReadDevNull) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY));
+  std::vector<char> buf(1);
+  EXPECT_THAT(ReadFd(fd.get(), buf.data(), 1), SyscallSucceeds());
+}
+
+// Do not allow random save as it could lead to partial reads.
+TEST(DevTest, ReadDevZero_NoRandomSave) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY));
+
+  constexpr int kReadSize = 128 * 1024;
+  std::vector<char> buf(kReadSize, 1);
+  EXPECT_THAT(ReadFd(fd.get(), buf.data(), kReadSize),
+              SyscallSucceedsWithValue(kReadSize));
+  EXPECT_EQ(std::vector<char>(kReadSize, 0), buf);
+}
+
+TEST(DevTest, WriteDevNull) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_WRONLY));
+  EXPECT_THAT(WriteFd(fd.get(), "a", 1), SyscallSucceedsWithValue(1));
+}
+
+TEST(DevTest, WriteDevZero) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY));
+  EXPECT_THAT(WriteFd(fd.get(), "a", 1), SyscallSucceedsWithValue(1));
+}
+
+TEST(DevTest, WriteDevFull) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/full", O_WRONLY));
+  EXPECT_THAT(WriteFd(fd.get(), "a", 1), SyscallFailsWithErrno(ENOSPC));
+}
+
+}  // namespace
+}  // namespace testing
+
+}  // namespace gvisor
diff --git a/test/syscalls/linux/dup.cc b/test/syscalls/linux/dup.cc
new file mode 100644
index 000000000..fc11844fb
--- /dev/null
+++ b/test/syscalls/linux/dup.cc
@@ -0,0 +1,139 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/eventfd.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<FileDescriptor> Dup2(const FileDescriptor& fd, int target_fd) {
+  int new_fd = dup2(fd.get(), target_fd);
+  if (new_fd < 0) {
+    return PosixError(errno, "Dup2");
+  }
+  return FileDescriptor(new_fd);
+}
+
+PosixErrorOr<FileDescriptor> Dup3(const FileDescriptor& fd, int target_fd,
+                                  int flags) {
+  int new_fd = dup3(fd.get(), target_fd, flags);
+  if (new_fd < 0) {
+    return PosixError(errno, "Dup3");
+  }
+  return FileDescriptor(new_fd);
+}
+
+void CheckSameFile(const FileDescriptor& fd1, const FileDescriptor& fd2) {
+  struct stat stat_result1, stat_result2;
+  ASSERT_THAT(fstat(fd1.get(), &stat_result1), SyscallSucceeds());
+  ASSERT_THAT(fstat(fd2.get(), &stat_result2), SyscallSucceeds());
+  EXPECT_EQ(stat_result1.st_dev, stat_result2.st_dev);
+  EXPECT_EQ(stat_result1.st_ino, stat_result2.st_ino);
+}
+
+TEST(DupTest, Dup) {
+  auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+  // Dup the descriptor and make sure it's the same file.
+ FileDescriptor nfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup()); + ASSERT_NE(fd.get(), nfd.get()); + CheckSameFile(fd, nfd); +} + +TEST(DupTest, DupClearsCloExec) { + FileDescriptor nfd; + + // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag set. + int event_fd = 0; + ASSERT_THAT(event_fd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds()); + FileDescriptor event_fd_closer(event_fd); + + EXPECT_THAT(fcntl(event_fd_closer.get(), F_GETFD), + SyscallSucceedsWithValue(FD_CLOEXEC)); + + // Duplicate the descriptor. Ensure that it doesn't have FD_CLOEXEC set. + nfd = ASSERT_NO_ERRNO_AND_VALUE(event_fd_closer.Dup()); + ASSERT_NE(event_fd_closer.get(), nfd.get()); + CheckSameFile(event_fd_closer, nfd); + EXPECT_THAT(fcntl(nfd.get(), F_GETFD), SyscallSucceedsWithValue(0)); +} + +TEST(DupTest, Dup2) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + // Regular dup once. + FileDescriptor nfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup()); + + ASSERT_NE(fd.get(), nfd.get()); + CheckSameFile(fd, nfd); + + // Dup over the file above. + int target_fd = nfd.release(); + FileDescriptor nfd2 = ASSERT_NO_ERRNO_AND_VALUE(Dup2(fd, target_fd)); + EXPECT_EQ(target_fd, nfd2.get()); + CheckSameFile(fd, nfd2); +} + +TEST(DupTest, Dup2SameFD) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + // Should succeed. + ASSERT_THAT(dup2(fd.get(), fd.get()), SyscallSucceedsWithValue(fd.get())); +} + +TEST(DupTest, Dup3) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + // Regular dup once. + FileDescriptor nfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup()); + ASSERT_NE(fd.get(), nfd.get()); + CheckSameFile(fd, nfd); + + // Dup over the file above, check that it has no CLOEXEC. + nfd = ASSERT_NO_ERRNO_AND_VALUE(Dup3(fd, nfd.release(), 0)); + CheckSameFile(fd, nfd); + EXPECT_THAT(fcntl(nfd.get(), F_GETFD), SyscallSucceedsWithValue(0)); + + // Dup over the file again, check that it does not CLOEXEC. + nfd = ASSERT_NO_ERRNO_AND_VALUE(Dup3(fd, nfd.release(), O_CLOEXEC)); + CheckSameFile(fd, nfd); + EXPECT_THAT(fcntl(nfd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC)); +} + +TEST(DupTest, Dup3FailsSameFD) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + // Only dup3 fails if the new and old fd are the same. + ASSERT_THAT(dup3(fd.get(), fd.get(), 0), SyscallFailsWithErrno(EINVAL)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc new file mode 100644 index 000000000..9ae87c00b --- /dev/null +++ b/test/syscalls/linux/epoll.cc @@ -0,0 +1,468 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include <time.h>
+#include <unistd.h>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kFDsPerEpoll = 3;
+constexpr uint64_t kMagicConstant = 0x0102030405060708;
+
+// Returns a new epoll file descriptor.
+PosixErrorOr<FileDescriptor> NewEpollFD() {
+  // "Since Linux 2.6.8, the size argument is ignored, but must be greater than
+  // zero." - epoll_create(2)
+  int fd = epoll_create(/* size = */ 1);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(errno, "epoll_create");
+  }
+  return FileDescriptor(fd);
+}
+
+// Returns a new eventfd.
+PosixErrorOr<FileDescriptor> NewEventFD() {
+  int fd = eventfd(/* initval = */ 0, /* flags = */ 0);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(errno, "eventfd");
+  }
+  return FileDescriptor(fd);
+}
+
+// Registers `target_fd` with the epoll instance represented by `epoll_fd` for
+// the epoll events `events`. Events on `target_fd` will be indicated by
+// setting data.u64 to `data` in the returned epoll_event.
+PosixError RegisterEpollFD(int epoll_fd, int target_fd, int events,
+                           uint64_t data) {
+  struct epoll_event event;
+  event.events = events;
+  event.data.u64 = data;
+  int rc = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, target_fd, &event);
+  MaybeSave();
+  if (rc < 0) {
+    return PosixError(errno, "epoll_ctl");
+  }
+  return NoError();
+}
+
+uint64_t ms_elapsed(const struct timespec* begin, const struct timespec* end) {
+  return (end->tv_sec - begin->tv_sec) * 1000 +
+         (end->tv_nsec - begin->tv_nsec) / 1000000;
+}
+
+TEST(EpollTest, AllWritable) {
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < kFDsPerEpoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(),
+                                    EPOLLIN | EPOLLOUT, kMagicConstant + i));
+  }
+
+  struct epoll_event result[kFDsPerEpoll];
+  ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+              SyscallSucceedsWithValue(kFDsPerEpoll));
+  // TODO: Why do some tests check epoll_event::data, and others
+  // don't? Does Linux actually guarantee that, in any of these test cases,
+  // epoll_wait will necessarily write out the epoll_events in the order that
+  // they were registered?
+ for (int i = 0; i < kFDsPerEpoll; i++) { + ASSERT_EQ(result[i].events, EPOLLOUT); + } +} + +TEST(EpollTest, LastReadable) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + std::vector eventfds; + for (int i = 0; i < kFDsPerEpoll; i++) { + eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD())); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), + EPOLLIN | EPOLLOUT, kMagicConstant + i)); + } + + uint64_t tmp = 1; + ASSERT_THAT(WriteFd(eventfds[kFDsPerEpoll - 1].get(), &tmp, sizeof(tmp)), + SyscallSucceedsWithValue(sizeof(tmp))); + + struct epoll_event result[kFDsPerEpoll]; + ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(kFDsPerEpoll)); + + int i; + for (i = 0; i < kFDsPerEpoll - 1; i++) { + EXPECT_EQ(result[i].events, EPOLLOUT); + } + EXPECT_EQ(result[i].events, EPOLLOUT | EPOLLIN); + EXPECT_EQ(result[i].data.u64, kMagicConstant + i); +} + +TEST(EpollTest, LastNonWritable) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + std::vector eventfds; + for (int i = 0; i < kFDsPerEpoll; i++) { + eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD())); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), + EPOLLIN | EPOLLOUT, kMagicConstant + i)); + } + + // Write the maximum value to the event fd so that writing to it again would + // block. + uint64_t tmp = ULLONG_MAX - 1; + ASSERT_THAT(WriteFd(eventfds[kFDsPerEpoll - 1].get(), &tmp, sizeof(tmp)), + SyscallSucceedsWithValue(sizeof(tmp))); + + struct epoll_event result[kFDsPerEpoll]; + ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(kFDsPerEpoll)); + + int i; + for (i = 0; i < kFDsPerEpoll - 1; i++) { + EXPECT_EQ(result[i].events, EPOLLOUT); + } + EXPECT_EQ(result[i].events, EPOLLIN); + EXPECT_THAT(ReadFd(eventfds[kFDsPerEpoll - 1].get(), &tmp, sizeof(tmp)), + sizeof(tmp)); + EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(kFDsPerEpoll)); + + for (i = 0; i < kFDsPerEpoll; i++) { + EXPECT_EQ(result[i].events, EPOLLOUT); + } +} + +TEST(EpollTest, Timeout_NoRandomSave) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + std::vector eventfds; + for (int i = 0; i < kFDsPerEpoll; i++) { + eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD())); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, + kMagicConstant + i)); + } + + constexpr int kTimeoutMs = 200; + struct timespec begin; + struct timespec end; + struct epoll_event result[kFDsPerEpoll]; + + { + const DisableSave ds; // Timing-related. + EXPECT_THAT(clock_gettime(CLOCK_MONOTONIC, &begin), SyscallSucceeds()); + + ASSERT_THAT( + RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, kTimeoutMs), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(clock_gettime(CLOCK_MONOTONIC, &end), SyscallSucceeds()); + } + + // Check the lower bound on the timeout. Checking for an upper bound is + // fragile because Linux can overrun the timeout due to scheduling delays. 
+ EXPECT_GT(ms_elapsed(&begin, &end), kTimeoutMs - 1); +} + +void* writer(void* arg) { + int fd = *reinterpret_cast(arg); + uint64_t tmp = 1; + + usleep(200000); + if (WriteFd(fd, &tmp, sizeof(tmp)) != sizeof(tmp)) { + fprintf(stderr, "writer failed: errno %s\n", strerror(errno)); + } + + return nullptr; +} + +TEST(EpollTest, WaitThenUnblock) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + std::vector eventfds; + for (int i = 0; i < kFDsPerEpoll; i++) { + eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD())); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, + kMagicConstant + i)); + } + + // Fire off a thread that will make at least one of the event fds readable. + pthread_t thread; + int make_readable = eventfds[0].get(); + ASSERT_THAT(pthread_create(&thread, nullptr, writer, &make_readable), + SyscallSucceedsWithValue(0)); + + struct epoll_event result[kFDsPerEpoll]; + EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(1)); + EXPECT_THAT(pthread_detach(thread), SyscallSucceeds()); +} + +void sighandler(int s) {} + +void* signaler(void* arg) { + pthread_t* t = reinterpret_cast(arg); + // Repeatedly send the real-time signal until we are detached, because it's + // difficult to know exactly when epoll_wait on another thread (which this + // is intending to interrupt) has started blocking. + while (1) { + usleep(200000); + pthread_kill(*t, SIGRTMIN); + } + return nullptr; +} + +TEST(EpollTest, UnblockWithSignal) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + std::vector eventfds; + for (int i = 0; i < kFDsPerEpoll; i++) { + eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD())); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, + kMagicConstant + i)); + } + + signal(SIGRTMIN, sighandler); + // Unblock the real time signals that InitGoogle blocks :( + sigset_t unblock; + sigemptyset(&unblock); + sigaddset(&unblock, SIGRTMIN); + ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &unblock, nullptr), SyscallSucceeds()); + + pthread_t thread; + pthread_t cur = pthread_self(); + ASSERT_THAT(pthread_create(&thread, nullptr, signaler, &cur), + SyscallSucceedsWithValue(0)); + + struct epoll_event result[kFDsPerEpoll]; + EXPECT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallFailsWithErrno(EINTR)); + EXPECT_THAT(pthread_cancel(thread), SyscallSucceeds()); + EXPECT_THAT(pthread_detach(thread), SyscallSucceeds()); +} + +TEST(EpollTest, TimeoutNoFds) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + struct epoll_event result[kFDsPerEpoll]; + EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100), + SyscallSucceedsWithValue(0)); +} + +struct addr_ctx { + int epollfd; + int eventfd; +}; + +void* fd_adder(void* arg) { + struct addr_ctx* actx = reinterpret_cast(arg); + struct epoll_event event; + event.events = EPOLLIN | EPOLLOUT; + event.data.u64 = 0xdeadbeeffacefeed; + + usleep(200000); + if (epoll_ctl(actx->epollfd, EPOLL_CTL_ADD, actx->eventfd, &event) == -1) { + fprintf(stderr, "epoll_ctl failed: %s\n", strerror(errno)); + } + + return nullptr; +} + +TEST(EpollTest, UnblockWithNewFD) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()); + + pthread_t thread; + struct addr_ctx actx = {epollfd.get(), eventfd.get()}; + ASSERT_THAT(pthread_create(&thread, nullptr, fd_adder, &actx), + SyscallSucceedsWithValue(0)); + + struct epoll_event result[kFDsPerEpoll]; + 
// Wait while no FDs are ready, but after 200ms fd_adder will add a ready FD + // to epoll which will wake us up. + EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(1)); + EXPECT_THAT(pthread_detach(thread), SyscallSucceeds()); + EXPECT_EQ(result[0].data.u64, 0xdeadbeeffacefeed); +} + +TEST(EpollTest, Oneshot) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + std::vector eventfds; + for (int i = 0; i < kFDsPerEpoll; i++) { + eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD())); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, + kMagicConstant + i)); + } + + struct epoll_event event; + event.events = EPOLLOUT | EPOLLONESHOT; + event.data.u64 = kMagicConstant; + ASSERT_THAT( + epoll_ctl(epollfd.get(), EPOLL_CTL_MOD, eventfds[0].get(), &event), + SyscallSucceeds()); + + struct epoll_event result[kFDsPerEpoll]; + // One-shot entry means that the first epoll_wait should succeed. + ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(result[0].data.u64, kMagicConstant); + + // One-shot entry means that the second epoll_wait should timeout. + EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100), + SyscallSucceedsWithValue(0)); +} + +TEST(EpollTest, EdgeTriggered_NoRandomSave) { + // Test edge-triggered entry: make it edge-triggered, first wait should + // return it, second one should time out, make it writable again, third wait + // should return it, fourth wait should timeout. + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfd.get(), + EPOLLOUT | EPOLLET, kMagicConstant)); + + struct epoll_event result[kFDsPerEpoll]; + + { + const DisableSave ds; // May trigger spurious event. + + // Edge-triggered entry means that the first epoll_wait should return the + // event. + ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(result[0].data.u64, kMagicConstant); + + // Edge-triggered entry means that the second epoll_wait should time out. + ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, 100), + SyscallSucceedsWithValue(0)); + } + + uint64_t tmp = ULLONG_MAX - 1; + + // Make an fd non-writable. + ASSERT_THAT(WriteFd(eventfd.get(), &tmp, sizeof(tmp)), + SyscallSucceedsWithValue(sizeof(tmp))); + + // Make the same fd non-writable to trigger a change, which will trigger an + // edge-triggered event. + ASSERT_THAT(ReadFd(eventfd.get(), &tmp, sizeof(tmp)), + SyscallSucceedsWithValue(sizeof(tmp))); + + { + const DisableSave ds; // May trigger spurious event. + + // An edge-triggered event should now be returned. + ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(result[0].data.u64, kMagicConstant); + + // The edge-triggered event had been consumed above, we don't expect to + // get it again. 
+ ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, 100), + SyscallSucceedsWithValue(0)); + } +} + +TEST(EpollTest, OneshotAndEdgeTriggered) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfd.get(), + EPOLLOUT | EPOLLET | EPOLLONESHOT, + kMagicConstant)); + + struct epoll_event result[kFDsPerEpoll]; + // First time one shot edge-triggered entry means that epoll_wait should + // return the event. + ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(result[0].data.u64, kMagicConstant); + + // Edge-triggered entry means that the second epoll_wait should time out. + ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100), + SyscallSucceedsWithValue(0)); + + uint64_t tmp = ULLONG_MAX - 1; + // Make an fd non-writable. + ASSERT_THAT(WriteFd(eventfd.get(), &tmp, sizeof(tmp)), + SyscallSucceedsWithValue(sizeof(tmp))); + // Make the same fd non-writable to trigger a change, which will not trigger + // an edge-triggered event because we've also included EPOLLONESHOT. + ASSERT_THAT(ReadFd(eventfd.get(), &tmp, sizeof(tmp)), + SyscallSucceedsWithValue(sizeof(tmp))); + ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100), + SyscallSucceedsWithValue(0)); +} + +TEST(EpollTest, CycleOfOneDisallowed) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + + struct epoll_event event; + event.events = EPOLLOUT; + event.data.u64 = kMagicConstant; + + ASSERT_THAT(epoll_ctl(epollfd.get(), EPOLL_CTL_ADD, epollfd.get(), &event), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(EpollTest, CycleOfThreeDisallowed) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + auto epollfd1 = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + auto epollfd2 = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + + ASSERT_NO_ERRNO( + RegisterEpollFD(epollfd.get(), epollfd1.get(), EPOLLIN, kMagicConstant)); + ASSERT_NO_ERRNO( + RegisterEpollFD(epollfd1.get(), epollfd2.get(), EPOLLIN, kMagicConstant)); + + struct epoll_event event; + event.events = EPOLLIN; + event.data.u64 = kMagicConstant; + EXPECT_THAT(epoll_ctl(epollfd2.get(), EPOLL_CTL_ADD, epollfd.get(), &event), + SyscallFailsWithErrno(ELOOP)); +} + +TEST(EpollTest, CloseFile) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()); + ASSERT_NO_ERRNO( + RegisterEpollFD(epollfd.get(), eventfd.get(), EPOLLOUT, kMagicConstant)); + + struct epoll_event result[kFDsPerEpoll]; + ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(result[0].data.u64, kMagicConstant); + + // Close the event fd early. + eventfd.reset(); + + EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100), + SyscallSucceedsWithValue(0)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc new file mode 100644 index 000000000..ffcd20622 --- /dev/null +++ b/test/syscalls/linux/eventfd.cc @@ -0,0 +1,189 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pthread.h>
+#include <stdint.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(EventfdTest, Nonblock) {
+  int efd;
+  ASSERT_THAT(efd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE),
+              SyscallSucceeds());
+
+  uint64_t l;
+  ASSERT_THAT(read(efd, &l, sizeof(l)), SyscallFailsWithErrno(EAGAIN));
+
+  l = 1;
+  ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+
+  l = 0;
+  ASSERT_THAT(read(efd, &l, sizeof(l)), SyscallSucceeds());
+  EXPECT_EQ(l, 1);
+
+  ASSERT_THAT(read(efd, &l, sizeof(l)), SyscallFailsWithErrno(EAGAIN));
+}
+
+void* read_three_times(void* arg) {
+  int efd = *reinterpret_cast<int*>(arg);
+  uint64_t l;
+  read(efd, &l, sizeof(l));
+  read(efd, &l, sizeof(l));
+  read(efd, &l, sizeof(l));
+  return nullptr;
+}
+
+TEST(EventfdTest, BlockingWrite) {
+  int efd;
+  ASSERT_THAT(efd = eventfd(0, EFD_SEMAPHORE), SyscallSucceeds());
+
+  pthread_t p;
+  ASSERT_THAT(pthread_create(&p, nullptr, read_three_times,
+                             reinterpret_cast<void*>(&efd)),
+              SyscallSucceeds());
+
+  uint64_t l = 1;
+  ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+  EXPECT_EQ(l, 1);
+
+  ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+  EXPECT_EQ(l, 1);
+
+  ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+  EXPECT_EQ(l, 1);
+
+  ASSERT_THAT(pthread_join(p, nullptr), SyscallSucceeds());
+}
+
+TEST(EventfdTest, SmallWrite) {
+  int efd;
+  ASSERT_THAT(efd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE),
+              SyscallSucceeds());
+
+  uint64_t l = 16;
+  ASSERT_THAT(write(efd, &l, 4), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(EventfdTest, SmallRead) {
+  int efd;
+  ASSERT_THAT(efd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE),
+              SyscallSucceeds());
+
+  uint64_t l = 1;
+  ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+
+  l = 0;
+  ASSERT_THAT(read(efd, &l, 4), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(EventfdTest, BigWrite) {
+  int efd;
+  ASSERT_THAT(efd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE),
+              SyscallSucceeds());
+
+  uint64_t big[16];
+  big[0] = 16;
+  ASSERT_THAT(write(efd, big, sizeof(big)), SyscallSucceeds());
+}
+
+TEST(EventfdTest, BigRead) {
+  int efd;
+  ASSERT_THAT(efd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE),
+              SyscallSucceeds());
+
+  uint64_t l = 1;
+  ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+
+  uint64_t big[16];
+  ASSERT_THAT(read(efd, big, sizeof(big)), SyscallSucceeds());
+  EXPECT_EQ(big[0], 1);
+}
+
+TEST(EventfdTest, BigWriteBigRead) {
+  int efd;
+  ASSERT_THAT(efd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE),
+              SyscallSucceeds());
+
+  uint64_t l[16];
+  l[0] = 16;
+  ASSERT_THAT(write(efd, l, sizeof(l)), SyscallSucceeds());
+  ASSERT_THAT(read(efd, l, sizeof(l)), SyscallSucceeds());
+  EXPECT_EQ(l[0], 1);
+}
+
+// NotifyNonZero is inherently racy, so random save is disabled.
+TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
+  // Waits will time out at 10 seconds.
+  constexpr int kEpollTimeoutMs = 10000;
+  // Create an eventfd descriptor.
+ int efd; + ASSERT_THAT(efd = eventfd(7, EFD_SEMAPHORE | EFD_NONBLOCK), + SyscallSucceeds()); + // Create an epoll fd to listen to efd. + int epollfd; + ASSERT_THAT(epollfd = epoll_create1(0), SyscallSucceeds()); + // Add efd to epoll. + struct epoll_event add_ev; + add_ev.events = EPOLLIN | EPOLLET; + add_ev.data.fd = efd; + ASSERT_THAT(epoll_ctl(epollfd, EPOLL_CTL_ADD, efd, &add_ev), + SyscallSucceeds()); + + // Use epoll to get a value from efd. + struct epoll_event out_ev; + int wait_out = epoll_wait(epollfd, &out_ev, 1, kEpollTimeoutMs); + EXPECT_EQ(wait_out, 1); + EXPECT_EQ(efd, out_ev.data.fd); + uint64_t val = 0; + ASSERT_THAT(read(efd, &val, sizeof(val)), SyscallSucceeds()); + EXPECT_EQ(val, 1); + + // Start a thread that, after this thread blocks on epoll_wait, will write to + // efd. This is racy -- it's possible that this write will happen after + // epoll_wait times out. + ScopedThread t([efd] { + sleep(5); + uint64_t val = 1; + write(efd, &val, sizeof(val)); + }); + + // epoll_wait should return once the thread writes. + wait_out = epoll_wait(epollfd, &out_ev, 1, kEpollTimeoutMs); + EXPECT_EQ(wait_out, 1); + EXPECT_EQ(efd, out_ev.data.fd); + + val = 0; + ASSERT_THAT(read(efd, &val, sizeof(val)), SyscallSucceeds()); + EXPECT_EQ(val, 1); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc new file mode 100644 index 000000000..72ab354e3 --- /dev/null +++ b/test/syscalls/linux/exceptions.cc @@ -0,0 +1,146 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "gtest/gtest.h" +#include "test/util/logging.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +void inline Halt() { asm("hlt\r\n"); } + +void inline SetAlignmentCheck() { + asm("pushf\r\n" + "pop %%rax\r\n" + "or $0x40000, %%rax\r\n" + "push %%rax\r\n" + "popf\r\n" + : + : + : "ax"); +} + +void inline ClearAlignmentCheck() { + asm("pushf\r\n" + "pop %%rax\r\n" + "mov $0x40000, %%rbx\r\n" + "not %%rbx\r\n" + "and %%rbx, %%rax\r\n" + "push %%rax\r\n" + "popf\r\n" + : + : + : "ax", "bx"); +} + +void inline Int3Normal() { asm(".byte 0xcd, 0x03\r\n"); } + +void inline Int3Compact() { asm(".byte 0xcc\r\n"); } + +TEST(ExceptionTest, Halt) { + // In order to prevent the regular handler from messing with things (and + // perhaps refaulting until some other signal occurs), we reset the handler to + // the default action here and ensure that it dies correctly. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa)); + + EXPECT_EXIT(Halt(), ::testing::KilledBySignal(SIGSEGV), ""); +} + +TEST(ExceptionTest, DivideByZero) { + // See above. 
+ struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa)); + + EXPECT_EXIT( + { + uint32_t remainder; + uint32_t quotient; + uint32_t divisor = 0; + uint64_t value = 1; + asm("divl 0(%2)\r\n" + : "=d"(remainder), "=a"(quotient) + : "r"(&divisor), "d"(value >> 32), "a"(value)); + TEST_CHECK(quotient > 0); // Force dependency. + }, + ::testing::KilledBySignal(SIGFPE), ""); +} + +TEST(ExceptionTest, Alignment) { + SetAlignmentCheck(); + ClearAlignmentCheck(); +} + +TEST(ExceptionTest, AlignmentHalt) { + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa)); + + // Reported upstream. We need to ensure that bad flags are cleared even in + // fault paths. Set the alignment flag and then generate an exception. + EXPECT_EXIT( + { + SetAlignmentCheck(); + Halt(); + }, + ::testing::KilledBySignal(SIGSEGV), ""); +} + +TEST(ExceptionTest, AlignmentCheck) { + + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGBUS, sa)); + + EXPECT_EXIT( + { + char array[16]; + SetAlignmentCheck(); + for (int i = 0; i < 8; i++) { + // At least 7/8 offsets will be unaligned here. + uint64_t* ptr = reinterpret_cast(&array[i]); + asm("mov %0, 0(%0)\r\n" : : "r"(ptr) : "ax"); + } + }, + ::testing::KilledBySignal(SIGBUS), ""); +} + +TEST(ExceptionTest, Int3Normal) { + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGTRAP, sa)); + + EXPECT_EXIT(Int3Normal(), ::testing::KilledBySignal(SIGTRAP), ""); +} + +TEST(ExceptionTest, Int3Compact) { + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGTRAP, sa)); + + EXPECT_EXIT(Int3Compact(), ::testing::KilledBySignal(SIGTRAP), ""); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc new file mode 100644 index 000000000..1ef40b502 --- /dev/null +++ b/test/syscalls/linux/exec.cc @@ -0,0 +1,625 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/syscalls/linux/exec.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr char kBasicWorkload[] = "exec_basic_workload"; +constexpr char kExitScript[] = "exit_script"; +constexpr char kStateWorkload[] = "exec_state_workload"; +constexpr char kProcExeWorkload[] = "exec_proc_exe_workload"; +constexpr char kAssertClosedWorkload[] = "exec_assert_closed_workload"; +constexpr char kPriorityWorkload[] = "priority_execve"; + +std::string WorkloadPath(absl::string_view binary) { + std::string full_path; + char* test_src = getenv("TEST_SRCDIR"); + if (test_src) { + full_path = JoinPath(test_src, "__main__/test/syscalls/linux", binary); + } + TEST_CHECK(full_path.empty() == false); + return full_path; +} + +constexpr char kExit42[] = "--exec_exit_42"; +constexpr char kExecWithThread[] = "--exec_exec_with_thread"; +constexpr char kExecFromThread[] = "--exec_exec_from_thread"; + +// Runs filename with argv and checks that the exit status is expect_status and +// that stderr contains expect_stderr. +void CheckOutput(const std::string& filename, const ExecveArray& argv, + const ExecveArray& envv, int expect_status, + const std::string& expect_stderr) { + int pipe_fds[2]; + ASSERT_THAT(pipe2(pipe_fds, O_CLOEXEC), SyscallSucceeds()); + + FileDescriptor read_fd(pipe_fds[0]); + FileDescriptor write_fd(pipe_fds[1]); + + pid_t child; + int execve_errno; + + const auto remap_stderr = [pipe_fds] { + // Remap stdin and stdout to /dev/null. + int fd = open("/dev/null", O_RDWR | O_CLOEXEC); + if (fd < 0) { + _exit(errno); + } + + int ret = dup2(fd, 0); + if (ret < 0) { + _exit(errno); + } + + ret = dup2(fd, 1); + if (ret < 0) { + _exit(errno); + } + + // And stderr to the pipe. + ret = dup2(pipe_fds[1], 2); + if (ret < 0) { + _exit(errno); + } + + // Here, we'd ideally close all other FDs inherited from the parent. + // However, that's not worth the effort and CloexecNormalFile and + // CloexecEventfd depend on that not happening. + }; + + auto kill = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(filename, argv, envv, remap_stderr, &child, &execve_errno)); + + ASSERT_EQ(0, execve_errno); + + // Not needed anymore. + write_fd.reset(); + + // Read stderr until the child exits. + std::string output; + constexpr int kSize = 128; + char buf[kSize]; + int n; + do { + ASSERT_THAT(n = ReadFd(read_fd.get(), buf, kSize), SyscallSucceeds()); + if (n > 0) { + output.append(buf, n); + } + } while (n > 0); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds()); + EXPECT_EQ(status, expect_status); + + // Process cleanup no longer needed. 
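+  // (waitpid above has already reaped the child, so disarm the cleanup that
+  // would otherwise try to kill it.)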
+ kill.Release(); + + EXPECT_TRUE(absl::StrContains(output, expect_stderr)) << output; +} + +TEST(ExecDeathTest, EmptyPath) { + int execve_errno; + ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec("", {}, {}, nullptr, &execve_errno)); + EXPECT_EQ(execve_errno, ENOENT); +} + +TEST(ExecDeathTest, Basic) { + CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {}, + ArgEnvExitStatus(0, 0), + absl::StrCat(WorkloadPath(kBasicWorkload), "\n")); +} + +TEST(ExecDeathTest, OneArg) { + CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "1"}, + {}, ArgEnvExitStatus(1, 0), + absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n")); +} + +TEST(ExecDeathTest, FiveArg) { + CheckOutput(WorkloadPath(kBasicWorkload), + {WorkloadPath(kBasicWorkload), "1", "2", "3", "4", "5"}, {}, + ArgEnvExitStatus(5, 0), + absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n")); +} + +TEST(ExecDeathTest, OneEnv) { + CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, + {"1"}, ArgEnvExitStatus(0, 1), + absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n")); +} + +TEST(ExecDeathTest, FiveEnv) { + CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, + {"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5), + absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n")); +} + +TEST(ExecDeathTest, OneArgOneEnv) { + CheckOutput(WorkloadPath(kBasicWorkload), + {WorkloadPath(kBasicWorkload), "arg"}, {"env"}, + ArgEnvExitStatus(1, 1), + absl::StrCat(WorkloadPath(kBasicWorkload), "\narg\nenv\n")); +} + +TEST(ExecDeathTest, InterpreterScript) { + CheckOutput(WorkloadPath(kExitScript), {WorkloadPath(kExitScript), "25"}, {}, + ArgEnvExitStatus(25, 0), ""); +} + +// Everything after the path in the interpreter script is a single argument. +TEST(ExecDeathTest, InterpreterScriptArgSplit) { + // Symlink through /tmp to ensure the path is short enough. + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo bar"), + 0755)); + + CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0), + absl::StrCat(link.path(), "\nfoo bar\n", script.path(), "\n")); +} + +// Original argv[0] is replaced with the script path. +TEST(ExecDeathTest, InterpreterScriptArgvZero) { + // Symlink through /tmp to ensure the path is short enough. + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755)); + + CheckOutput(script.path(), {"REPLACED"}, {}, ArgEnvExitStatus(1, 0), + absl::StrCat(link.path(), "\n", script.path(), "\n")); +} + +// Original argv[0] is replaced with the script path, exactly as passed to +// execve. +TEST(ExecDeathTest, InterpreterScriptArgvZeroRelative) { + // Symlink through /tmp to ensure the path is short enough. 
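+  // (The kernel truncates the #! line at BINPRM_BUF_SIZE bytes, historically
+  // 128, so an overly long interpreter path would silently break the script.)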
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755)); + + auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD()); + auto script_relative = + ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, script.path())); + + CheckOutput(script_relative, {"REPLACED"}, {}, ArgEnvExitStatus(1, 0), + absl::StrCat(link.path(), "\n", script_relative, "\n")); +} + +// argv[0] is added as the script path, even if there was none. +TEST(ExecDeathTest, InterpreterScriptArgvZeroAdded) { + // Symlink through /tmp to ensure the path is short enough. + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755)); + + CheckOutput(script.path(), {}, {}, ArgEnvExitStatus(1, 0), + absl::StrCat(link.path(), "\n", script.path(), "\n")); +} + +// A NUL byte in the script line ends parsing. +TEST(ExecDeathTest, InterpreterScriptArgNUL) { + // Symlink through /tmp to ensure the path is short enough. + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), + absl::StrCat("#!", link.path(), " foo", std::string(1, '\0'), "bar"), 0755)); + + CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0), + absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n")); +} + +// Trailing whitespace following interpreter path is ignored. +TEST(ExecDeathTest, InterpreterScriptTrailingWhitespace) { + // Symlink through /tmp to ensure the path is short enough. + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " "), 0755)); + + CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(1, 0), + absl::StrCat(link.path(), "\n", script.path(), "\n")); +} + +// Multiple whitespace characters between interpreter and arg allowed. +TEST(ExecDeathTest, InterpreterScriptArgWhitespace) { + // Symlink through /tmp to ensure the path is short enough. + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo"), 0755)); + + CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0), + absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n")); +} + +TEST(ExecDeathTest, InterpreterScriptNoPath) { + TempPath script = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "#!", 0755)); + + int execve_errno; + ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, nullptr, &execve_errno)); + EXPECT_EQ(execve_errno, ENOEXEC); +} + +// AT_EXECFN is the path passed to execve. +TEST(ExecDeathTest, ExecFn) { + // Symlink through /tmp to ensure the path is short enough. 
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " PrintExecFn"), + 0755)); + + // Pass the script as a relative path and assert that is what appears in + // AT_EXECFN. + auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD()); + auto script_relative = + ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, script.path())); + + CheckOutput(script_relative, {script_relative}, {}, ArgEnvExitStatus(0, 0), + absl::StrCat(script_relative, "\n")); +} + +TEST(ExecDeathTest, ExecName) { + std::string path = WorkloadPath(kStateWorkload); + + CheckOutput(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0), + absl::StrCat(Basename(path).substr(0, 15), "\n")); +} + +TEST(ExecDeathTest, ExecNameScript) { + // Symlink through /tmp to ensure the path is short enough. + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload))); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), + absl::StrCat("#!", link.path(), " PrintExecName"), 0755)); + + std::string script_path = script.path(); + + CheckOutput(script_path, {script_path}, {}, ArgEnvExitStatus(0, 0), + absl::StrCat(Basename(script_path).substr(0, 15), "\n")); +} + +// execve may be called by a multithreaded process. +TEST(ExecDeathTest, WithSiblingThread) { + CheckOutput("/proc/self/exe", {"/proc/self/exe", kExecWithThread}, {}, + W_EXITCODE(42, 0), ""); +} + +// execve may be called from a thread other than the leader of a multithreaded +// process. +TEST(ExecDeathTest, FromSiblingThread) { + CheckOutput("/proc/self/exe", {"/proc/self/exe", kExecFromThread}, {}, + W_EXITCODE(42, 0), ""); +} + +TEST(ExecTest, NotFound) { + char* const argv[] = {nullptr}; + char* const envp[] = {nullptr}; + EXPECT_THAT(execve("/file/does/not/exist", argv, envp), + SyscallFailsWithErrno(ENOENT)); +} + +TEST(ExecTest, NoExecPerm) { + char* const argv[] = {nullptr}; + char* const envp[] = {nullptr}; + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + EXPECT_THAT(execve(f.path().c_str(), argv, envp), + SyscallFailsWithErrno(EACCES)); +} + +// A signal handler we never expect to be called. +void SignalHandler(int signo) { + std::cerr << "Signal " << signo << " raised." << std::endl; + exit(1); +} + +// Signal handlers are reset on execve(2), unless they have default or ignored +// disposition. +TEST(ExecStateDeathTest, HandlerReset) { + struct sigaction sa; + sa.sa_handler = SignalHandler; + ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds()); + + ExecveArray args = { + WorkloadPath(kStateWorkload), + "CheckSigHandler", + absl::StrCat(SIGUSR1), + absl::StrCat(absl::Hex(reinterpret_cast(SIG_DFL))), + }; + + CheckOutput(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); +} + +// Ignored signal dispositions are not reset. 
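+//
+// Per execve(2), a disposition of SIG_IGN or SIG_DFL survives exec, while a
+// caught handler reverts to SIG_DFL because the handler address is meaningless
+// in the new program image.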
+TEST(ExecStateDeathTest, IgnorePreserved) { + struct sigaction sa; + sa.sa_handler = SIG_IGN; + ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds()); + + ExecveArray args = { + WorkloadPath(kStateWorkload), + "CheckSigHandler", + absl::StrCat(SIGUSR1), + absl::StrCat(absl::Hex(reinterpret_cast(SIG_IGN))), + }; + + CheckOutput(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); +} + +// Signal masks are not reset on exec +TEST(ExecStateDeathTest, SignalMask) { + sigset_t s; + sigemptyset(&s); + sigaddset(&s, SIGUSR1); + ASSERT_THAT(sigprocmask(SIG_BLOCK, &s, nullptr), SyscallSucceeds()); + + ExecveArray args = { + WorkloadPath(kStateWorkload), + "CheckSigBlocked", + absl::StrCat(SIGUSR1), + }; + + CheckOutput(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); +} + +// itimers persist across execve. +// N.B. Timers created with timer_create(2) should not be preserved! +TEST(ExecStateDeathTest, ItimerPreserved) { + // The fork in ForkAndExec clears itimers, so only set them up after fork. + auto setup_itimer = [] { + // Ignore SIGALRM, as we don't actually care about timer + // expirations. + struct sigaction sa; + sa.sa_handler = SIG_IGN; + int ret = sigaction(SIGALRM, &sa, nullptr); + if (ret < 0) { + _exit(errno); + } + + struct itimerval itv; + itv.it_interval.tv_sec = 1; + itv.it_interval.tv_usec = 0; + itv.it_value.tv_sec = 1; + itv.it_value.tv_usec = 0; + ret = setitimer(ITIMER_REAL, &itv, nullptr); + if (ret < 0) { + _exit(errno); + } + }; + + std::string filename = WorkloadPath(kStateWorkload); + ExecveArray argv = { + filename, + "CheckItimerEnabled", + absl::StrCat(ITIMER_REAL), + }; + + pid_t child; + int execve_errno; + auto kill = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(filename, argv, {}, setup_itimer, &child, &execve_errno)); + ASSERT_EQ(0, execve_errno); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds()); + EXPECT_EQ(0, status); + + // Process cleanup no longer needed. + kill.Release(); +} + +TEST(ProcSelfExe, ChangesAcrossExecve) { + // See exec_proc_exe_workload for more details. We simply + // assert that the /proc/self/exe link changes across execve. + CheckOutput(WorkloadPath(kProcExeWorkload), + {WorkloadPath(kProcExeWorkload), + ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))}, + {}, W_EXITCODE(0, 0), ""); +} + +TEST(ExecTest, CloexecNormalFile) { + const FileDescriptor fd_closed_on_exec = ASSERT_NO_ERRNO_AND_VALUE( + Open("/usr/share/zoneinfo", O_RDONLY | O_CLOEXEC)); + + CheckOutput(WorkloadPath(kAssertClosedWorkload), + {WorkloadPath(kAssertClosedWorkload), + absl::StrCat(fd_closed_on_exec.get())}, + {}, W_EXITCODE(0, 0), ""); + + // The assert closed workload exits with code 2 if the file still exists. We + // can use this to do a negative test. 
+ const FileDescriptor fd_open_on_exec = + ASSERT_NO_ERRNO_AND_VALUE(Open("/usr/share/zoneinfo", O_RDONLY)); + + CheckOutput(WorkloadPath(kAssertClosedWorkload), + {WorkloadPath(kAssertClosedWorkload), + absl::StrCat(fd_open_on_exec.get())}, + {}, W_EXITCODE(2, 0), ""); +} + +TEST(ExecTest, CloexecEventfd) { + int efd; + ASSERT_THAT(efd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds()); + FileDescriptor fd(efd); + + CheckOutput(WorkloadPath(kAssertClosedWorkload), + {WorkloadPath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {}, + W_EXITCODE(0, 0), ""); +} + +// Priority consistent across calls to execve() +TEST(GetpriorityTest, ExecveMaintainsPriority) { + int prio = 16; + ASSERT_THAT(setpriority(PRIO_PROCESS, getpid(), prio), SyscallSucceeds()); + + // To avoid trying to use negative exit values, check for + // 20 - prio. Since prio should always be in the range [-20, 19], + // this leave expected_exit_code in the range [1, 40]. + int expected_exit_code = 20 - prio; + + // Program run (priority_execve) will exit(X) where + // X=getpriority(PRIO_PROCESS,0). Check that this exit value is prio. + CheckOutput(WorkloadPath(kPriorityWorkload), + {WorkloadPath(kPriorityWorkload)}, {}, + W_EXITCODE(expected_exit_code, 0), ""); +} + +void ExecWithThread() { + // Used to ensure that the thread has actually started. + absl::Mutex mu; + bool started = false; + + ScopedThread t([&] { + mu.Lock(); + started = true; + mu.Unlock(); + + while (true) { + pause(); + } + }); + + mu.LockWhen(absl::Condition(&started)); + mu.Unlock(); + + const ExecveArray argv = {"/proc/self/exe", kExit42}; + const ExecveArray envv; + + execve("/proc/self/exe", argv.get(), envv.get()); + exit(errno); +} + +void ExecFromThread() { + ScopedThread t([] { + const ExecveArray argv = {"/proc/self/exe", kExit42}; + const ExecveArray envv; + + execve("/proc/self/exe", argv.get(), envv.get()); + exit(errno); + }); + + while (true) { + pause(); + } +} + +bool ValidateProcCmdlineVsArgv(const int argc, const char* const* argv) { + auto contents_or = GetContents("/proc/self/cmdline"); + if (!contents_or.ok()) { + LOG(ERROR) << "Unable to get /proc/self/cmdline: " << contents_or.error(); + return false; + } + auto contents = contents_or.ValueOrDie(); + if (contents.back() != '\0') { + LOG(ERROR) << "Non-null terminated /proc/self/cmdline!"; + return false; + } + contents.pop_back(); + std::vector procfs_cmdline = absl::StrSplit(contents, '\0'); + + if (static_cast(procfs_cmdline.size()) != argc) { + LOG(ERROR) << "argc = " << argc << " != " << procfs_cmdline.size(); + return false; + } + + for (int i = 0; i < argc; ++i) { + if (procfs_cmdline[i] != argv[i]) { + LOG(ERROR) << "Procfs command line argument " << i << " mismatch " + << procfs_cmdline[i] << " != " << argv[i]; + return false; + } + } + return true; +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + // Start by validating that the stack argv is consistent with procfs. + if (!gvisor::testing::ValidateProcCmdlineVsArgv(argc, argv)) { + return 1; + } + + // Some of these tests require no background threads, so check for them before + // TestInit. 
+ for (int i = 0; i < argc; i++) { + absl::string_view arg(argv[i]); + + if (arg == gvisor::testing::kExit42) { + return 42; + } + if (arg == gvisor::testing::kExecWithThread) { + gvisor::testing::ExecWithThread(); + return 1; + } + if (arg == gvisor::testing::kExecFromThread) { + gvisor::testing::ExecFromThread(); + return 1; + } + } + + gvisor::testing::TestInit(&argc, &argv); + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/exec.h b/test/syscalls/linux/exec.h new file mode 100644 index 000000000..b82bfffd1 --- /dev/null +++ b/test/syscalls/linux/exec.h @@ -0,0 +1,34 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_EXEC_H_ +#define GVISOR_TEST_SYSCALLS_EXEC_H_ + +#include + +namespace gvisor { +namespace testing { + +// Returns the exit code used by exec_basic_workload. +inline int ArgEnvExitCode(int args, int envs) { return args + envs * 10; } + +// Returns the exit status used by exec_basic_workload. +inline int ArgEnvExitStatus(int args, int envs) { + return W_EXITCODE(ArgEnvExitCode(args, envs), 0); +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_EXEC_H_ diff --git a/test/syscalls/linux/exec_assert_closed_workload.cc b/test/syscalls/linux/exec_assert_closed_workload.cc new file mode 100644 index 000000000..4448431e1 --- /dev/null +++ b/test/syscalls/linux/exec_assert_closed_workload.cc @@ -0,0 +1,45 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
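+
+// Helper binary for the exec tests: takes a single file descriptor number as
+// argv[1] and exits 0 if that descriptor is closed (fstat fails with EBADF),
+// or 2 if it is unexpectedly still open.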
+ +#include +#include +#include +#include + +#include + +#include "absl/strings/numbers.h" + +int main(int argc, char** argv) { + if (argc != 2) { + std::cerr << "need two arguments, got " << argc; + exit(1); + } + int fd; + if (!absl::SimpleAtoi(argv[1], &fd)) { + std::cerr << "fd: " << argv[1] << " could not be parsed" << std::endl; + exit(1); + } + struct stat s; + if (fstat(fd, &s) == 0) { + std::cerr << "fd: " << argv[1] << " should not be valid" << std::endl; + exit(2); + } + if (errno != EBADF) { + std::cerr << "fstat fd: " << argv[1] << " got errno: " << errno + << " wanted: " << EBADF << std::endl; + exit(1); + } + return 0; +} diff --git a/test/syscalls/linux/exec_basic_workload.cc b/test/syscalls/linux/exec_basic_workload.cc new file mode 100644 index 000000000..d4bdf511f --- /dev/null +++ b/test/syscalls/linux/exec_basic_workload.cc @@ -0,0 +1,31 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "test/syscalls/linux/exec.h" + +int main(int argc, char** argv, char** envp) { + int i; + for (i = 0; i < argc; i++) { + std::cerr << argv[i] << std::endl; + } + for (i = 0; envp[i] != nullptr; i++) { + std::cerr << envp[i] << std::endl; + } + exit(gvisor::testing::ArgEnvExitCode(argc - 1, i)); + return 0; +} diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc new file mode 100644 index 000000000..cfc898699 --- /dev/null +++ b/test/syscalls/linux/exec_binary.cc @@ -0,0 +1,1367 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/proc_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +using ::testing::AnyOf; +using ::testing::Eq; + +#ifndef __x86_64__ +// The assembly stub and ELF internal details must be ported to other arches. +#error "Test only supported on x86-64" +#endif // __x86_64__ + +// amd64 stub that calls PTRACE_TRACEME and sends itself SIGSTOP. 
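+//
+// The bytes below are hand-assembled x86-64 machine code, roughly equivalent
+// to:
+//
+//   ptrace(PTRACE_TRACEME, 0, 0, 0);
+//   kill(getpid(), SIGSTOP);
+//
+// so that the tests can emit it directly into synthetic ELF images.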
+const char kPtraceCode[] = { + // movq $101, %rax /* ptrace */ + '\x48', + '\xc7', + '\xc0', + '\x65', + '\x00', + '\x00', + '\x00', + // movq $0, %rsi /* PTRACE_TRACEME */ + '\x48', + '\xc7', + '\xc6', + '\x00', + '\x00', + '\x00', + '\x00', + // movq $0, %rdi + '\x48', + '\xc7', + '\xc7', + '\x00', + '\x00', + '\x00', + '\x00', + // movq $0, %rdx + '\x48', + '\xc7', + '\xc2', + '\x00', + '\x00', + '\x00', + '\x00', + // movq $0, %r10 + '\x49', + '\xc7', + '\xc2', + '\x00', + '\x00', + '\x00', + '\x00', + // syscall + '\x0f', + '\x05', + + // movq $39, %rax /* getpid */ + '\x48', + '\xc7', + '\xc0', + '\x27', + '\x00', + '\x00', + '\x00', + // syscall + '\x0f', + '\x05', + + // movq %rax, %rdi /* pid */ + '\x48', + '\x89', + '\xc7', + // movq $62, %rax /* kill */ + '\x48', + '\xc7', + '\xc0', + '\x3e', + '\x00', + '\x00', + '\x00', + // movq $19, %rsi /* SIGSTOP */ + '\x48', + '\xc7', + '\xc6', + '\x13', + '\x00', + '\x00', + '\x00', + // syscall + '\x0f', + '\x05', +}; + +// Size of a syscall instruction. +constexpr int kSyscallSize = 2; + +// This test suite tests executable loading in the kernel (ELF and interpreter +// scripts). + +// Parameterized ELF types for 64 and 32 bit. +template +struct ElfTypes; + +template <> +struct ElfTypes<64> { + typedef Elf64_Ehdr ElfEhdr; + typedef Elf64_Phdr ElfPhdr; +}; + +template <> +struct ElfTypes<32> { + typedef Elf32_Ehdr ElfEhdr; + typedef Elf32_Phdr ElfPhdr; +}; + +template +struct ElfBinary { + using ElfEhdr = typename ElfTypes::ElfEhdr; + using ElfPhdr = typename ElfTypes::ElfPhdr; + + ElfEhdr header = {}; + std::vector phdrs; + std::vector data; + + // UpdateOffsets updates p_offset, p_vaddr in all phdrs to account for the + // space taken by the header and phdrs. + // + // It also updates header.e_phnum and adds the offset to header.e_entry to + // account for the headers residing in the first PT_LOAD segment. + // + // Before calling UpdateOffsets each of those fields should be the appropriate + // offset into data. + void UpdateOffsets() { + size_t offset = sizeof(header) + phdrs.size() * sizeof(ElfPhdr); + header.e_entry += offset; + header.e_phnum = phdrs.size(); + for (auto& p : phdrs) { + p.p_offset += offset; + p.p_vaddr += offset; + } + } + + // AddInterpreter adds a PT_INTERP segment with the passed contents. + // + // A later call to UpdateOffsets is required to make the new phdr valid. + void AddInterpreter(std::vector contents) { + const int start = data.size(); + data.insert(data.end(), contents.begin(), contents.end()); + const int size = data.size() - start; + + ElfPhdr phdr = {}; + phdr.p_type = PT_INTERP; + phdr.p_offset = start; + phdr.p_filesz = size; + phdr.p_memsz = size; + // "If [PT_INTERP] is present, it must precede any loadable segment entry." + phdrs.insert(phdrs.begin(), phdr); + } + + // Writes the header, phdrs, and data to fd. 
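+  //
+  // The resulting file layout is the ElfEhdr, then e_phnum ElfPhdr entries,
+  // then the raw data bytes; callers are expected to have called
+  // UpdateOffsets() so that e_entry, p_offset, and p_vaddr account for this
+  // prefix.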
+ PosixError Write(int fd) const { + int ret = WriteFd(fd, &header, sizeof(header)); + if (ret < 0) { + return PosixError(errno, "failed to write header"); + } else if (ret != sizeof(header)) { + return PosixError(EIO, absl::StrCat("short write of header: ", ret)); + } + + for (auto const& p : phdrs) { + ret = WriteFd(fd, &p, sizeof(p)); + if (ret < 0) { + return PosixError(errno, "failed to write phdr"); + } else if (ret != sizeof(p)) { + return PosixError(EIO, absl::StrCat("short write of phdr: ", ret)); + } + } + + ret = WriteFd(fd, data.data(), data.size()); + if (ret < 0) { + return PosixError(errno, "failed to write data"); + } else if (ret != static_cast(data.size())) { + return PosixError(EIO, absl::StrCat("short write of data: ", ret)); + } + + return NoError(); + } +}; + +// Creates a new temporary executable ELF file in parent with elf as the +// contents. +template +PosixErrorOr CreateElfWith(absl::string_view parent, + ElfBinary const& elf) { + ASSIGN_OR_RETURN_ERRNO( + auto file, TempPath::CreateFileWith(parent, absl::string_view(), 0755)); + ASSIGN_OR_RETURN_ERRNO(auto fd, Open(file.path(), O_RDWR)); + RETURN_IF_ERRNO(elf.Write(fd.get())); + return std::move(file); +} + +// Creates a new temporary executable ELF file with elf as the contents. +template +PosixErrorOr CreateElfWith(ElfBinary const& elf) { + return CreateElfWith(GetAbsoluteTestTmpdir(), elf); +} + +// Wait for pid to stop, and assert that it stopped via SIGSTOP. +PosixError WaitStopped(pid_t pid) { + int status; + int ret = RetryEINTR(waitpid)(pid, &status, 0); + MaybeSave(); + if (ret < 0) { + return PosixError(errno, "wait failed"); + } else if (ret != pid) { + return PosixError(ESRCH, absl::StrCat("wait got ", ret, " want ", pid)); + } + + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + return PosixError(EINVAL, + absl::StrCat("pid did not SIGSTOP; status = ", status)); + } + + return NoError(); +} + +// Returns a valid ELF that PTRACE_TRACEME and SIGSTOPs itself. +// +// UpdateOffsets must be called before writing this ELF. +ElfBinary<64> StandardElf() { + ElfBinary<64> elf; + elf.header.e_ident[EI_MAG0] = ELFMAG0; + elf.header.e_ident[EI_MAG1] = ELFMAG1; + elf.header.e_ident[EI_MAG2] = ELFMAG2; + elf.header.e_ident[EI_MAG3] = ELFMAG3; + elf.header.e_ident[EI_CLASS] = ELFCLASS64; + elf.header.e_ident[EI_DATA] = ELFDATA2LSB; + elf.header.e_ident[EI_VERSION] = EV_CURRENT; + elf.header.e_type = ET_EXEC; + elf.header.e_machine = EM_X86_64; + elf.header.e_version = EV_CURRENT; + elf.header.e_phoff = sizeof(elf.header); + elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr); + + // TODO: Always include a PT_GNU_STACK segment to disable + // executable stacks. With this omitted the stack (and all PROT_READ) mappings + // should be executable, but gVisor doesn't support that. + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_GNU_STACK; + phdr.p_flags = PF_R | PF_W; + elf.phdrs.push_back(phdr); + + phdr = {}; + phdr.p_type = PT_LOAD; + phdr.p_flags = PF_R | PF_X; + phdr.p_offset = 0; + phdr.p_vaddr = 0x40000; + phdr.p_filesz = sizeof(kPtraceCode); + phdr.p_memsz = phdr.p_filesz; + elf.phdrs.push_back(phdr); + + elf.header.e_entry = phdr.p_vaddr; + + elf.data.assign(kPtraceCode, kPtraceCode + sizeof(kPtraceCode)); + + return elf; +} + +// Test that a trivial binary executes. 
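+//
+// The ProcMapsEntry initializers used with ContainsMappings below follow
+// /proc/<pid>/maps order: start, end, the read/write/execute/private flags,
+// file offset, device major/minor, inode, and filename.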
+TEST(ElfTest, Execute) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + // Ensure it made it to SIGSTOP. + ASSERT_NO_ERRNO(WaitStopped(child)); + + struct user_regs_struct regs; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, ®s), SyscallSucceeds()); + // RIP is just beyond the final syscall instruction. + EXPECT_EQ(regs.rip, elf.header.e_entry + sizeof(kPtraceCode)); + + EXPECT_THAT(child, ContainsMappings(std::vector({ + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0, + file.path().c_str()}, + }))); +} + +// StandardElf without data completes execve, but faults once running. +TEST(ElfTest, MissingText) { + ElfBinary<64> elf = StandardElf(); + elf.data.clear(); + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + // It runs off the end of the zeroes filling the end of the page. + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) << status; +} + +// Typical ELF with a data + bss segment +TEST(ElfTest, DataSegment) { + ElfBinary<64> elf = StandardElf(); + + // Create a standard ELF, but extend to 1.5 pages. The second page will be the + // beginning of a multi-page data + bss segment. + elf.data.resize(kPageSize + kPageSize / 2); + + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + phdr.p_flags = PF_R | PF_W; + phdr.p_offset = kPageSize; + phdr.p_vaddr = 0x41000; + phdr.p_filesz = kPageSize / 2; + // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a + // bit less than 2 pages so this mapping doesn't extend beyond 0x43000. + phdr.p_memsz = 2 * kPageSize - kPageSize / 2; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + EXPECT_THAT( + child, ContainsMappings(std::vector({ + // text page. + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0, + file.path().c_str()}, + // data + bss page from file. + {0x41000, 0x42000, true, true, false, true, kPageSize, 0, 0, 0, + file.path().c_str()}, + // bss page from anon. + {0x42000, 0x43000, true, true, false, true, 0, 0, 0, 0, ""}, + }))); +} + +// Linux will allow PT_LOAD segments to overlap. +TEST(ElfTest, DirectlyOverlappingSegments) { + // NOTE: see PIEOutOfOrderSegments. + SKIP_IF(IsRunningOnGvisor()); + + ElfBinary<64> elf = StandardElf(); + + // Same as the StandardElf mapping. + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + // Add PF_W so we can differentiate this mapping from the first. 
+ phdr.p_flags = PF_R | PF_W | PF_X; + phdr.p_offset = 0; + phdr.p_vaddr = 0x40000; + phdr.p_filesz = sizeof(kPtraceCode); + phdr.p_memsz = phdr.p_filesz; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + EXPECT_THAT(child, ContainsMappings(std::vector({ + {0x40000, 0x41000, true, true, true, true, 0, 0, 0, 0, + file.path().c_str()}, + }))); +} + +// Linux allows out-of-order PT_LOAD segments. +TEST(ElfTest, OutOfOrderSegments) { + // NOTE: see PIEOutOfOrderSegments. + SKIP_IF(IsRunningOnGvisor()); + + ElfBinary<64> elf = StandardElf(); + + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + phdr.p_flags = PF_R | PF_X; + phdr.p_offset = 0; + phdr.p_vaddr = 0x20000; + phdr.p_filesz = sizeof(kPtraceCode); + phdr.p_memsz = phdr.p_filesz; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + EXPECT_THAT(child, ContainsMappings(std::vector({ + {0x20000, 0x21000, true, false, true, true, 0, 0, 0, 0, + file.path().c_str()}, + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0, + file.path().c_str()}, + }))); +} + +// header.e_phoff is bound the end of the file. +TEST(ElfTest, OutOfBoundsPhdrs) { + ElfBinary<64> elf = StandardElf(); + elf.header.e_phoff = 0x100000; + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + // On Linux 3.11, this caused EIO. On newer Linux, it causes ENOEXEC. + EXPECT_THAT(execve_errno, AnyOf(Eq(ENOEXEC), Eq(EIO))); +} + +// Claim there is a phdr beyond the end of the file, but don't include it. +TEST(ElfTest, MissingPhdr) { + ElfBinary<64> elf = StandardElf(); + + // Clear data so the file ends immediately after the phdrs. + // N.B. Per ElfTest.MissingData, StandardElf without data completes execve + // without error. + elf.data.clear(); + elf.UpdateOffsets(); + + // Claim that there is another phdr just beyond the end of the file. Of + // course, it isn't accessible. + elf.header.e_phnum++; + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + // On Linux 3.11, this caused EIO. On newer Linux, it causes ENOEXEC. + EXPECT_THAT(execve_errno, AnyOf(Eq(ENOEXEC), Eq(EIO))); +} + +// No headers at all, just the ELF magic. 
+TEST(ElfTest, MissingHeader) { + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0755)); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + const char kElfMagic[] = {0x7f, 'E', 'L', 'F'}; + + ASSERT_THAT(WriteFd(fd.get(), &kElfMagic, sizeof(kElfMagic)), + SyscallSucceeds()); + fd.reset(); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + EXPECT_EQ(execve_errno, ENOEXEC); +} + +// Load a PIE ELF with a data + bss segment. +TEST(ElfTest, PIE) { + ElfBinary<64> elf = StandardElf(); + + elf.header.e_type = ET_DYN; + + // Create a standard ELF, but extend to 1.5 pages. The second page will be the + // beginning of a multi-page data + bss segment. + elf.data.resize(kPageSize + kPageSize / 2); + + elf.header.e_entry = 0x0; + + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + phdr.p_flags = PF_R | PF_W; + phdr.p_offset = kPageSize; + // Put the data segment at a bit of an offset. + phdr.p_vaddr = 0x20000; + phdr.p_filesz = kPageSize / 2; + // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a + // bit less than 2 pages so this mapping doesn't extend beyond 0x43000. + phdr.p_memsz = 2 * kPageSize - kPageSize / 2; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + // The first segment really needs to start at 0 for a normal PIE binary, and + // thus includes the headers. + const uint64_t offset = elf.phdrs[1].p_offset; + elf.phdrs[1].p_offset = 0x0; + elf.phdrs[1].p_vaddr = 0x0; + elf.phdrs[1].p_filesz += offset; + elf.phdrs[1].p_memsz += offset; + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + // RIP tells us which page the first segment was loaded into. + struct user_regs_struct regs; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, ®s), SyscallSucceeds()); + + const uint64_t load_addr = regs.rip & ~(kPageSize - 1); + + EXPECT_THAT(child, ContainsMappings(std::vector({ + // text page. + {load_addr, load_addr + 0x1000, true, false, true, + true, 0, 0, 0, 0, file.path().c_str()}, + // data + bss page from file. + {load_addr + 0x20000, load_addr + 0x21000, true, true, + false, true, kPageSize, 0, 0, 0, file.path().c_str()}, + // bss page from anon. + {load_addr + 0x21000, load_addr + 0x22000, true, true, + false, true, 0, 0, 0, 0, ""}, + }))); +} + +// PIE binary with a non-zero start address. +// +// This is non-standard for a PIE binary, but valid. The binary is still loaded +// at an arbitrary address, not the first PT_LOAD vaddr. +// +// N.B. Linux changed this behavior in d1fd836dcf00d2028c700c7e44d2c23404062c90. +// Previously, with "randomization" enabled, PIE binaries with a non-zero start +// address would be be loaded at the address they specified because mmap was +// passed the load address, which wasn't 0 as expected. +// +// This change is present in kernel v4.1+. +TEST(ElfTest, PIENonZeroStart) { + // gVisor has the newer behavior. + if (!IsRunningOnGvisor()) { + auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); + SKIP_IF(version.major < 4 || (version.major == 4 && version.minor < 1)); + } + + ElfBinary<64> elf = StandardElf(); + + elf.header.e_type = ET_DYN; + + // Create a standard ELF, but extend to 1.5 pages. 
The second page will be the + // beginning of a multi-page data + bss segment. + elf.data.resize(kPageSize + kPageSize / 2); + + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + phdr.p_flags = PF_R | PF_W; + phdr.p_offset = kPageSize; + // Put the data segment at a bit of an offset. + phdr.p_vaddr = 0x60000; + phdr.p_filesz = kPageSize / 2; + // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a + // bit less than 2 pages so this mapping doesn't extend beyond 0x43000. + phdr.p_memsz = 2 * kPageSize - kPageSize / 2; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + // RIP tells us which page the first segment was loaded into. + struct user_regs_struct regs; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, ®s), SyscallSucceeds()); + + const uint64_t load_addr = regs.rip & ~(kPageSize - 1); + + // The ELF is loaded at an arbitrary address, not the first PT_LOAD vaddr. + // + // N.B. this is technically flaky, but Linux is *extremely* unlikely to pick + // this as the start address, as it searches from the top down. + EXPECT_NE(load_addr, 0x40000); + + EXPECT_THAT(child, ContainsMappings(std::vector({ + // text page. + {load_addr, load_addr + 0x1000, true, false, true, + true, 0, 0, 0, 0, file.path().c_str()}, + // data + bss page from file. + {load_addr + 0x20000, load_addr + 0x21000, true, true, + false, true, kPageSize, 0, 0, 0, file.path().c_str()}, + // bss page from anon. + {load_addr + 0x21000, load_addr + 0x22000, true, true, + false, true, 0, 0, 0, 0, ""}, + }))); +} + +TEST(ElfTest, PIEOutOfOrderSegments) { + // TODO: This triggers a bug in Linux where it computes the size + // of the binary as 0x20000 - 0x40000 = 0xfffffffffffe0000, which obviously + // fails to map. + // + // We test gVisor's behavior (of rejecting the binary) because I assert that + // Linux is wrong and needs to be fixed. + SKIP_IF(!IsRunningOnGvisor()); + + ElfBinary<64> elf = StandardElf(); + + elf.header.e_type = ET_DYN; + + // Create a standard ELF, but extend to 1.5 pages. The second page will be the + // beginning of a multi-page data + bss segment. + elf.data.resize(kPageSize + kPageSize / 2); + + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + phdr.p_flags = PF_R | PF_W; + phdr.p_offset = kPageSize; + // Put the data segment *before* the first segment. + phdr.p_vaddr = 0x20000; + phdr.p_filesz = kPageSize / 2; + // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a + // bit less than 2 pages so this mapping doesn't extend beyond 0x43000. + phdr.p_memsz = 2 * kPageSize - kPageSize / 2; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + EXPECT_EQ(execve_errno, ENOEXEC); +} + +// Standard dynamically linked binary with an ELF interpreter. 
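+//
+// With a PT_INTERP segment present, the kernel maps the named interpreter
+// (itself an ELF) in addition to the main binary and transfers control to the
+// interpreter's entry point; here both images are synthetic ELFs built from
+// StandardElf().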
+TEST(ElfTest, ELFInterpreter) { + ElfBinary<64> interpreter = StandardElf(); + interpreter.header.e_type = ET_DYN; + interpreter.header.e_entry = 0x0; + interpreter.UpdateOffsets(); + + // The first segment really needs to start at 0 for a normal PIE binary, and + // thus includes the headers. + uint64_t const offset = interpreter.phdrs[1].p_offset; + interpreter.phdrs[1].p_offset = 0x0; + interpreter.phdrs[1].p_vaddr = 0x0; + interpreter.phdrs[1].p_filesz += offset; + interpreter.phdrs[1].p_memsz += offset; + + TempPath interpreter_file = + ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter)); + + ElfBinary<64> binary = StandardElf(); + + // Append the interpreter path. + int const interp_data_start = binary.data.size(); + for (char const c : interpreter_file.path()) { + binary.data.push_back(c); + } + // NUL-terminate. + binary.data.push_back(0); + int const interp_data_size = binary.data.size() - interp_data_start; + + decltype(binary)::ElfPhdr phdr = {}; + phdr.p_type = PT_INTERP; + phdr.p_offset = interp_data_start; + phdr.p_filesz = interp_data_size; + phdr.p_memsz = interp_data_size; + // "If [PT_INTERP] is present, it must precede any loadable segment entry." + // + // However, Linux allows it anywhere, so we just stick it at the end to make + // sure out-of-order PT_INTERP is OK. + binary.phdrs.push_back(phdr); + + binary.UpdateOffsets(); + + TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec( + binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + // RIP tells us which page the first segment of the interpreter was loaded + // into. + struct user_regs_struct regs; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, ®s), SyscallSucceeds()); + + const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1); + + EXPECT_THAT(child, + ContainsMappings(std::vector({ + // Main binary + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0, + binary_file.path().c_str()}, + // Interpreter + {interp_load_addr, interp_load_addr + 0x1000, true, false, + true, true, 0, 0, 0, 0, interpreter_file.path().c_str()}, + }))); +} + +// Test parameter to ElfInterpterStaticTest cases. The first item is a suffix to +// add to the end of the interpreter path in the PT_INTERP segment and the +// second is the expected execve(2) errno. +using ElfInterpreterStaticParam = std::tuple, int>; + +class ElfInterpreterStaticTest + : public ::testing::TestWithParam {}; + +// Statically linked ELF with a statically linked ELF interpreter. +TEST_P(ElfInterpreterStaticTest, Test) { + const std::vector segment_suffix = std::get<0>(GetParam()); + const int expected_errno = std::get<1>(GetParam()); + + ElfBinary<64> interpreter = StandardElf(); + interpreter.UpdateOffsets(); + TempPath interpreter_file = + ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter)); + + ElfBinary<64> binary = StandardElf(); + // The PT_LOAD segment conflicts with the interpreter's PT_LOAD segment. The + // interpreter's will be mapped directly over the binary's. + + // Interpreter path plus the parameterized suffix in the PT_INTERP segment. 
+ const std::string path = interpreter_file.path(); + std::vector segment(path.begin(), path.end()); + segment.insert(segment.end(), segment_suffix.begin(), segment_suffix.end()); + binary.AddInterpreter(segment); + + binary.UpdateOffsets(); + + TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec( + binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, expected_errno); + + if (expected_errno == 0) { + ASSERT_NO_ERRNO(WaitStopped(child)); + + EXPECT_THAT(child, ContainsMappings(std::vector({ + // Interpreter. + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, + 0, interpreter_file.path().c_str()}, + }))); + } +} + +INSTANTIATE_TEST_CASE_P( + Cases, ElfInterpreterStaticTest, + ::testing::ValuesIn({ + // Simple NUL-terminator to run the interpreter as normal. + std::make_tuple(std::vector({'\0'}), 0), + // Add some garbage to the segment followed by a NUL-terminator. This is + // ignored. + std::make_tuple(std::vector({'\0', 'b', '\0'}), 0), + // Add some garbage to the segment without a NUL-terminator. Linux will + // reject + // this. + std::make_tuple(std::vector({'\0', 'b'}), ENOEXEC), + })); + +// Test parameter to ElfInterpterBadPathTest cases. The first item is the +// contents of the PT_INTERP segment and the second is the expected execve(2) +// errno. +using ElfInterpreterBadPathParam = std::tuple, int>; + +class ElfInterpreterBadPathTest + : public ::testing::TestWithParam {}; + +TEST_P(ElfInterpreterBadPathTest, Test) { + const std::vector segment = std::get<0>(GetParam()); + const int expected_errno = std::get<1>(GetParam()); + + ElfBinary<64> binary = StandardElf(); + binary.AddInterpreter(segment); + binary.UpdateOffsets(); + + TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary)); + + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec( + binary_file.path(), {binary_file.path()}, {}, nullptr, &execve_errno)); + EXPECT_EQ(execve_errno, expected_errno); +} + +INSTANTIATE_TEST_CASE_P( + Cases, ElfInterpreterBadPathTest, + ::testing::ValuesIn({ + // NUL-terminated fake path in the PT_INTERP segment. + std::make_tuple(std::vector({'/', 'f', '/', 'b', '\0'}), ENOENT), + // ELF interpreter not NUL-terminated. + std::make_tuple(std::vector({'/', 'f', '/', 'b'}), ENOEXEC), + // ELF interpreter path omitted entirely. + // + // fs/binfmt_elf.c:load_elf_binary returns ENOEXEC if p_filesz is < 2 + // bytes. + std::make_tuple(std::vector({'\0'}), ENOEXEC), + // ELF interpreter path = "\0". + // + // fs/binfmt_elf.c:load_elf_binary returns ENOEXEC if p_filesz is < 2 + // bytes, so add an extra byte to pass that check. + // + // load_elf_binary -> open_exec -> do_open_execat fails to check that + // name != '\0' before calling do_filp_open, which thus opens the + // working directory. do_open_execat returns EACCES because the + // directory is not a regular file. + std::make_tuple(std::vector({'\0', '\0'}), EACCES), + })); + +// Relative path to ELF interpreter. +TEST(ElfTest, ELFInterpreterRelative) { + ElfBinary<64> interpreter = StandardElf(); + interpreter.header.e_type = ET_DYN; + interpreter.header.e_entry = 0x0; + interpreter.UpdateOffsets(); + + // The first segment really needs to start at 0 for a normal PIE binary, and + // thus includes the headers. 
+ uint64_t const offset = interpreter.phdrs[1].p_offset; + interpreter.phdrs[1].p_offset = 0x0; + interpreter.phdrs[1].p_vaddr = 0x0; + interpreter.phdrs[1].p_filesz += offset; + interpreter.phdrs[1].p_memsz += offset; + + TempPath interpreter_file = + ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter)); + auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD()); + auto interpreter_relative = + ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, interpreter_file.path())); + + ElfBinary<64> binary = StandardElf(); + + // NUL-terminated path in the PT_INTERP segment. + std::vector segment(interpreter_relative.begin(), + interpreter_relative.end()); + segment.push_back(0); + binary.AddInterpreter(segment); + + binary.UpdateOffsets(); + + TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec( + binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + // RIP tells us which page the first segment of the interpreter was loaded + // into. + struct user_regs_struct regs; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, ®s), SyscallSucceeds()); + + const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1); + + EXPECT_THAT(child, + ContainsMappings(std::vector({ + // Main binary + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0, + binary_file.path().c_str()}, + // Interpreter + {interp_load_addr, interp_load_addr + 0x1000, true, false, + true, true, 0, 0, 0, 0, interpreter_file.path().c_str()}, + }))); +} + +// ELF interpreter architecture doesn't match the binary. +TEST(ElfTest, ELFInterpreterWrongArch) { + ElfBinary<64> interpreter = StandardElf(); + interpreter.header.e_machine = EM_PPC64; + interpreter.header.e_type = ET_DYN; + interpreter.header.e_entry = 0x0; + interpreter.UpdateOffsets(); + + // The first segment really needs to start at 0 for a normal PIE binary, and + // thus includes the headers. + uint64_t const offset = interpreter.phdrs[1].p_offset; + interpreter.phdrs[1].p_offset = 0x0; + interpreter.phdrs[1].p_vaddr = 0x0; + interpreter.phdrs[1].p_filesz += offset; + interpreter.phdrs[1].p_memsz += offset; + + TempPath interpreter_file = + ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter)); + + ElfBinary<64> binary = StandardElf(); + + // NUL-terminated path in the PT_INTERP segment. + const std::string path = interpreter_file.path(); + std::vector segment(path.begin(), path.end()); + segment.push_back(0); + binary.AddInterpreter(segment); + + binary.UpdateOffsets(); + + TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec( + binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, ELIBBAD); +} + +// No execute permissions on the binary. +TEST(ElfTest, NoExecute) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + ASSERT_THAT(chmod(file.path().c_str(), 0644), SyscallSucceeds()); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + EXPECT_EQ(execve_errno, EACCES); +} + +// Execute, but no read permissions on the binary works just fine. +TEST(ElfTest, NoRead) { + // TODO: gVisor's backing filesystem may prevent the sentry from + // reading the executable. 
+ SKIP_IF(IsRunningOnGvisor()); + + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + ASSERT_THAT(chmod(file.path().c_str(), 0111), SyscallSucceeds()); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + // TODO: A task with a non-readable executable is marked + // non-dumpable, preventing access to proc files. gVisor does not implement + // this behavior. +} + +// No execute permissions on the ELF interpreter. +TEST(ElfTest, ElfInterpreterNoExecute) { + ElfBinary<64> interpreter = StandardElf(); + interpreter.header.e_type = ET_DYN; + interpreter.header.e_entry = 0x0; + interpreter.UpdateOffsets(); + + // The first segment really needs to start at 0 for a normal PIE binary, and + // thus includes the headers. + uint64_t const offset = interpreter.phdrs[1].p_offset; + interpreter.phdrs[1].p_offset = 0x0; + interpreter.phdrs[1].p_vaddr = 0x0; + interpreter.phdrs[1].p_filesz += offset; + interpreter.phdrs[1].p_memsz += offset; + + TempPath interpreter_file = + ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter)); + + ElfBinary<64> binary = StandardElf(); + + // NUL-terminated path in the PT_INTERP segment. + const std::string path = interpreter_file.path(); + std::vector segment(path.begin(), path.end()); + segment.push_back(0); + binary.AddInterpreter(segment); + + binary.UpdateOffsets(); + + TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary)); + + ASSERT_THAT(chmod(interpreter_file.path().c_str(), 0644), SyscallSucceeds()); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(interpreter_file.path(), {interpreter_file.path()}, {}, + &child, &execve_errno)); + EXPECT_EQ(execve_errno, EACCES); +} + +// Execute a basic interpreter script. +TEST(InterpreterScriptTest, Execute) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + // Use /tmp explicitly to ensure the path is short enough. + TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0755)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + EXPECT_NO_ERRNO(WaitStopped(child)); +} + +// Whitespace after #!. +TEST(InterpreterScriptTest, Whitespace) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + // Use /tmp explicitly to ensure the path is short enough. + TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#! \t \t", binary.path()), 0755)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + EXPECT_NO_ERRNO(WaitStopped(child)); +} + +// Interpreter script is missing execute permission. +TEST(InterpreterScriptTest, InterpreterScriptNoExecute) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + // Use /tmp explicitly to ensure the path is short enough. 
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0644)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, EACCES); +} + +// Binary interpreter script refers to is missing execute permission. +TEST(InterpreterScriptTest, BinaryNoExecute) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + // Use /tmp explicitly to ensure the path is short enough. + TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + ASSERT_THAT(chmod(binary.path().c_str(), 0644), SyscallSucceeds()); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0755)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, EACCES); +} + +// Linux will load interpreter scripts five levels deep, but no more. +TEST(InterpreterScriptTest, MaxRecursion) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + // Use /tmp explicitly to ensure the path is short enough. + TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + TempPath script1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + "/tmp", absl::StrCat("#!", binary.path()), 0755)); + TempPath script2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + "/tmp", absl::StrCat("#!", script1.path()), 0755)); + TempPath script3 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + "/tmp", absl::StrCat("#!", script2.path()), 0755)); + TempPath script4 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + "/tmp", absl::StrCat("#!", script3.path()), 0755)); + TempPath script5 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + "/tmp", absl::StrCat("#!", script4.path()), 0755)); + TempPath script6 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + "/tmp", absl::StrCat("#!", script5.path()), 0755)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script6.path(), {script6.path()}, {}, &child, &execve_errno)); + // Too many levels of recursion. + EXPECT_EQ(execve_errno, ELOOP); + + // The next level up is OK. + auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script5.path(), {script5.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + EXPECT_NO_ERRNO(WaitStopped(child)); +} + +// Interpreter script with a relative path. +TEST(InterpreterScriptTest, RelativePath) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD()); + auto binary_relative = + ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, binary.path())); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary_relative), 0755)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + EXPECT_NO_ERRNO(WaitStopped(child)); +} + +// Interpreter script with .. in a path component. 
+TEST(InterpreterScriptTest, UncleanPath) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + // Use /tmp explicitly to ensure the path is short enough. + TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!/tmp/../", binary.path()), + 0755)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + EXPECT_NO_ERRNO(WaitStopped(child)); +} + +// Passed interpreter script is a symlink. +TEST(InterpreterScriptTest, Symlink) { + ElfBinary<64> elf = StandardElf(); + elf.UpdateOffsets(); + // Use /tmp explicitly to ensure the path is short enough. + TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf)); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0755)); + + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), script.path())); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(link.path(), {link.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + EXPECT_NO_ERRNO(WaitStopped(child)); +} + +// Interpreter script points to a symlink loop. +TEST(InterpreterScriptTest, SymlinkLoop) { + std::string const link1 = NewTempAbsPathInDir("/tmp"); + std::string const link2 = NewTempAbsPathInDir("/tmp"); + + ASSERT_THAT(symlink(link2.c_str(), link1.c_str()), SyscallSucceeds()); + auto remove_link1 = Cleanup( + [&link1] { EXPECT_THAT(unlink(link1.c_str()), SyscallSucceeds()); }); + + ASSERT_THAT(symlink(link1.c_str(), link2.c_str()), SyscallSucceeds()); + auto remove_link2 = Cleanup( + [&link2] { EXPECT_THAT(unlink(link2.c_str()), SyscallSucceeds()); }); + + TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::StrCat("#!", link1), 0755)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno)); + EXPECT_EQ(execve_errno, ELOOP); +} + +// Binary is a symlink loop. +TEST(ExecveTest, SymlinkLoop) { + std::string const link1 = NewTempAbsPathInDir("/tmp"); + std::string const link2 = NewTempAbsPathInDir("/tmp"); + + ASSERT_THAT(symlink(link2.c_str(), link1.c_str()), SyscallSucceeds()); + auto remove_link = Cleanup( + [&link1] { EXPECT_THAT(unlink(link1.c_str()), SyscallSucceeds()); }); + + ASSERT_THAT(symlink(link1.c_str(), link2.c_str()), SyscallSucceeds()); + auto remove_link2 = Cleanup( + [&link2] { EXPECT_THAT(unlink(link2.c_str()), SyscallSucceeds()); }); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(link1, {link1}, {}, &child, &execve_errno)); + EXPECT_EQ(execve_errno, ELOOP); +} + +// Binary is a directory. +TEST(ExecveTest, Directory) { + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec("/tmp", {"/tmp"}, {}, &child, &execve_errno)); + EXPECT_EQ(execve_errno, EACCES); +} + +// Pass a valid binary as a directory (extra / on the end). 
+TEST(ExecveTest, BinaryAsDirectory) {
+  ElfBinary<64> elf = StandardElf();
+  elf.UpdateOffsets();
+  TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+  std::string const path = absl::StrCat(file.path(), "/");
+
+  pid_t child;
+  int execve_errno;
+  auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+      ForkAndExec(path, {path}, {}, &child, &execve_errno));
+  EXPECT_EQ(execve_errno, ENOTDIR);
+}
+
+// The initial brk value is after the page at the end of the binary.
+TEST(ExecveTest, BrkAfterBinary) {
+  ElfBinary<64> elf = StandardElf();
+  elf.UpdateOffsets();
+
+  TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+  pid_t child;
+  int execve_errno;
+  auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+      ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+  ASSERT_EQ(execve_errno, 0);
+
+  // Ensure it made it to SIGSTOP.
+  ASSERT_NO_ERRNO(WaitStopped(child));
+
+  struct user_regs_struct regs;
+  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+
+  // RIP is just beyond the final syscall instruction. Rewind to execute a brk
+  // syscall.
+  regs.rip -= kSyscallSize;
+  regs.rax = __NR_brk;
+  regs.rdi = 0;
+  ASSERT_THAT(ptrace(PTRACE_SETREGS, child, 0, &regs), SyscallSucceeds());
+
+  // Resume the child, waiting for syscall entry.
+  ASSERT_THAT(ptrace(PTRACE_SYSCALL, child, 0, 0), SyscallSucceeds());
+  int status;
+  ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+              SyscallSucceedsWithValue(child));
+  ASSERT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+      << "status = " << status;
+
+  // Execute the syscall.
+  ASSERT_THAT(ptrace(PTRACE_SYSCALL, child, 0, 0), SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+              SyscallSucceedsWithValue(child));
+  ASSERT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+      << "status = " << status;
+
+  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+
+  // brk is after the text page.
+  //
+  // The kernel does brk randomization, so we can't be sure what the exact
+  // address will be, but it is always beyond the final page in the binary.
+  // i.e., it does not start immediately after memsz in the middle of a page.
+  // Userspace may expect to use that space.
+  EXPECT_GE(regs.rax, 0x41000);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc
new file mode 100644
index 000000000..b9a4ac749
--- /dev/null
+++ b/test/syscalls/linux/exec_proc_exe_workload.cc
@@ -0,0 +1,35 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
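As context for the BrkAfterBinary expectation above, here is a minimal standalone sketch, not part of this patch (the file name and build setup are assumed), of how a freshly exec'd program can observe its initial program break; with kernel brk randomization the exact value varies, but it lands at or beyond the binary's final page rather than immediately after p_memsz.

// brk_probe.cc - hypothetical standalone helper, not part of this change.
#include <unistd.h>

#include <cstdio>

int main() {
  // sbrk(0) reports the current program break without moving it.
  void* initial_brk = sbrk(0);
  // The exact address is randomized, but it is always at or beyond the end
  // of the last page mapped from the binary.
  std::printf("initial brk: %p\n", initial_brk);
  return 0;
}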
+ +#include +#include + +#include + +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" + +int main(int argc, char** argv, char** envp) { + std::string exe = gvisor::testing::ProcessExePath(getpid()).ValueOrDie(); + if (exe[0] != '/') { + std::cerr << "relative path: " << exe << std::endl; + exit(1); + } + if (exe.find(argv[1]) != std::string::npos) { + std::cerr << "matching path: " << exe << std::endl; + exit(1); + } + + return 0; +} diff --git a/test/syscalls/linux/exec_state_workload.cc b/test/syscalls/linux/exec_state_workload.cc new file mode 100644 index 000000000..b66e22565 --- /dev/null +++ b/test/syscalls/linux/exec_state_workload.cc @@ -0,0 +1,202 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Pretty-print a sigset_t. +std::ostream& operator<<(std::ostream& out, const sigset_t& s) { + out << "{ "; + + for (int i = 0; i < NSIG; i++) { + if (sigismember(&s, i)) { + out << i << " "; + } + } + + out << "}"; + return out; +} + +// Verify that the signo handler is handler. +int CheckSigHandler(uint32_t signo, uintptr_t handler) { + struct sigaction sa; + int ret = sigaction(signo, nullptr, &sa); + if (ret < 0) { + perror("sigaction"); + return 1; + } + + if (reinterpret_cast(handler) != sa.sa_handler) { + std::cerr << "signo " << signo << " handler got: " << sa.sa_handler + << " expected: " << std::hex << handler; + return 1; + } + return 0; +} + +// Verify that the signo is blocked. +int CheckSigBlocked(uint32_t signo) { + sigset_t s; + int ret = sigprocmask(SIG_SETMASK, nullptr, &s); + if (ret < 0) { + perror("sigprocmask"); + return 1; + } + + if (!sigismember(&s, signo)) { + std::cerr << "signal " << signo << " not blocked in signal mask: " << s + << std::endl; + return 1; + } + return 0; +} + +// Verify that the itimer is enabled. +int CheckItimerEnabled(uint32_t timer) { + struct itimerval itv; + int ret = getitimer(timer, &itv); + if (ret < 0) { + perror("getitimer"); + return 1; + } + + if (!itv.it_value.tv_sec && !itv.it_value.tv_usec && + !itv.it_interval.tv_sec && !itv.it_interval.tv_usec) { + std::cerr << "timer " << timer + << " not enabled. 
value sec: " << itv.it_value.tv_sec + << " usec: " << itv.it_value.tv_usec + << " interval sec: " << itv.it_interval.tv_sec + << " usec: " << itv.it_interval.tv_usec << std::endl; + return 1; + } + return 0; +} + +int PrintExecFn() { + unsigned long execfn = getauxval(AT_EXECFN); + if (!execfn) { + std::cerr << "AT_EXECFN missing" << std::endl; + return 1; + } + + std::cerr << reinterpret_cast(execfn) << std::endl; + return 0; +} + +int PrintExecName() { + const size_t name_length = 20; + char name[name_length] = {0}; + if (prctl(PR_GET_NAME, name) < 0) { + std::cerr << "prctl(PR_GET_NAME) failed" << std::endl; + return 1; + } + + std::cerr << name << std::endl; + return 0; +} + +void usage(const std::string& prog) { + std::cerr << "usage:\n" + << "\t" << prog << " CheckSigHandler \n" + << "\t" << prog << " CheckSigBlocked \n" + << "\t" << prog << " CheckTimerDisabled \n" + << "\t" << prog << " PrintExecFn\n" + << "\t" << prog << " PrintExecName" << std::endl; +} + +int main(int argc, char** argv) { + if (argc < 2) { + usage(argv[0]); + return 1; + } + + std::string func(argv[1]); + + if (func == "CheckSigHandler") { + if (argc != 4) { + usage(argv[0]); + return 1; + } + + char* end; + uint32_t signo = strtoul(argv[2], &end, 10); + if (end == argv[2]) { + std::cerr << "invalid signo: " << argv[2] << std::endl; + return 1; + } + + uintptr_t handler = strtoull(argv[3], &end, 16); + if (end == argv[3]) { + std::cerr << "invalid handler: " << std::hex << argv[3] << std::endl; + return 1; + } + + return CheckSigHandler(signo, handler); + } + + if (func == "CheckSigBlocked") { + if (argc != 3) { + usage(argv[0]); + return 1; + } + + char* end; + uint32_t signo = strtoul(argv[2], &end, 10); + if (end == argv[2]) { + std::cerr << "invalid signo: " << argv[2] << std::endl; + return 1; + } + + return CheckSigBlocked(signo); + } + + if (func == "CheckItimerEnabled") { + if (argc != 3) { + usage(argv[0]); + return 1; + } + + char* end; + uint32_t timer = strtoul(argv[2], &end, 10); + if (end == argv[2]) { + std::cerr << "invalid signo: " << argv[2] << std::endl; + return 1; + } + + return CheckItimerEnabled(timer); + } + + if (func == "PrintExecFn") { + // N.B. This will be called as an interpreter script, with the script passed + // as the third argument. We don't care about that script. + return PrintExecFn(); + } + + if (func == "PrintExecName") { + // N.B. This may be called as an interpreter script like PrintExecFn. + return PrintExecName(); + } + + std::cerr << "Invalid function: " << func << std::endl; + return 1; +} diff --git a/test/syscalls/linux/exit.cc b/test/syscalls/linux/exit.cc new file mode 100644 index 000000000..7246a7b3b --- /dev/null +++ b/test/syscalls/linux/exit.cc @@ -0,0 +1,77 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
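The exec tests drive the workload above through execve; the following is a hedged sketch of how a caller might use its CheckSigBlocked mode to confirm that the signal mask survives execve. The binary path and the choice of SIGUSR1 (signal 10 on x86-64 Linux) are assumptions for illustration only.

#include <signal.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cstdio>

int main() {
  // Block SIGUSR1; the signal mask is inherited across fork and preserved
  // across execve.
  sigset_t mask;
  sigemptyset(&mask);
  sigaddset(&mask, SIGUSR1);
  sigprocmask(SIG_BLOCK, &mask, nullptr);

  pid_t pid = fork();
  if (pid == 0) {
    // The workload exits 0 iff signal 10 is blocked in the new image.
    execl("./exec_state_workload", "exec_state_workload", "CheckSigBlocked",
          "10", static_cast<char*>(nullptr));
    _exit(127);  // Only reached if execl fails.
  }
  int status = 0;
  waitpid(pid, &status, 0);
  std::printf("workload exited with %d\n", WEXITSTATUS(status));
  return 0;
}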
+ +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +void TestExit(int code) { + pid_t pid = fork(); + if (pid == 0) { + _exit(code); + } + + ASSERT_THAT(pid, SyscallSucceeds()); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == code) << status; +} + +TEST(ExitTest, Success) { TestExit(0); } + +TEST(ExitTest, Failure) { TestExit(1); } + +// This test ensures that a process's file descriptors are closed when it calls +// exit(). In order to test this, the parent tries to read from a pipe whose +// write end is held by the child. While the read is blocking, the child exits, +// which should cause the parent to read 0 bytes due to EOF. +TEST(ExitTest, CloseFds) { + int pipe_fds[2]; + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + + FileDescriptor read_fd(pipe_fds[0]); + FileDescriptor write_fd(pipe_fds[1]); + + pid_t pid = fork(); + if (pid == 0) { + read_fd.reset(); + + SleepSafe(absl::Seconds(10)); + + _exit(0); + } + + EXPECT_THAT(pid, SyscallSucceeds()); + + write_fd.reset(); + + char buf[10]; + EXPECT_THAT(ReadFd(read_fd.get(), buf, sizeof(buf)), + SyscallSucceedsWithValue(0)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/exit_script.sh b/test/syscalls/linux/exit_script.sh new file mode 100755 index 000000000..f014fcf99 --- /dev/null +++ b/test/syscalls/linux/exit_script.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [ $# -ne 1 ]; then + echo "Usage: $0 exit_code" + exit 255 +fi + +exit $1 diff --git a/test/syscalls/linux/fadvise64.cc b/test/syscalls/linux/fadvise64.cc new file mode 100644 index 000000000..041e8b7b6 --- /dev/null +++ b/test/syscalls/linux/fadvise64.cc @@ -0,0 +1,72 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
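The CloseFds test above relies on a standard property of process exit: the kernel closes every descriptor the exiting process held, so when the last writer of a pipe disappears, readers see EOF. A minimal sketch of that mechanism outside the test framework, for reference:

#include <sys/wait.h>
#include <unistd.h>

#include <cstdio>

int main() {
  int fds[2];
  if (pipe(fds) != 0) return 1;

  pid_t pid = fork();
  if (pid == 0) {
    close(fds[0]);  // Child keeps only the write end...
    _exit(0);       // ...and exiting closes it implicitly.
  }

  close(fds[1]);  // Parent keeps only the read end.
  char buf[8];
  // Returns 0 (EOF) once the child's implicit close of the write end happens.
  ssize_t n = read(fds[0], buf, sizeof(buf));
  std::printf("read returned %zd (0 == EOF)\n", n);
  waitpid(pid, nullptr, 0);
  return 0;
}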
+ +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +TEST(FAdvise64Test, Basic) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + // fadvise64 is noop in gVisor, so just test that it succeeds. + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_NORMAL), + SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_RANDOM), + SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_SEQUENTIAL), + SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_WILLNEED), + SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_DONTNEED), + SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_NOREUSE), + SyscallSucceeds()); +} + +TEST(FAdvise64Test, InvalidArgs) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + // Note: offset is allowed to be negative. + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, static_cast(-1), + POSIX_FADV_NORMAL), + SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, 12345), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(FAdvise64Test, NoPipes) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor read(fds[0]); + const FileDescriptor write(fds[1]); + + ASSERT_THAT(syscall(__NR_fadvise64, read.get(), 0, 10, POSIX_FADV_NORMAL), + SyscallFailsWithErrno(ESPIPE)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc new file mode 100644 index 000000000..53aedd4e4 --- /dev/null +++ b/test/syscalls/linux/fallocate.cc @@ -0,0 +1,57 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// These tests are very rudimentary because fallocate is not +// implemented. We just want to make sure the expected error codes are +// returned. + +TEST(FallocateTest, NotImplemented) { + auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); + + // Test that a completely unassigned fallocate mode returns EOPNOTSUPP. 
+ ASSERT_THAT(fallocate(fd.get(), 0x80, 0, 32768), + SyscallFailsWithErrno(EOPNOTSUPP)); +} + +TEST(FallocateTest, BadOffset) { + auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); + ASSERT_THAT(fallocate(fd.get(), 0, -1, 32768), SyscallFailsWithErrno(EINVAL)); +} + +TEST(FallocateTest, BadLength) { + auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); + ASSERT_THAT(fallocate(fd.get(), 0, 0, -1), SyscallFailsWithErrno(EINVAL)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fault.cc b/test/syscalls/linux/fault.cc new file mode 100644 index 000000000..cfa7d0d1f --- /dev/null +++ b/test/syscalls/linux/fault.cc @@ -0,0 +1,71 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define _GNU_SOURCE 1 +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +__attribute__((noinline)) void Fault(void) { + volatile int* foo = nullptr; + *foo = 0; +} + +int GetPcFromUcontext(ucontext_t* uc, uintptr_t* pc) { +#if defined(__x86_64__) + *pc = uc->uc_mcontext.gregs[REG_RIP]; + return 1; +#elif defined(__i386__) + *pc = uc->uc_mcontext.gregs[REG_EIP]; + return 1; +#else + return 0; +#endif +} + +void sigact_handler(int sig, siginfo_t* siginfo, void* context) { + uintptr_t pc; + if (GetPcFromUcontext(reinterpret_cast(context), &pc)) { + /* Expect Fault() to be at most 64 bytes in size. */ + uintptr_t fault_addr = reinterpret_cast(&Fault); + EXPECT_GE(pc, fault_addr); + EXPECT_LT(pc, fault_addr + 64); + exit(0); + } +} + +TEST(FaultTest, InRange) { + // Reset the signal handler to do nothing so that it doesn't freak out + // the test runner when we fire an alarm. + struct sigaction sa = {}; + sa.sa_sigaction = sigact_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + ASSERT_THAT(sigaction(SIGSEGV, &sa, nullptr), SyscallSucceeds()); + + Fault(); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fchdir.cc b/test/syscalls/linux/fchdir.cc new file mode 100644 index 000000000..2b13e36c3 --- /dev/null +++ b/test/syscalls/linux/fchdir.cc @@ -0,0 +1,77 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
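The fault test above recovers the faulting program counter from the ucontext passed to a SA_SIGINFO handler. A standalone sketch of the same technique (x86-64 only; printing from a signal handler is not async-signal-safe, which is acceptable for a throwaway illustration but not for production code):

// sigsegv_pc.cc - hypothetical sketch, not part of this change.
#define _GNU_SOURCE 1
#include <signal.h>
#include <ucontext.h>

#include <cstdio>
#include <cstdlib>

static void handler(int sig, siginfo_t* info, void* ctx) {
  auto* uc = static_cast<ucontext_t*>(ctx);
  // REG_RIP gives the faulting instruction; si_addr gives the data address.
  std::printf("fault at pc=0x%llx addr=%p\n",
              static_cast<unsigned long long>(uc->uc_mcontext.gregs[REG_RIP]),
              info->si_addr);
  std::exit(0);  // Don't return; the faulting instruction would re-fault.
}

int main() {
  struct sigaction sa = {};
  sa.sa_sigaction = handler;
  sa.sa_flags = SA_SIGINFO;
  sigfillset(&sa.sa_mask);
  sigaction(SIGSEGV, &sa, nullptr);

  volatile int* p = nullptr;
  *p = 0;  // Trigger SIGSEGV.
  return 1;
}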
+ +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(FchdirTest, Success) { + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + int fd; + ASSERT_THAT(fd = open(temp_dir.path().c_str(), O_DIRECTORY | O_RDONLY), + SyscallSucceeds()); + + EXPECT_THAT(fchdir(fd), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + // Change CWD to a permanent location as temp dirs will be cleaned up. + EXPECT_THAT(chdir("/"), SyscallSucceeds()); +} + +TEST(FchdirTest, InvalidFD) { + EXPECT_THAT(fchdir(-1), SyscallFailsWithErrno(EBADF)); +} + +TEST(FchdirTest, PermissionDenied) { + // Drop capabilities that allow us to override directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0666 /* mode */)); + + int fd; + ASSERT_THAT(fd = open(temp_dir.path().c_str(), O_DIRECTORY | O_RDONLY), + SyscallSucceeds()); + + EXPECT_THAT(fchdir(fd), SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(FchdirTest, NotDir) { + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + int fd; + ASSERT_THAT(fd = open(temp_file.path().c_str(), O_CREAT | O_RDONLY, 0777), + SyscallSucceeds()); + + EXPECT_THAT(fchdir(fd), SyscallFailsWithErrno(ENOTDIR)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc new file mode 100644 index 000000000..355334bfa --- /dev/null +++ b/test/syscalls/linux/fcntl.cc @@ -0,0 +1,978 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
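The FcntlLockTest cases below exercise POSIX advisory record locks. As a primer on the API they build on, a minimal sketch assuming a scratch path: F_SETLK is the non-blocking variant (failing with EAGAIN or EACCES on conflict), while F_SETLKW blocks until the requested region is free.

#include <fcntl.h>
#include <unistd.h>

#include <cstdio>

int main() {
  int fd = open("/tmp/lockfile", O_RDWR | O_CREAT, 0666);  // Path assumed.
  if (fd < 0) return 1;

  struct flock fl = {};
  fl.l_type = F_WRLCK;    // Exclusive (write) lock...
  fl.l_whence = SEEK_SET;
  fl.l_start = 0;
  fl.l_len = 0;           // ...over the whole file (0 means "to EOF, however it grows").

  if (fcntl(fd, F_SETLK, &fl) == -1) {
    std::perror("non-blocking lock");  // EAGAIN if another process holds it.
  }

  fl.l_type = F_UNLCK;       // Explicit unlock; locks are also dropped when the
  fcntl(fd, F_SETLK, &fl);   // process closes any descriptor for this file.
  close(fd);
  return 0;
}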
+ +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/base/macros.h" +#include "absl/base/port.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/cleanup.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/timer_util.h" + +DEFINE_string(child_setlock_on, "", + "Contains the path to try to set a file lock on."); +DEFINE_bool(child_setlock_write, false, + "Whether to set a writable lock (otherwise readable)"); +DEFINE_bool(blocking, false, + "Whether to set a blocking lock (otherwise non-blocking)."); +DEFINE_bool(retry_eintr, false, "Whether to retry in the subprocess on EINTR."); +DEFINE_uint64(child_setlock_start, 0, "The value of struct flock start"); +DEFINE_uint64(child_setlock_len, 0, "The value of struct flock len"); +DEFINE_int32(socket_fd, -1, + "A socket to use for communicating more state back " + "to the parent."); + +namespace gvisor { +namespace testing { + +// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0 +// because "it isn't needed", even though Linux can return it via F_GETFL. +constexpr int kOLargeFile = 00100000; + +class FcntlLockTest : public ::testing::Test { + public: + void SetUp() override { + // Let's make a socket pair. + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, fds_), SyscallSucceeds()); + } + + void TearDown() override { + EXPECT_THAT(close(fds_[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds_[1]), SyscallSucceeds()); + } + + int64_t GetSubprocessFcntlTimeInUsec() { + int64_t ret = 0; + EXPECT_THAT(ReadFd(fds_[0], reinterpret_cast(&ret), sizeof(ret)), + SyscallSucceedsWithValue(sizeof(ret))); + return ret; + } + + // The first fd will remain with the process creating the subprocess + // and the second will go to the subprocess. + int fds_[2] = {}; +}; + +namespace { + +PosixErrorOr SubprocessLock(std::string const& path, bool for_write, + bool blocking, bool retry_eintr, int fd, + off_t start, off_t length, pid_t* child) { + std::vector args = { + "/proc/self/exe", "--child_setlock_on", path, + "--child_setlock_start", absl::StrCat(start), "--child_setlock_len", + absl::StrCat(length), "--socket_fd", absl::StrCat(fd)}; + + if (for_write) { + args.push_back("--child_setlock_write"); + } + + if (blocking) { + args.push_back("--blocking"); + } + + if (retry_eintr) { + args.push_back("--retry_eintr"); + } + + int execve_errno = 0; + ASSIGN_OR_RETURN_ERRNO( + auto cleanup, + ForkAndExec("/proc/self/exe", ExecveArray(args.begin(), args.end()), {}, + nullptr, child, &execve_errno)); + + if (execve_errno != 0) { + return PosixError(execve_errno, "execve"); + } + + return std::move(cleanup); +} + +PosixErrorOr Eventfd(int count, int flags) { + int efd = eventfd(count, flags); + if (efd < 0) { + return PosixError(errno, "Eventfd"); + } + return FileDescriptor(efd); +} + +TEST(FcntlTest, SetCloExec) { + // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag not set. + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Eventfd(0, 0)); + ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(0)); + + // Set the FD_CLOEXEC flag. 
+ ASSERT_THAT(fcntl(fd.get(), F_SETFD, FD_CLOEXEC), SyscallSucceeds()); + ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC)); +} + +TEST(FcntlTest, ClearCloExec) { + // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag set. + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Eventfd(0, EFD_CLOEXEC)); + ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC)); + + // Clear the FD_CLOEXEC flag. + ASSERT_THAT(fcntl(fd.get(), F_SETFD, 0), SyscallSucceeds()); + ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(0)); +} + +TEST(FcntlTest, IndependentDescriptorFlags) { + // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag not set. + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Eventfd(0, 0)); + ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(0)); + + // Duplicate the descriptor. Ensure that it also doesn't have FD_CLOEXEC. + FileDescriptor newfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup()); + ASSERT_THAT(fcntl(newfd.get(), F_GETFD), SyscallSucceedsWithValue(0)); + + // Set FD_CLOEXEC on the first FD. + ASSERT_THAT(fcntl(fd.get(), F_SETFD, FD_CLOEXEC), SyscallSucceeds()); + ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC)); + + // Ensure that the second FD is unaffected by the change on the first. + ASSERT_THAT(fcntl(newfd.get(), F_GETFD), SyscallSucceedsWithValue(0)); +} + +// All file description flags passed to open appear in F_GETFL. +TEST(FcntlTest, GetAllFlags) { + TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + int flags = O_RDWR | O_DIRECT | O_SYNC | O_NONBLOCK | O_APPEND; + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), flags)); + + // Linux forces O_LARGEFILE on all 64-bit kernels and gVisor's is 64-bit. + int expected = flags | kOLargeFile; + + int rflags; + EXPECT_THAT(rflags = fcntl(fd.get(), F_GETFL), SyscallSucceeds()); + EXPECT_EQ(rflags, expected); +} + +TEST(FcntlTest, SetFlags) { + TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), 0)); + + int const flags = O_RDWR | O_DIRECT | O_SYNC | O_NONBLOCK | O_APPEND; + EXPECT_THAT(fcntl(fd.get(), F_SETFL, flags), SyscallSucceeds()); + + // Can't set O_RDWR or O_SYNC. + // Linux forces O_LARGEFILE on all 64-bit kernels and gVisor's is 64-bit. + int expected = O_DIRECT | O_NONBLOCK | O_APPEND | kOLargeFile; + + int rflags; + EXPECT_THAT(rflags = fcntl(fd.get(), F_GETFL), SyscallSucceeds()); + EXPECT_EQ(rflags, expected); +} + +TEST_F(FcntlLockTest, SetLockBadFd) { + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // len 0 has a special meaning: lock all bytes despite how + // large the file grows. + fl.l_len = 0; + EXPECT_THAT(fcntl(-1, F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(FcntlLockTest, SetLockPipe) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd, but doesn't matter, we expect this to fail. 
+ fl.l_len = 0; + EXPECT_THAT(fcntl(fds[0], F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(FcntlLockTest, SetLockDir) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); +} + +TEST_F(FcntlLockTest, SetLockBadOpenFlagsWrite) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY, 0666)); + + struct flock fl0; + fl0.l_type = F_WRLCK; + fl0.l_whence = SEEK_SET; + fl0.l_start = 0; + // Same as SetLockBadFd. + fl0.l_len = 0; + + // Expect that setting a write lock using a read only file descriptor + // won't work. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl0), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(FcntlLockTest, SetLockBadOpenFlagsRead) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY, 0666)); + + struct flock fl1; + fl1.l_type = F_RDLCK; + fl1.l_whence = SEEK_SET; + fl1.l_start = 0; + // Same as SetLockBadFd. + fl1.l_len = 0; + + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl1), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(FcntlLockTest, SetLockUnlockOnNothing) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_UNLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); +} + +TEST_F(FcntlLockTest, SetWriteLockSingleProc) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd0 = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + EXPECT_THAT(fcntl(fd0.get(), F_SETLK, &fl), SyscallSucceeds()); + // Expect to be able to take the same lock on the same fd no problem. + EXPECT_THAT(fcntl(fd0.get(), F_SETLK, &fl), SyscallSucceeds()); + + FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + // Expect to be able to take the same lock from a different fd but for + // the same process. + EXPECT_THAT(fcntl(fd1.get(), F_SETLK, &fl), SyscallSucceeds()); +} + +TEST_F(FcntlLockTest, SetReadLockMultiProc) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // spawn a child process to take a read lock on the same file. 
+ pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), false /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetReadThenWriteLockMultiProc) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Assert that another process trying to lock on the same file will fail + // with EAGAIN. It's important that we keep the fd above open so that + // that the other process will contend with the lock. + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), true /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN) + << "Exited with code: " << status; + + // Close the fd: we want to test that another process can acquire the + // lock after this point. + fd.reset(); + // Assert that another process can now acquire the lock. + + child_pid = 0; + auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), true /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetWriteThenReadLockMultiProc) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + // Same as SetReadThenWriteLockMultiProc. + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + // Same as SetReadThenWriteLockMultiProc. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Same as SetReadThenWriteLockMultiProc. + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), false /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN) + << "Exited with code: " << status; + + // Same as SetReadThenWriteLockMultiProc. + fd.reset(); // Close the fd. + + // Same as SetReadThenWriteLockMultiProc. 
+ child_pid = 0; + auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), false /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetWriteLockMultiProc) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + // Same as SetReadThenWriteLockMultiProc. + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + // Same as SetReadWriteLockMultiProc. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Same as SetReadWriteLockMultiProc. + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), true /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN) + << "Exited with code: " << status; + + fd.reset(); // Close the FD. + // Same as SetReadWriteLockMultiProc. + child_pid = 0; + auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), true /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetLockIsRegional) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 4096; + + // Same as SetReadWriteLockMultiProc. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Same as SetReadWriteLockMultiProc. + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), true /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_len, 0, &child_pid)); + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetLockUpgradeDowngrade) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + // Same as SetReadWriteLockMultiProc. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Upgrade to a write lock. This will prevent anyone else from taking + // the lock. 
+ fl.l_type = F_WRLCK; + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Same as SetReadWriteLockMultiProc., + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), false /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN) + << "Exited with code: " << status; + + // Downgrade back to a read lock. + fl.l_type = F_RDLCK; + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Do the same stint as before, but this time it should succeed. + child_pid = 0; + auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), false /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetLockDroppedOnClose) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + // While somewhat surprising, obtaining another fd to the same file and + // then closing it in this process drops *all* locks. + FileDescriptor other_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + // Same as SetReadThenWriteLockMultiProc. + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + // Same as SetReadWriteLockMultiProc. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + other_fd.reset(); // Close. + + // Expect to be able to get the lock, given that the close above dropped it. + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(file.path(), true /* write lock */, + false /* nonblocking */, false /* no eintr retry */, + -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid)); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetLockUnlock) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + // Setup two regional locks with different permissions. + struct flock fl0; + fl0.l_type = F_WRLCK; + fl0.l_whence = SEEK_SET; + fl0.l_start = 0; + fl0.l_len = 4096; + + struct flock fl1; + fl1.l_type = F_RDLCK; + fl1.l_whence = SEEK_SET; + fl1.l_start = 4096; + // Same as SetLockBadFd. + fl1.l_len = 0; + + // Set both region locks. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl0), SyscallSucceeds()); + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl1), SyscallSucceeds()); + + // Another process should fail to take a read lock on the entire file + // due to the regional write lock. 
+ pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock( + file.path(), false /* write lock */, false /* nonblocking */, + false /* no eintr retry */, -1 /* no socket fd */, 0, 0, &child_pid)); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN) + << "Exited with code: " << status; + + // Then only unlock the writable one. This should ensure that other + // processes can take any read lock that it wants. + fl0.l_type = F_UNLCK; + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl0), SyscallSucceeds()); + + // Another process should now succeed to get a read lock on the entire file. + child_pid = 0; + auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock( + file.path(), false /* write lock */, false /* nonblocking */, + false /* no eintr retry */, -1 /* no socket fd */, 0, 0, &child_pid)); + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST_F(FcntlLockTest, SetLockAcrossRename) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + // Setup two regional locks with different permissions. + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + // Same as SetLockBadFd. + fl.l_len = 0; + + // Set the region lock. + EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + // Rename the file to someplace nearby. + std::string const newpath = NewTempAbsPath(); + EXPECT_THAT(rename(file.path().c_str(), newpath.c_str()), SyscallSucceeds()); + + // Another process should fail to take a read lock on the renamed file + // since we still have an open handle to the inode. + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + SubprocessLock(newpath, false /* write lock */, false /* nonblocking */, + false /* no eintr retry */, -1 /* no socket fd */, + fl.l_start, fl.l_len, &child_pid)); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN) + << "Exited with code: " << status; +} + +// NOTE: The blocking tests below aren't perfect. It's hard to assert exactly +// what the kernel did while handling a syscall. These tests are timing based +// because there really isn't any other reasonable way to assert that correct +// blocking behavior happened. + +// This test will verify that blocking works as expected when another process +// holds a write lock when obtaining a write lock. This test will hold the lock +// for some amount of time and then wait for the second process to send over the +// socket_fd the amount of time it was blocked for before the lock succeeded. +TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 0; + + // Take the write lock. + ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds()); + + // Attempt to take the read lock in a sub process. This will immediately block + // so we will release our lock after some amount of time and then assert the + // amount of time the other process was blocked for. 
+  pid_t child_pid = 0;
+  auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+      file.path(), true /* write lock */, true /* Blocking Lock */,
+      true /* Retry on EINTR */, fds_[1] /* Socket fd for timing information */,
+      fl.l_start, fl.l_len, &child_pid));
+
+  // We will wait kHoldLockFor before we release our lock, allowing the
+  // subprocess to obtain it.
+  constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+
+  absl::SleepFor(kHoldLockFor);
+
+  // Unlock our write lock.
+  fl.l_type = F_UNLCK;
+  ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+  // Read the blocked time from the subprocess socket.
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+
+  // We must have been waiting at least kMinBlockTimeUsec.
+  EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
+
+  // The FCNTL write lock must always succeed as it will simply block until it
+  // can obtain the lock.
+  int status = 0;
+  ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+  EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+      << "Exited with code: " << status;
+}
+
+// This test will verify that blocking works as expected when another process
+// holds a read lock while this process obtains a write lock. This test holds
+// the lock for some amount of time and then waits for the second process to
+// report, over socket_fd, how long it was blocked before the lock succeeded.
+TEST_F(FcntlLockTest, SetReadLockThenBlockingWriteLock) {
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+  struct flock fl;
+  fl.l_type = F_RDLCK;
+  fl.l_whence = SEEK_SET;
+  fl.l_start = 0;
+  fl.l_len = 0;
+
+  // Take the read lock.
+  ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+  // Attempt to take a write lock in a subprocess. This will immediately block,
+  // so we release our lock after some amount of time and then assert how long
+  // the other process was blocked for.
+  pid_t child_pid = 0;
+  auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+      file.path(), true /* write lock */, true /* Blocking Lock */,
+      true /* Retry on EINTR */, fds_[1] /* Socket fd for timing information */,
+      fl.l_start, fl.l_len, &child_pid));
+
+  // We will wait kHoldLockFor before we release our lock, allowing the
+  // subprocess to obtain it.
+  constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
+
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+
+  absl::SleepFor(kHoldLockFor);
+
+  // Unlock our read lock.
+  fl.l_type = F_UNLCK;
+  ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+  // Read the blocked time from the subprocess socket.
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+
+  // We must have been waiting at least kMinBlockTimeUsec.
+  EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
+
+  // The FCNTL write lock must always succeed as it will simply block until it
+  // can obtain the lock.
+  int status = 0;
+  ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+  EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+      << "Exited with code: " << status;
+}
+
+// This test will verify that blocking works as expected when another process
+// holds a write lock when obtaining a read lock.
This test will hold the lock +// for some amount of time and then wait for the second process to send over the +// socket_fd the amount of time it was blocked for before the lock succeeded. +TEST_F(FcntlLockTest, SetWriteLockThenBlockingReadLock) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 0; + + // Take the write lock. + ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds()); + + // Attempt to take the read lock in a sub process. This will immediately block + // so we will release our lock after some amount of time and then assert the + // amount of time the other process was blocked for. + pid_t child_pid = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock( + file.path(), false /* read lock */, true /* Blocking Lock */, + true /* Retry on EINTR */, fds_[1] /* Socket fd for timing information */, + fl.l_start, fl.l_len, &child_pid)); + + // We will wait kHoldLockForSec before we release our lock allowing the + // subprocess to obtain it. + constexpr absl::Duration kHoldLockFor = absl::Seconds(5); + + const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1)); + + absl::SleepFor(kHoldLockFor); + + // Unlock our write lock. + fl.l_type = F_UNLCK; + ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds()); + + // Read the blocked time from the subprocess socket. + int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec(); + + // We must have been waiting at least kMinBlockTime. + EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec); + + // The FCNTL read lock must always succeed as it will simply block until it + // can obtain the lock. + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +// This test will verify that when one process only holds a read lock that +// another will not block while obtaining a read lock when F_SETLKW is used. +TEST_F(FcntlLockTest, SetReadLockThenBlockingReadLock) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 0; + + // Take the READ lock. + ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds()); + + // Attempt to take the read lock in a sub process. Since multiple processes + // can hold a read lock this should immediately return without blocking + // even though we used F_SETLKW in the subprocess. + pid_t child_pid = 0; + auto sp = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock( + file.path(), false /* read lock */, true /* Blocking Lock */, + true /* Retry on EINTR */, -1 /* No fd, should not block */, fl.l_start, + fl.l_len, &child_pid)); + + // We never release the lock and the subprocess should still obtain it without + // blocking for any period of time. 
+ int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +TEST(FcntlTest, GetO_ASYNC) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + int flag_fl = -1; + ASSERT_THAT(flag_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + EXPECT_EQ(flag_fl & O_ASYNC, 0); + + int flag_fd = -1; + ASSERT_THAT(flag_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds()); + EXPECT_EQ(flag_fd & O_ASYNC, 0); +} + +TEST(FcntlTest, SetFlO_ASYNC) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + int before_fl = -1; + ASSERT_THAT(before_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + + int before_fd = -1; + ASSERT_THAT(before_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds()); + + ASSERT_THAT(fcntl(s.get(), F_SETFL, before_fl | O_ASYNC), SyscallSucceeds()); + + int after_fl = -1; + ASSERT_THAT(after_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + EXPECT_EQ(after_fl, before_fl | O_ASYNC); + + int after_fd = -1; + ASSERT_THAT(after_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds()); + EXPECT_EQ(after_fd, before_fd); +} + +TEST(FcntlTest, SetFdO_ASYNC) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + int before_fl = -1; + ASSERT_THAT(before_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + + int before_fd = -1; + ASSERT_THAT(before_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds()); + + ASSERT_THAT(fcntl(s.get(), F_SETFD, before_fd | O_ASYNC), SyscallSucceeds()); + + int after_fl = -1; + ASSERT_THAT(after_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + EXPECT_EQ(after_fl, before_fl); + + int after_fd = -1; + ASSERT_THAT(after_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds()); + EXPECT_EQ(after_fd, before_fd); +} + +TEST(FcntlTest, DupAfterO_ASYNC) { + FileDescriptor s1 = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + int before = -1; + ASSERT_THAT(before = fcntl(s1.get(), F_GETFL), SyscallSucceeds()); + + ASSERT_THAT(fcntl(s1.get(), F_SETFL, before | O_ASYNC), SyscallSucceeds()); + + FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(s1.Dup()); + + int after = -1; + ASSERT_THAT(after = fcntl(fd2.get(), F_GETFL), SyscallSucceeds()); + EXPECT_EQ(after & O_ASYNC, O_ASYNC); +} + +TEST(FcntlTest, GetOwn) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + SyscallSucceedsWithValue(0)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); + + if (!FLAGS_child_setlock_on.empty()) { + int socket_fd = FLAGS_socket_fd; + int fd = open(FLAGS_child_setlock_on.c_str(), O_RDWR, 0666); + if (fd == -1 && errno != 0) { + int err = errno; + std::cerr << "CHILD open " << FLAGS_child_setlock_on << " failed " << err + << std::endl; + exit(err); + } + + struct flock fl; + if (FLAGS_child_setlock_write) { + fl.l_type = F_WRLCK; + } else { + fl.l_type = F_RDLCK; + } + fl.l_whence = SEEK_SET; + fl.l_start = FLAGS_child_setlock_start; + fl.l_len = FLAGS_child_setlock_len; + + // Test the fcntl, no need to log, the error is unambiguously + // from fcntl at this point. 
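+ // F_SETLKW can be interrupted by a signal and fail with EINTR, in which
+ // case the loop below reissues the call when FLAGS_retry_eintr is set;
+ // F_SETLK returns immediately instead of blocking.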
+ int err = 0; + int ret = 0; + + gvisor::testing::MonotonicTimer timer; + timer.Start(); + do { + ret = fcntl(fd, FLAGS_blocking ? F_SETLKW : F_SETLK, &fl); + } while (FLAGS_retry_eintr && ret == -1 && errno == EINTR); + auto usec = absl::ToInt64Microseconds(timer.Duration()); + + if (ret == -1 && errno != 0) { + err = errno; + } + + // If there is a socket fd let's send back the time in microseconds it took + // to execute this syscall. + if (socket_fd != -1) { + gvisor::testing::WriteFd(socket_fd, reinterpret_cast(&usec), + sizeof(usec)); + close(socket_fd); + } + + close(fd); + exit(err); + } + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h new file mode 100644 index 000000000..19c9a5053 --- /dev/null +++ b/test/syscalls/linux/file_base.h @@ -0,0 +1,206 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_FILE_BASE_H_ +#define GVISOR_TEST_SYSCALLS_FILE_BASE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +class FileTest : public ::testing::Test { + public: + void SetUp() override { + test_pipe_[0] = -1; + test_pipe_[1] = -1; + + test_file_name_ = NewTempAbsPath(); + test_file_fd_ = ASSERT_NO_ERRNO_AND_VALUE( + Open(test_file_name_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR)); + + // FIXME: enable when mknod syscall is supported. + // test_fifo_name_ = NewTempAbsPath(); + // ASSERT_THAT(mknod(test_fifo_name_.c_str()), S_IFIFO|0644, 0, + // SyscallSucceeds()); + // ASSERT_THAT(test_fifo_[1] = open(test_fifo_name_.c_str(), + // O_WRONLY), + // SyscallSucceeds()); + // ASSERT_THAT(test_fifo_[0] = open(test_fifo_name_.c_str(), + // O_RDONLY), + // SyscallSucceeds()); + + ASSERT_THAT(pipe(test_pipe_), SyscallSucceeds()); + ASSERT_THAT(fcntl(test_pipe_[0], F_SETFL, O_NONBLOCK), SyscallSucceeds()); + } + + // CloseFile will allow the test to manually close the file descriptor. + void CloseFile() { test_file_fd_.reset(); } + + // UnlinkFile will allow the test to manually unlink the file. + void UnlinkFile() { + if (!test_file_name_.empty()) { + EXPECT_THAT(unlink(test_file_name_.c_str()), SyscallSucceeds()); + test_file_name_.clear(); + } + } + + // ClosePipes will allow the test to manually close the pipes. 
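+ // Tests normally call CloseFile/UnlinkFile/ClosePipes only when they need
+ // to force cleanup early; TearDown below performs the same steps for
+ // anything still open.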
+ void ClosePipes() { + if (test_pipe_[0] > 0) { + EXPECT_THAT(close(test_pipe_[0]), SyscallSucceeds()); + } + + if (test_pipe_[1] > 0) { + EXPECT_THAT(close(test_pipe_[1]), SyscallSucceeds()); + } + + test_pipe_[0] = -1; + test_pipe_[1] = -1; + } + + void TearDown() override { + CloseFile(); + UnlinkFile(); + ClosePipes(); + + // FIXME: enable when mknod syscall is supported. + // close(test_fifo_[0]); + // close(test_fifo_[1]); + // unlink(test_fifo_name_.c_str()); + } + + std::string test_file_name_; + std::string test_fifo_name_; + FileDescriptor test_file_fd_; + + int test_fifo_[2]; + int test_pipe_[2]; +}; + +class SocketTest : public ::testing::Test { + public: + void SetUp() override { + test_unix_stream_socket_[0] = -1; + test_unix_stream_socket_[1] = -1; + test_unix_dgram_socket_[0] = -1; + test_unix_dgram_socket_[1] = -1; + test_unix_seqpacket_socket_[0] = -1; + test_unix_seqpacket_socket_[1] = -1; + test_tcp_socket_[0] = -1; + test_tcp_socket_[1] = -1; + + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_), + SyscallSucceeds()); + ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK), + SyscallSucceeds()); + ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_), + SyscallSucceeds()); + ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK), + SyscallSucceeds()); + ASSERT_THAT( + socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_), + SyscallSucceeds()); + ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK), + SyscallSucceeds()); + } + + void TearDown() override { + close(test_unix_stream_socket_[0]); + close(test_unix_stream_socket_[1]); + + close(test_unix_dgram_socket_[0]); + close(test_unix_dgram_socket_[1]); + + close(test_unix_seqpacket_socket_[0]); + close(test_unix_seqpacket_socket_[1]); + + close(test_tcp_socket_[0]); + close(test_tcp_socket_[1]); + } + + int test_unix_stream_socket_[2]; + int test_unix_dgram_socket_[2]; + int test_unix_seqpacket_socket_[2]; + int test_tcp_socket_[2]; +}; + +// MatchesStringLength checks that a tuple argument of (struct iovec *, int) +// corresponding to an iovec array and its length, contains data that matches +// the std::string length strlen. +MATCHER_P(MatchesStringLength, strlen, "") { + struct iovec* iovs = arg.first; + int niov = arg.second; + int offset = 0; + for (int i = 0; i < niov; i++) { + offset += iovs[i].iov_len; + } + if (offset != static_cast(strlen)) { + *result_listener << offset; + return false; + } + return true; +} + +// MatchesStringValue checks that a tuple argument of (struct iovec *, int) +// corresponding to an iovec array and its length, contains data that matches +// the std::string value str. 
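+// A hypothetical use, given that the matched value is destructured through
+// arg.first (the iovec array) and arg.second (its length):
+//
+//   EXPECT_THAT(std::make_pair(iovs, niov), MatchesStringValue("hello"));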
+MATCHER_P(MatchesStringValue, str, "") { + struct iovec* iovs = arg.first; + int len = strlen(str); + int niov = arg.second; + int offset = 0; + for (int i = 0; i < niov; i++) { + struct iovec iov = iovs[i]; + if (len < offset) { + *result_listener << "strlen " << len << " < offset " << offset; + return false; + } + if (strncmp(static_cast(iov.iov_base), &str[offset], iov.iov_len)) { + absl::string_view iovec_string(static_cast(iov.iov_base), + iov.iov_len); + *result_listener << iovec_string << " @offset " << offset; + return false; + } + offset += iov.iov_len; + } + return true; +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_FILE_BASE_H_ diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc new file mode 100644 index 000000000..fb93c8034 --- /dev/null +++ b/test/syscalls/linux/flock.cc @@ -0,0 +1,588 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class FlockTest : public FileTest {}; + +TEST_F(FlockTest, BadFD) { + // EBADF: fd is not an open file descriptor. + ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(FlockTest, InvalidOpCombinations) { + // The operation cannot be both exclusive and shared. + EXPECT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_SH | LOCK_NB), + SyscallFailsWithErrno(EINVAL)); + + // Locking and Unlocking doesn't make sense. + EXPECT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_UN | LOCK_NB), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_UN | LOCK_NB), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(FlockTest, NoOperationSpecified) { + // Not specifying an operation is invalid. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(FlockTestNoFixture, FlockSupportsPipes) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds()); + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(FlockTest, TestSimpleExLock) { + // Test that we can obtain an exclusive lock (no other holders) + // and that we can unlock it. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestSimpleShLock) { + // Test that we can obtain a shared lock (no other holders) + // and that we can unlock it. 
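+ // Compatibility reminder for flock(2) across different open file
+ // descriptions on the same file (LOCK_NB turns a would-block into an
+ // EWOULDBLOCK error):
+ //   holder has LOCK_SH -> another LOCK_SH is granted, LOCK_EX is denied.
+ //   holder has LOCK_EX -> both LOCK_SH and LOCK_EX are denied.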
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestLockableAnyMode) { + // flock(2): A shared or exclusive lock can be placed on a file + // regardless of the mode in which the file was opened. + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(test_file_name_, O_RDONLY)); // open read only to test + + // Mode shouldn't prevent us from taking an exclusive lock. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0)); + + // Unlock + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestUnlockWithNoHolders) { + // Test that unlocking when no one holds a lock succeeeds. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestRepeatedExLockingBySameHolder) { + // Test that repeated locking by the same holder for the + // same type of lock works correctly. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestRepeatedExLockingSingleUnlock) { + // Test that repeated locking by the same holder for the + // same type of lock works correctly and that a single unlock is required. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); + + // Should be unlocked at this point + ASSERT_THAT(flock(fd.get(), LOCK_NB | LOCK_EX), SyscallSucceedsWithValue(0)); + + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestRepeatedShLockingBySameHolder) { + // Test that repeated locking by the same holder for the + // same type of lock works correctly. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_SH), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_SH), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestSingleHolderUpgrade) { + // Test that a shared lock is upgradable when no one else holds a lock. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_SH), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestSingleHolderDowngrade) { + // Test single holder lock downgrade case. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestMultipleShared) { + // This is a simple test to verify that multiple independent shared + // locks will be granted. 
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // A shared lock should be granted as there only exists other shared locks. + ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), SyscallSucceedsWithValue(0)); + + // Unlock both. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +/* + * flock(2): If a process uses open(2) (or similar) to obtain more than one + * descriptor for the same file, these descriptors are treated + * independently by flock(). An attempt to lock the file using one of + * these file descriptors may be denied by a lock that the calling process + * has already placed via another descriptor. + */ +TEST_F(FlockTest, TestMultipleHolderSharedExclusive) { + // This test will verify that an exclusive lock will not be granted + // while a shared is held. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Verify We're unable to get an exlcusive lock via the second FD. + // because someone is holding a shared lock. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Unlock + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestSharedLockFailExclusiveHolder) { + // This test will verify that a shared lock is denied while + // someone holds an exclusive lock. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Verify we're unable to get an shared lock via the second FD. + // because someone is holding an exclusive lock. + ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Unlock + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestExclusiveLockFailExclusiveHolder) { + // This test will verify that an exclusive lock is denied while + // someone already holds an exclsuive lock. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Verify we're unable to get an exclusive lock via the second FD + // because someone is already holding an exclusive lock. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Unlock + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestMultipleHolderSharedExclusiveUpgrade) { + // This test will verify that we cannot obtain an exclusive lock while + // a shared lock is held by another descriptor, then verify that an upgrade + // is possible on a shared lock once all other shared locks have closed. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Verify we're unable to get an exclusive lock via the second FD because + // a shared lock is held. 
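+ // (On Linux EWOULDBLOCK is the same value as EAGAIN; flock(2) documents the
+ // non-blocking failure as EWOULDBLOCK.)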
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Verify that we can get a shared lock via the second descriptor instead + ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), SyscallSucceedsWithValue(0)); + + // Unlock the first and there will only be one shared lock remaining. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); + + // Upgrade 2nd fd. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0)); + + // Finally unlock the second + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestMultipleHolderSharedExclusiveDowngrade) { + // This test will verify that a shared lock is not obtainable while an + // exclusive lock is held but that once the first is downgraded that + // the second independent file descriptor can also get a shared lock. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Verify We're unable to get a shared lock via the second FD because + // an exclusive lock is held. + ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Verify that we can downgrade the first. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + + // Now verify that we can obtain a shared lock since the first was downgraded. + ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), SyscallSucceedsWithValue(0)); + + // Finally unlock both. + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +/* + * flock(2): Locks created by flock() are associated with an open file table + * entry. This means that duplicate file descriptors (created by, for example, + * fork(2) or dup(2)) refer to the same lock, and this lock may be modified or + * released using any of these descriptors. Furthermore, the lock is released + * either by an explicit LOCK_UN operation on any of these duplicate descriptors + * or when all such descriptors have been closed. + */ +TEST_F(FlockTest, TestDupFdUpgrade) { + // This test will verify that a shared lock is upgradeable via a dupped + // file descriptor, if the FD wasn't dupped this would fail. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup()); + + // Now we should be able to upgrade via the dupped fd. + ASSERT_THAT(flock(dup_fd.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + + // Validate unlock via dupped fd. + ASSERT_THAT(flock(dup_fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestDupFdDowngrade) { + // This test will verify that a exclusive lock is downgradable via a dupped + // file descriptor, if the FD wasn't dupped this would fail. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup()); + + // Now we should be able to downgrade via the dupped fd. 
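+ // (Unlike fcntl(2) record locks, which are owned by the process, flock(2)
+ // locks are owned by the open file description, so a dup shares the
+ // original descriptor's lock state.)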
+ ASSERT_THAT(flock(dup_fd.get(), LOCK_SH | LOCK_NB), + SyscallSucceedsWithValue(0)); + + // Validate unlock via dupped fd + ASSERT_THAT(flock(dup_fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestDupFdCloseRelease) { + // flock(2): Furthermore, the lock is released either by an explicit LOCK_UN + // operation on any of these duplicate descriptors, or when all such + // descriptors have been closed. + // + // This test will verify that a dupped fd closing will not release the + // underlying lock until all such dupped fds have closed. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + + FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup()); + + // At this point we have ONE exclusive locked referenced by two different fds. + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Validate that we cannot get a lock on a new unrelated FD. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Closing the dupped fd shouldn't affect the lock until all are closed. + dup_fd.reset(); // Closed the duped fd. + + // Validate that we still cannot get a lock on a new unrelated FD. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Closing the first fd + CloseFile(); // Will validate the syscall succeeds. + + // Now we should actually be able to get a lock since all fds related to + // the first lock are closed. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0)); + + // Unlock. + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestDupFdUnlockRelease) { + /* flock(2): Furthermore, the lock is released either by an explicit LOCK_UN + * operation on any of these duplicate descriptors, or when all such + * descriptors have been closed. + */ + // This test will verify that an explict unlock on a dupped FD will release + // the underlying lock unlike the previous case where close on a dup was + // not enough to release the lock. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB), + SyscallSucceedsWithValue(0)); + + const FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup()); + + // At this point we have ONE exclusive locked referenced by two different fds. + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Validate that we cannot get a lock on a new unrelated FD. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Explicitly unlock via the dupped descriptor. + ASSERT_THAT(flock(dup_fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); + + // Validate that we can now get the lock since we explicitly unlocked. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0)); + + // Unlock + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +TEST_F(FlockTest, TestDupFdFollowedByLock) { + // This test will verify that taking a lock on a file descriptor that has + // already been dupped means that the lock is shared between both. This is + // slightly different than than duping on an already locked FD. + FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup()); + + // Take a lock. + ASSERT_THAT(flock(dup_fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds()); + + // Now dup_fd and test_file_ should both reference the same lock. 
+ // We shouldn't be able to obtain a lock until both are closed. + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Closing the first fd + dup_fd.reset(); // Close the duped fd. + + // Validate that we cannot get a lock yet because the dupped descriptor. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Closing the second fd. + CloseFile(); // CloseFile() will validate the syscall succeeds. + + // Now we should be able to get the lock. + ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds()); + + // Unlock. + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0)); +} + +// NOTE: These blocking tests are not perfect. Unfortunantely it's very hard to +// determine if a thread was actually blocked in the kernel so we're forced +// to use timing. +TEST_F(FlockTest, BlockingLockNoBlockingForSharedLocks) { + // This test will verify that although LOCK_NB isn't specified + // two different fds can obtain shared locks without blocking. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH), SyscallSucceeds()); + + // kHoldLockTime is the amount of time we will hold the lock before releasing. + constexpr absl::Duration kHoldLockTime = absl::Seconds(30); + + const DisableSave ds; // Timing-related. + + // We do this in another thread so we can determine if it was actually + // blocked by timing the amount of time it took for the syscall to complete. + ScopedThread t([&] { + MonotonicTimer timer; + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // Only a single shared lock is held, the lock will be granted immediately. + // This should be granted without any blocking. Don't save here to avoid + // wild discrepencies on timing. + timer.Start(); + ASSERT_THAT(flock(fd.get(), LOCK_SH), SyscallSucceeds()); + + // We held the lock for 30 seconds but this thread should not have + // blocked at all so we expect a very small duration on syscall completion. + ASSERT_LT(timer.Duration(), + absl::Seconds(1)); // 1000ms is much less than 30s. + + // We can release our second shared lock + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds()); + }); + + // Sleep before unlocking. + absl::SleepFor(kHoldLockTime); + + // Release the first shared lock. Don't save in this situation to avoid + // discrepencies in timing. + EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds()); +} + +TEST_F(FlockTest, BlockingLockFirstSharedSecondExclusive) { + // This test will verify that if someone holds a shared lock any attempt to + // obtain an exclusive lock will result in blocking. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH), SyscallSucceeds()); + + // kHoldLockTime is the amount of time we will hold the lock before releasing. + constexpr absl::Duration kHoldLockTime = absl::Seconds(2); + + const DisableSave ds; // Timing-related. + + // We do this in another thread so we can determine if it was actually + // blocked by timing the amount of time it took for the syscall to complete. + ScopedThread t([&] { + MonotonicTimer timer; + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // This exclusive lock should block because someone is already holding a + // shared lock. We don't save here to avoid wild discrepencies on timing. + timer.Start(); + ASSERT_THAT(RetryEINTR(flock)(fd.get(), LOCK_EX), SyscallSucceeds()); + + // We should be blocked, we will expect to be blocked for more than 1.0s. 
+ ASSERT_GT(timer.Duration(), absl::Seconds(1)); + + // We can release our exclusive lock. + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds()); + }); + + // Sleep before unlocking. + absl::SleepFor(kHoldLockTime); + + // Release the shared lock allowing the thread to proceed. + // We don't save here to avoid wild discrepencies in timing. + EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds()); +} + +TEST_F(FlockTest, BlockingLockFirstExclusiveSecondShared) { + // This test will verify that if someone holds an exclusive lock any attempt + // to obtain a shared lock will result in blocking. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX), SyscallSucceeds()); + + // kHoldLockTime is the amount of time we will hold the lock before releasing. + constexpr absl::Duration kHoldLockTime = absl::Seconds(2); + + const DisableSave ds; // Timing-related. + + // We do this in another thread so we can determine if it was actually + // blocked by timing the amount of time it took for the syscall to complete. + ScopedThread t([&] { + MonotonicTimer timer; + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // This shared lock should block because someone is already holding an + // exclusive lock. We don't save here to avoid wild discrepencies on timing. + timer.Start(); + ASSERT_THAT(RetryEINTR(flock)(fd.get(), LOCK_SH), SyscallSucceeds()); + + // We should be blocked, we will expect to be blocked for more than 1.0s. + ASSERT_GT(timer.Duration(), absl::Seconds(1)); + + // We can release our shared lock. + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds()); + }); + + // Sleep before unlocking. + absl::SleepFor(kHoldLockTime); + + // Release the exclusive lock allowing the blocked thread to proceed. + // We don't save here to avoid wild discrepencies in timing. + EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds()); +} + +TEST_F(FlockTest, BlockingLockFirstExclusiveSecondExclusive) { + // This test will verify that if someone holds an exclusive lock any attempt + // to obtain another exclusive lock will result in blocking. + ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX), SyscallSucceeds()); + + // kHoldLockTime is the amount of time we will hold the lock before releasing. + constexpr absl::Duration kHoldLockTime = absl::Seconds(2); + + const DisableSave ds; // Timing-related. + + // We do this in another thread so we can determine if it was actually + // blocked by timing the amount of time it took for the syscall to complete. + ScopedThread t([&] { + MonotonicTimer timer; + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + // This exclusive lock should block because someone is already holding an + // exclusive lock. + timer.Start(); + ASSERT_THAT(RetryEINTR(flock)(fd.get(), LOCK_EX), SyscallSucceeds()); + + // We should be blocked, we will expect to be blocked for more than 1.0s. + ASSERT_GT(timer.Duration(), absl::Seconds(1)); + + // We can release our exclusive lock. + ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds()); + }); + + // Sleep before unlocking. + absl::SleepFor(kHoldLockTime); + + // Release the exclusive lock allowing the blocked thread to proceed. + // We don't save to avoid wild discrepencies in timing. 
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc new file mode 100644 index 000000000..1bff5e50f --- /dev/null +++ b/test/syscalls/linux/fork.cc @@ -0,0 +1,413 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/logging.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +using ::testing::Ge; + +class ForkTest : public ::testing::Test { + protected: + // SetUp creates a populated, open file. + void SetUp() override { + // Make a shared mapping. + shared_ = reinterpret_cast(mmap(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(reinterpret_cast(shared_), MAP_FAILED); + + // Make a private mapping. + private_ = + reinterpret_cast(mmap(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(reinterpret_cast(private_), MAP_FAILED); + + // Make a pipe. + ASSERT_THAT(pipe(pipes_), SyscallSucceeds()); + } + + // TearDown frees associated resources. + void TearDown() override { + EXPECT_THAT(munmap(shared_, kPageSize), SyscallSucceeds()); + EXPECT_THAT(munmap(private_, kPageSize), SyscallSucceeds()); + EXPECT_THAT(close(pipes_[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipes_[1]), SyscallSucceeds()); + } + + // Fork executes a clone system call. + pid_t Fork() { + pid_t pid = fork(); + MaybeSave(); + TEST_PCHECK_MSG(pid >= 0, "fork failed"); + return pid; + } + + // Wait waits for the given pid and returns the exit status. If the child was + // killed by a signal or an error occurs, then 256+signal is returned. + int Wait(pid_t pid) { + int status; + while (true) { + int rval = wait4(pid, &status, 0, NULL); + if (rval < 0) { + return rval; + } + if (rval != pid) { + continue; + } + if (WIFEXITED(status)) { + return WEXITSTATUS(status); + } + if (WIFSIGNALED(status)) { + return 256 + WTERMSIG(status); + } + } + } + + // Exit exits the proccess. + void Exit(int code) { + _exit(code); + + // Should never reach here. Since the exit above failed, we really don't + // have much in the way of options to indicate failure. So we just try to + // log an assertion failure to the logs. The parent process will likely + // fail anyways if exit is not working. + TEST_CHECK_MSG(false, "_exit returned"); + } + + // ReadByte reads a byte from the shared pipe. + char ReadByte() { + char val = -1; + TEST_PCHECK(ReadFd(pipes_[0], &val, 1) == 1); + MaybeSave(); + return val; + } + + // WriteByte writes a byte from the shared pipe. + void WriteByte(char val) { + TEST_PCHECK(WriteFd(pipes_[1], &val, 1) == 1); + MaybeSave(); + } + + // Shared pipe. 
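+ // pipes_[0] is the read end and pipes_[1] the write end; ReadByte/WriteByte
+ // above use them to synchronize the parent with the forked child.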
+ int pipes_[2]; + + // Shared mapping (one page). + char* shared_; + + // Private mapping (one page). + char* private_; +}; + +TEST_F(ForkTest, Simple) { + pid_t child = Fork(); + if (child == 0) { + Exit(0); + } + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +TEST_F(ForkTest, ExitCode) { + pid_t child = Fork(); + if (child == 0) { + Exit(123); + } + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(123)); + child = Fork(); + if (child == 0) { + Exit(1); + } + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(1)); +} + +TEST_F(ForkTest, Multi) { + pid_t child1 = Fork(); + if (child1 == 0) { + Exit(0); + } + pid_t child2 = Fork(); + if (child2 == 0) { + Exit(1); + } + EXPECT_THAT(Wait(child1), SyscallSucceedsWithValue(0)); + EXPECT_THAT(Wait(child2), SyscallSucceedsWithValue(1)); +} + +TEST_F(ForkTest, Pipe) { + pid_t child = Fork(); + if (child == 0) { + WriteByte(1); + Exit(0); + } + EXPECT_EQ(ReadByte(), 1); + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +TEST_F(ForkTest, SharedMapping) { + pid_t child = Fork(); + if (child == 0) { + // Wait for the parent. + ReadByte(); + if (shared_[0] == 1) { + Exit(0); + } + // Failed. + Exit(1); + } + // Change the mapping. + ASSERT_EQ(shared_[0], 0); + shared_[0] = 1; + // Unblock the child. + WriteByte(0); + // Did it work? + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +TEST_F(ForkTest, PrivateMapping) { + pid_t child = Fork(); + if (child == 0) { + // Wait for the parent. + ReadByte(); + if (private_[0] == 0) { + Exit(0); + } + // Failed. + Exit(1); + } + // Change the mapping. + ASSERT_EQ(private_[0], 0); + private_[0] = 1; + // Unblock the child. + WriteByte(0); + // Did it work? + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +// Test that cpuid works after a fork. +TEST_F(ForkTest, Cpuid) { + pid_t child = Fork(); + + // We should be able to determine the CPU vendor. + ASSERT_NE(GetCPUVendor(), CPUVendor::kUnknownVendor); + + if (child == 0) { + Exit(0); + } + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +TEST_F(ForkTest, Mmap) { + pid_t child = Fork(); + + if (child == 0) { + void* addr = + mmap(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MaybeSave(); + Exit(addr == MAP_FAILED); + } + + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +static volatile int alarmed = 0; + +void AlarmHandler(int sig, siginfo_t* info, void* context) { alarmed = 1; } + +TEST_F(ForkTest, Alarm) { + // Setup an alarm handler. + struct sigaction sa; + sa.sa_sigaction = AlarmHandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + EXPECT_THAT(sigaction(SIGALRM, &sa, nullptr), SyscallSucceeds()); + + pid_t child = Fork(); + + if (child == 0) { + alarm(1); + sleep(3); + if (!alarmed) { + Exit(1); + } + Exit(0); + } + + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); + EXPECT_EQ(0, alarmed); +} + +// Child cannot affect parent private memory. +TEST_F(ForkTest, PrivateMemory) { + std::atomic local(0); + + pid_t child1 = Fork(); + if (child1 == 0) { + local++; + + pid_t child2 = Fork(); + if (child2 == 0) { + local++; + + TEST_CHECK(local.load() == 2); + + Exit(0); + } + + TEST_PCHECK(Wait(child2) == 0); + TEST_CHECK(local.load() == 1); + Exit(0); + } + + EXPECT_THAT(Wait(child1), SyscallSucceedsWithValue(0)); + EXPECT_EQ(0, local.load()); +} + +// Kernel-accessed buffers should remain coherent across COW. 
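+// That is, if the parent is blocked in read(2) into a private mapping when
+// fork(2) marks that mapping copy-on-write, the data the kernel writes must
+// still be visible through the parent's copy of the page after the break.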
+TEST_F(ForkTest, COWSegment) { + constexpr int kBufSize = 1024; + char* read_buf = private_; + char* touch = private_ + kPageSize / 2; + + std::string contents(kBufSize, 'a'); + + ScopedThread t([&] { + // Wait to be sure the parent is blocked in read. + absl::SleepFor(absl::Seconds(3)); + + // Fork to mark private pages for COW. + // + // Use fork directly rather than the Fork wrapper to skip the multi-threaded + // check, and limit the child to async-signal-safe functions: + // + // "After a fork() in a multithreaded program, the child can safely call + // only async-signal-safe functions (see signal(7)) until such time as it + // calls execve(2)." + // + // Skip ASSERT in the child, as it isn't async-signal-safe. + pid_t child = fork(); + if (child == 0) { + // Wait to be sure parent touched memory. + sleep(3); + Exit(0); + } + + // Check success only in the parent. + ASSERT_THAT(child, SyscallSucceedsWithValue(Ge(0))); + + // Trigger COW on private page. + *touch = 42; + + // Write to pipe. Parent should still be able to read this. + EXPECT_THAT(WriteFd(pipes_[1], contents.c_str(), kBufSize), + SyscallSucceedsWithValue(kBufSize)); + + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); + }); + + EXPECT_THAT(ReadFd(pipes_[0], read_buf, kBufSize), + SyscallSucceedsWithValue(kBufSize)); + EXPECT_STREQ(contents.c_str(), read_buf); +} + +TEST_F(ForkTest, SigAltStack) { + std::vector stack_mem(SIGSTKSZ); + stack_t stack = {}; + stack.ss_size = SIGSTKSZ; + stack.ss_sp = stack_mem.data(); + ASSERT_THAT(sigaltstack(&stack, nullptr), SyscallSucceeds()); + + pid_t child = Fork(); + + if (child == 0) { + stack_t oss = {}; + TEST_PCHECK(sigaltstack(nullptr, &oss) == 0); + MaybeSave(); + + TEST_CHECK((oss.ss_flags & SS_DISABLE) == 0); + TEST_CHECK(oss.ss_size == SIGSTKSZ); + TEST_CHECK(oss.ss_sp == stack.ss_sp); + + Exit(0); + } + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +TEST_F(ForkTest, Affinity) { + // Make a non-default cpumask. + cpu_set_t parent_mask; + EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &parent_mask), + SyscallSucceeds()); + // Knock out the lowest bit. + for (unsigned int n = 0; n < CPU_SETSIZE; n++) { + if (CPU_ISSET(n, &parent_mask)) { + CPU_CLR(n, &parent_mask); + break; + } + } + EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &parent_mask), + SyscallSucceeds()); + + pid_t child = Fork(); + if (child == 0) { + cpu_set_t child_mask; + + int ret = sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &child_mask); + MaybeSave(); + if (ret < 0) { + Exit(-ret); + } + + TEST_CHECK(CPU_EQUAL(&child_mask, &parent_mask)); + + Exit(0); + } + + EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); +} + +#ifdef __x86_64__ +// Clone with CLONE_SETTLS and a non-canonical TLS address is rejected. +TEST(CloneTest, NonCanonicalTLS) { + constexpr uintptr_t kNonCanonical = 1ull << 48; + + // We need a valid address for the stack pointer. We'll never actually execute + // on this. 
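+ // On x86_64 the raw clone syscall takes (flags, stack, parent_tid,
+ // child_tid, tls), which is the argument order used below; the
+ // non-canonical tls value should be rejected without creating a child.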
+ char stack; + + EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr, + nullptr, kNonCanonical), + SyscallFailsWithErrno(EPERM)); +} +#endif + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc new file mode 100644 index 000000000..e8f1dfa8a --- /dev/null +++ b/test/syscalls/linux/fpsig_fork.cc @@ -0,0 +1,105 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This test verifies that fork(2) in a signal handler will correctly +// restore floating point state after the signal handler returns in both +// the child and parent. +#include + +#include "gtest/gtest.h" +#include "test/util/logging.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +#define GET_XMM(__var, __xmm) \ + asm volatile("movq %%" #__xmm ", %0" : "=r"(__var)) +#define SET_XMM(__var, __xmm) asm volatile("movq %0, %%" #__xmm : : "r"(__var)) + +int parent, child; + +void sigusr1(int s, siginfo_t* siginfo, void* _uc) { + // Fork and clobber %xmm0. The fpstate should be restored by sigreturn(2) + // in both parent and child. + child = fork(); + TEST_CHECK_MSG(child >= 0, "fork failed"); + + uint64_t val = SIGUSR1; + SET_XMM(val, xmm0); +} + +TEST(FPSigTest, Fork) { + parent = getpid(); + pid_t parent_tid = gettid(); + + struct sigaction sa = {}; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigusr1; + ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds()); + + // The amd64 ABI specifies that the XMM register set is caller-saved. This + // implies that if there is any function call between SET_XMM and GET_XMM the + // compiler might save/restore xmm0 implicitly. This defeats the entire + // purpose of the test which is to verify that fpstate is restored by + // sigreturn(2). + // + // This is the reason why 'tgkill(getpid(), gettid(), SIGUSR1)' is implemented + // in inline assembly below. + // + // If the OS is broken and registers are clobbered by the child, using tgkill + // to signal the current thread increases the likelihood that this thread will + // be the one clobbered. + + uint64_t expected = 0xdeadbeeffacefeed; + SET_XMM(expected, xmm0); + + asm volatile( + "movl %[killnr], %%eax;" + "movl %[parent], %%edi;" + "movl %[tid], %%esi;" + "movl %[sig], %%edx;" + "syscall;" + : + : [killnr] "i"(__NR_tgkill), [parent] "rm"(parent), + [tid] "rm"(parent_tid), [sig] "i"(SIGUSR1) + : "rax", "rdi", "rsi", "rdx", + // Clobbered by syscall. + "rcx", "r11"); + + uint64_t got; + GET_XMM(got, xmm0); + + if (getpid() == parent) { // Parent. + int status; + ASSERT_THAT(waitpid(child, &status, 0), SyscallSucceedsWithValue(child)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); + } + + // TEST_CHECK_MSG since this may run in the child. + TEST_CHECK_MSG(expected == got, "Bad xmm0 value"); + + if (getpid() != parent) { // Child. 
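+ // Use _exit(2) rather than exit(3) so the forked child skips the parent's
+ // atexit/gtest teardown.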
+ _exit(0); + } +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fpsig_nested.cc b/test/syscalls/linux/fpsig_nested.cc new file mode 100644 index 000000000..2fa40b42d --- /dev/null +++ b/test/syscalls/linux/fpsig_nested.cc @@ -0,0 +1,134 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This program verifies that application floating point state is restored +// correctly after a signal handler returns. It also verifies that this works +// with nested signals. +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +#define GET_XMM(__var, __xmm) \ + asm volatile("movq %%" #__xmm ", %0" : "=r"(__var)) +#define SET_XMM(__var, __xmm) asm volatile("movq %0, %%" #__xmm : : "r"(__var)) + +int pid; +int tid; + +volatile uint64_t entryxmm[2] = {~0UL, ~0UL}; +volatile uint64_t exitxmm[2]; + +void sigusr2(int s, siginfo_t* siginfo, void* _uc) { + uint64_t val = SIGUSR2; + + // Record the value of %xmm0 on entry and then clobber it. + GET_XMM(entryxmm[1], xmm0); + SET_XMM(val, xmm0); + GET_XMM(exitxmm[1], xmm0); +} + +void sigusr1(int s, siginfo_t* siginfo, void* _uc) { + uint64_t val = SIGUSR1; + + // Record the value of %xmm0 on entry and then clobber it. + GET_XMM(entryxmm[0], xmm0); + SET_XMM(val, xmm0); + + // Send a SIGUSR2 to ourself. The signal mask is configured such that + // the SIGUSR2 handler will run before this handler returns. + asm volatile( + "movl %[killnr], %%eax;" + "movl %[pid], %%edi;" + "movl %[tid], %%esi;" + "movl %[sig], %%edx;" + "syscall;" + : + : [killnr] "i"(__NR_tgkill), [pid] "rm"(pid), [tid] "rm"(tid), + [sig] "i"(SIGUSR2) + : "rax", "rdi", "rsi", "rdx", + // Clobbered by syscall. + "rcx", "r11"); + + // Record value of %xmm0 again to verify that the nested signal handler + // does not clobber it. + GET_XMM(exitxmm[0], xmm0); +} + +TEST(FPSigTest, NestedSignals) { + pid = getpid(); + tid = gettid(); + + struct sigaction sa = {}; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigusr1; + ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds()); + + sa.sa_sigaction = sigusr2; + ASSERT_THAT(sigaction(SIGUSR2, &sa, nullptr), SyscallSucceeds()); + + // The amd64 ABI specifies that the XMM register set is caller-saved. This + // implies that if there is any function call between SET_XMM and GET_XMM the + // compiler might save/restore xmm0 implicitly. This defeats the entire + // purpose of the test which is to verify that fpstate is restored by + // sigreturn(2). + // + // This is the reason why 'tgkill(getpid(), gettid(), SIGUSR1)' is implemented + // in inline assembly below. + // + // If the OS is broken and registers are clobbered by the signal, using tgkill + // to signal the current thread ensures that this is the clobbered thread. 
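+ //
+ // The asm block below is simply the raw equivalent of
+ // syscall(__NR_tgkill, parent, parent_tid, SIGUSR1), issued without any
+ // intervening call that could spill or reload %xmm0.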
+ + uint64_t expected = 0xdeadbeeffacefeed; + SET_XMM(expected, xmm0); + + asm volatile( + "movl %[killnr], %%eax;" + "movl %[pid], %%edi;" + "movl %[tid], %%esi;" + "movl %[sig], %%edx;" + "syscall;" + : + : [killnr] "i"(__NR_tgkill), [pid] "rm"(pid), [tid] "rm"(tid), + [sig] "i"(SIGUSR1) + : "rax", "rdi", "rsi", "rdx", + // Clobbered by syscall. + "rcx", "r11"); + + uint64_t got; + GET_XMM(got, xmm0); + + // + // The checks below verifies the following: + // - signal handlers must called with a clean fpu state. + // - sigreturn(2) must restore fpstate of the interrupted context. + // + EXPECT_EQ(expected, got); + EXPECT_EQ(entryxmm[0], 0); + EXPECT_EQ(entryxmm[1], 0); + EXPECT_EQ(exitxmm[0], SIGUSR1); + EXPECT_EQ(exitxmm[1], SIGUSR2); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/fsync.cc b/test/syscalls/linux/fsync.cc new file mode 100644 index 000000000..536a73bf1 --- /dev/null +++ b/test/syscalls/linux/fsync.cc @@ -0,0 +1,55 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(FsyncTest, TempFileSucceeds) { + std::string path = NewTempAbsPath(); + int fd; + EXPECT_THAT(fd = open(path.c_str(), O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + const std::string data = "some data to sync"; + EXPECT_THAT(write(fd, data.c_str(), data.size()), + SyscallSucceedsWithValue(data.size())); + EXPECT_THAT(fsync(fd), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + ASSERT_THAT(unlink(path.c_str()), SyscallSucceeds()); +} + +TEST(FsyncTest, CannotFsyncOnUnopenedFd) { + int fd; + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(fd = open(f.path().c_str(), O_RDONLY), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + + // fd is now invalid. + EXPECT_THAT(fsync(fd), SyscallFailsWithErrno(EBADF)); +} +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc new file mode 100644 index 000000000..6fa284013 --- /dev/null +++ b/test/syscalls/linux/futex.cc @@ -0,0 +1,595 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
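+
+// futex(2) has no libc wrapper, so the helpers below issue it via syscall(2).
+// The raw call has the shape (arguments past val are interpreted
+// per-operation):
+//
+//   syscall(SYS_futex, uaddr, futex_op, val, timeout_or_val2, uaddr2, val3);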
+ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/memory_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Amount of time we wait for threads doing futex_wait to start running before +// doing futex_wake. +constexpr auto kWaiterStartupDelay = absl::Seconds(3); + +// Default timeout for waiters in tests where we expect a futex_wake to be +// ineffective. +constexpr auto kIneffectiveWakeTimeout = absl::Seconds(6); + +static_assert(kWaiterStartupDelay < kIneffectiveWakeTimeout, + "futex_wait will time out before futex_wake is called"); + +int futex_wait(bool priv, std::atomic* uaddr, int val, + absl::Duration timeout = absl::InfiniteDuration()) { + int op = FUTEX_WAIT; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + + if (timeout == absl::InfiniteDuration()) { + return RetryEINTR(syscall)(SYS_futex, uaddr, op, val, nullptr); + } + + // FUTEX_WAIT doesn't adjust the timeout if it returns EINTR, so we have to do + // so. + while (true) { + auto const timeout_ts = absl::ToTimespec(timeout); + MonotonicTimer timer; + timer.Start(); + int const ret = syscall(SYS_futex, uaddr, op, val, &timeout_ts); + if (ret != -1 || errno != EINTR) { + return ret; + } + timeout = std::max(timeout - timer.Duration(), absl::ZeroDuration()); + } +} + +int futex_wait_bitset(bool priv, std::atomic* uaddr, int val, int bitset, + absl::Time deadline = absl::InfiniteFuture()) { + int op = FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + + auto const deadline_ts = absl::ToTimespec(deadline); + return RetryEINTR(syscall)( + SYS_futex, uaddr, op, val, + deadline == absl::InfiniteFuture() ? nullptr : &deadline_ts, nullptr, + bitset); +} + +int futex_wake(bool priv, std::atomic* uaddr, int count) { + int op = FUTEX_WAKE; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return syscall(SYS_futex, uaddr, op, count); +} + +int futex_wake_bitset(bool priv, std::atomic* uaddr, int count, + int bitset) { + int op = FUTEX_WAKE_BITSET; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return syscall(SYS_futex, uaddr, op, count, nullptr, nullptr, bitset); +} + +int futex_wake_op(bool priv, std::atomic* uaddr1, std::atomic* uaddr2, + int nwake1, int nwake2, uint32_t sub_op) { + int op = FUTEX_WAKE_OP; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return syscall(SYS_futex, uaddr1, op, nwake1, nwake2, uaddr2, sub_op); +} + +// Fixture for futex tests parameterized by whether to use private or shared +// futexes. +class PrivateAndSharedFutexTest : public ::testing::TestWithParam { + protected: + bool IsPrivate() const { return GetParam(); } + int PrivateFlag() const { return IsPrivate() ? FUTEX_PRIVATE_FLAG : 0; } +}; + +// FUTEX_WAIT with 0 timeout does not block. +TEST_P(PrivateAndSharedFutexTest, Wait_ZeroTimeout) { + struct timespec timeout = {}; + + // Don't use the futex_wait helper because it adjusts timeout. 
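+ // A zeroed timespec asks for a zero-length wait, so the call returns
+ // ETIMEDOUT immediately even though the futex word still equals the
+ // expected value.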
+ int a = 1; + EXPECT_THAT(syscall(SYS_futex, &a, FUTEX_WAIT | PrivateFlag(), a, &timeout), + SyscallFailsWithErrno(ETIMEDOUT)); +} + +TEST_P(PrivateAndSharedFutexTest, Wait_Timeout) { + std::atomic a = ATOMIC_VAR_INIT(1); + + MonotonicTimer timer; + timer.Start(); + constexpr absl::Duration kTimeout = absl::Seconds(1); + EXPECT_THAT(futex_wait(IsPrivate(), &a, a, kTimeout), + SyscallFailsWithErrno(ETIMEDOUT)); + EXPECT_GE(timer.Duration(), kTimeout); +} + +TEST_P(PrivateAndSharedFutexTest, Wait_BitsetTimeout) { + std::atomic a = ATOMIC_VAR_INIT(1); + + MonotonicTimer timer; + timer.Start(); + constexpr absl::Duration kTimeout = absl::Seconds(1); + EXPECT_THAT( + futex_wait_bitset(IsPrivate(), &a, a, 0xffffffff, absl::Now() + kTimeout), + SyscallFailsWithErrno(ETIMEDOUT)); + EXPECT_GE(timer.Duration(), kTimeout); +} + +TEST_P(PrivateAndSharedFutexTest, WaitBitset_NegativeTimeout) { + std::atomic a = ATOMIC_VAR_INIT(1); + + MonotonicTimer timer; + timer.Start(); + EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, a, 0xffffffff, + absl::Now() - absl::Seconds(1)), + SyscallFailsWithErrno(ETIMEDOUT)); +} + +TEST_P(PrivateAndSharedFutexTest, Wait_WrongVal) { + std::atomic a = ATOMIC_VAR_INIT(1); + EXPECT_THAT(futex_wait(IsPrivate(), &a, a + 1), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(PrivateAndSharedFutexTest, Wait_ZeroBitset) { + std::atomic a = ATOMIC_VAR_INIT(1); + EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, a, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(PrivateAndSharedFutexTest, Wake1_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + // Prevent save/restore from interrupting futex_wait, which will cause it to + // return EAGAIN instead of the expected result if futex_wait is restarted + // after we change the value of a below. + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), + SyscallSucceedsWithValue(0)); + }); + absl::SleepFor(kWaiterStartupDelay); + + // Change a so that if futex_wake happens before futex_wait, the latter + // returns EAGAIN instead of hanging the test. 
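+ // This is the usual futex handshake: FUTEX_WAIT atomically re-checks that
+ // *uaddr still equals val before sleeping, so once the value changes a
+ // late waiter observes the mismatch and returns EAGAIN instead of sleeping
+ // past the wake.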
+ a.fetch_add(1); + EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1)); +} + +TEST_P(PrivateAndSharedFutexTest, WakeAll_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + DisableSave ds; + constexpr int kThreads = 5; + std::vector> threads; + threads.reserve(kThreads); + for (int i = 0; i < kThreads; i++) { + threads.push_back(absl::make_unique([&] { + EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), + SyscallSucceeds()); + })); + } + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + EXPECT_THAT(futex_wake(IsPrivate(), &a, kThreads), + SyscallSucceedsWithValue(kThreads)); +} + +TEST_P(PrivateAndSharedFutexTest, WakeSome_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + DisableSave ds; + constexpr int kThreads = 5; + constexpr int kWokenThreads = 3; + static_assert(kWokenThreads < kThreads, + "can't wake more threads than are created"); + std::vector> threads; + threads.reserve(kThreads); + std::vector rets; + rets.reserve(kThreads); + std::vector errs; + errs.reserve(kThreads); + for (int i = 0; i < kThreads; i++) { + rets.push_back(-1); + errs.push_back(0); + } + for (int i = 0; i < kThreads; i++) { + threads.push_back(absl::make_unique([&, i] { + rets[i] = + futex_wait(IsPrivate(), &a, kInitialValue, kIneffectiveWakeTimeout); + errs[i] = errno; + })); + } + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + EXPECT_THAT(futex_wake(IsPrivate(), &a, kWokenThreads), + SyscallSucceedsWithValue(kWokenThreads)); + + int woken = 0; + int timedout = 0; + for (int i = 0; i < kThreads; i++) { + threads[i]->Join(); + if (rets[i] == 0) { + woken++; + } else if (errs[i] == ETIMEDOUT) { + timedout++; + } else { + ADD_FAILURE() << " thread " << i << ": returned " << rets[i] << ", errno " + << errs[i]; + } + } + EXPECT_EQ(woken, kWokenThreads); + EXPECT_EQ(timedout, kThreads - kWokenThreads); +} + +TEST_P(PrivateAndSharedFutexTest, WaitBitset_Wake_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, kInitialValue, 0b01001000), + SyscallSucceeds()); + }); + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1)); +} + +TEST_P(PrivateAndSharedFutexTest, Wait_WakeBitset_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), SyscallSucceeds()); + }); + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + EXPECT_THAT(futex_wake_bitset(IsPrivate(), &a, 1, 0b01001000), + SyscallSucceedsWithValue(1)); +} + +TEST_P(PrivateAndSharedFutexTest, WaitBitset_WakeBitsetMatch_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + constexpr int kBitset = 0b01001000; + + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, kInitialValue, kBitset), + SyscallSucceeds()); + }); + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + EXPECT_THAT(futex_wake_bitset(IsPrivate(), &a, 1, kBitset), + SyscallSucceedsWithValue(1)); +} + +TEST_P(PrivateAndSharedFutexTest, WaitBitset_WakeBitsetNoMatch_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + constexpr int 
kWaitBitset = 0b01000001; + constexpr int kWakeBitset = 0b00101000; + static_assert((kWaitBitset & kWakeBitset) == 0, + "futex_wake_bitset will wake waiter"); + + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, kInitialValue, kWaitBitset, + absl::Now() + kIneffectiveWakeTimeout), + SyscallFailsWithErrno(ETIMEDOUT)); + }); + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + EXPECT_THAT(futex_wake_bitset(IsPrivate(), &a, 1, kWakeBitset), + SyscallSucceedsWithValue(0)); +} + +TEST_P(PrivateAndSharedFutexTest, WakeOpCondSuccess_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + std::atomic b = ATOMIC_VAR_INIT(kInitialValue); + + DisableSave ds; + ScopedThread thread_a([&] { + EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), SyscallSucceeds()); + }); + ScopedThread thread_b([&] { + EXPECT_THAT(futex_wait(IsPrivate(), &b, kInitialValue), SyscallSucceeds()); + }); + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + b.fetch_add(1); + // This futex_wake_op should: + // - Wake 1 waiter on a unconditionally. + // - Wake 1 waiter on b if b == kInitialValue + 1, which it is. + // - Do "b += 1". + EXPECT_THAT(futex_wake_op(IsPrivate(), &a, &b, 1, 1, + FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_EQ, + (kInitialValue + 1))), + SyscallSucceedsWithValue(2)); + EXPECT_EQ(b, kInitialValue + 2); +} + +TEST_P(PrivateAndSharedFutexTest, WakeOpCondFailure_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + std::atomic b = ATOMIC_VAR_INIT(kInitialValue); + + DisableSave ds; + ScopedThread thread_a([&] { + EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), SyscallSucceeds()); + }); + ScopedThread thread_b([&] { + EXPECT_THAT( + futex_wait(IsPrivate(), &b, kInitialValue, kIneffectiveWakeTimeout), + SyscallFailsWithErrno(ETIMEDOUT)); + }); + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + b.fetch_add(1); + // This futex_wake_op should: + // - Wake 1 waiter on a unconditionally. + // - Wake 1 waiter on b if b == kInitialValue - 1, which it isn't. + // - Do "b += 1". + EXPECT_THAT(futex_wake_op(IsPrivate(), &a, &b, 1, 1, + FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_EQ, + (kInitialValue - 1))), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(b, kInitialValue + 2); +} + +TEST_P(PrivateAndSharedFutexTest, NoWakeInterprocessPrivateAnon_NoRandomSave) { + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + auto const ptr = static_cast*>(mapping.ptr()); + constexpr int kInitialValue = 1; + ptr->store(kInitialValue); + + DisableSave ds; + pid_t const child_pid = fork(); + if (child_pid == 0) { + TEST_PCHECK(futex_wait(IsPrivate(), ptr, kInitialValue, + kIneffectiveWakeTimeout) == -1 && + errno == ETIMEDOUT); + _exit(0); + } + ASSERT_THAT(child_pid, SyscallSucceeds()); + absl::SleepFor(kWaiterStartupDelay); + + EXPECT_THAT(futex_wake(IsPrivate(), ptr, 1), SyscallSucceedsWithValue(0)); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +TEST_P(PrivateAndSharedFutexTest, WakeAfterCOWBreak_NoRandomSave) { + // Use a futex on a non-stack mapping so we can be sure that the child process + // below isn't the one that breaks copy-on-write. 
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + auto const ptr = static_cast*>(mapping.ptr()); + constexpr int kInitialValue = 1; + ptr->store(kInitialValue); + + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT(futex_wait(IsPrivate(), ptr, kInitialValue), SyscallSucceeds()); + }); + absl::SleepFor(kWaiterStartupDelay); + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // Wait to be killed by the parent. + while (true) pause(); + } + ASSERT_THAT(child_pid, SyscallSucceeds()); + auto cleanup_child = Cleanup([&] { + EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) + << " status " << status; + }); + + // In addition to preventing a late futex_wait from sleeping, this breaks + // copy-on-write on the mapped page. + ptr->fetch_add(1); + EXPECT_THAT(futex_wake(IsPrivate(), ptr, 1), SyscallSucceedsWithValue(1)); +} + +TEST_P(PrivateAndSharedFutexTest, WakeWrongKind_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic a = ATOMIC_VAR_INIT(kInitialValue); + + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT( + futex_wait(IsPrivate(), &a, kInitialValue, kIneffectiveWakeTimeout), + SyscallFailsWithErrno(ETIMEDOUT)); + }); + absl::SleepFor(kWaiterStartupDelay); + + a.fetch_add(1); + // The value of priv passed to futex_wake is the opposite of that passed to + // the futex_waiter; we expect this not to wake the waiter. + EXPECT_THAT(futex_wake(!IsPrivate(), &a, 1), SyscallSucceedsWithValue(0)); +} + +INSTANTIATE_TEST_CASE_P(SharedPrivate, PrivateAndSharedFutexTest, + ::testing::Bool()); + +// Passing null as the address only works for private futexes. 
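+// With both wake counts set to zero, FUTEX_WAKE_OP wakes nobody but still
+// applies the operation encoded in its last argument to *uaddr2; the
+// WakeOp0* tests below rely on exactly that.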
+ +TEST(PrivateFutexTest, WakeOp0Set) { + std::atomic a = ATOMIC_VAR_INIT(1); + + int futex_op = FUTEX_OP(FUTEX_OP_SET, 2, 0, 0); + EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(a, 2); +} + +TEST(PrivateFutexTest, WakeOp0Add) { + std::atomic a = ATOMIC_VAR_INIT(1); + int futex_op = FUTEX_OP(FUTEX_OP_ADD, 1, 0, 0); + EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(a, 2); +} + +TEST(PrivateFutexTest, WakeOp0Or) { + std::atomic a = ATOMIC_VAR_INIT(0b01); + int futex_op = FUTEX_OP(FUTEX_OP_OR, 0b10, 0, 0); + EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(a, 0b11); +} + +TEST(PrivateFutexTest, WakeOp0Andn) { + std::atomic a = ATOMIC_VAR_INIT(0b11); + int futex_op = FUTEX_OP(FUTEX_OP_ANDN, 0b10, 0, 0); + EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(a, 0b01); +} + +TEST(PrivateFutexTest, WakeOp0Xor) { + std::atomic a = ATOMIC_VAR_INIT(0b1010); + int futex_op = FUTEX_OP(FUTEX_OP_XOR, 0b1100, 0, 0); + EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(a, 0b0110); +} + +TEST(SharedFutexTest, WakeInterprocessSharedAnon_NoRandomSave) { + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED)); + auto const ptr = static_cast*>(mapping.ptr()); + constexpr int kInitialValue = 1; + ptr->store(kInitialValue); + + DisableSave ds; + pid_t const child_pid = fork(); + if (child_pid == 0) { + TEST_PCHECK(futex_wait(false, ptr, kInitialValue) == 0); + _exit(0); + } + ASSERT_THAT(child_pid, SyscallSucceeds()); + auto kill_child = Cleanup( + [&] { EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); }); + absl::SleepFor(kWaiterStartupDelay); + + ptr->fetch_add(1); + // This is an ASSERT so that if it fails, we immediately abort the test (and + // kill the subprocess). + ASSERT_THAT(futex_wake(false, ptr, 1), SyscallSucceedsWithValue(1)); + + kill_child.Release(); + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +TEST(SharedFutexTest, WakeInterprocessFile_NoRandomSave) { + auto const file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(truncate(file.path().c_str(), kPageSize), SyscallSucceeds()); + auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0)); + auto const ptr = static_cast*>(mapping.ptr()); + constexpr int kInitialValue = 1; + ptr->store(kInitialValue); + + DisableSave ds; + pid_t const child_pid = fork(); + if (child_pid == 0) { + TEST_PCHECK(futex_wait(false, ptr, kInitialValue) == 0); + _exit(0); + } + ASSERT_THAT(child_pid, SyscallSucceeds()); + auto kill_child = Cleanup( + [&] { EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); }); + absl::SleepFor(kWaiterStartupDelay); + + ptr->fetch_add(1); + // This is an ASSERT so that if it fails, we immediately abort the test (and + // kill the subprocess). 
+ ASSERT_THAT(futex_wake(false, ptr, 1), SyscallSucceedsWithValue(1)); + + kill_child.Release(); + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/getcpu.cc b/test/syscalls/linux/getcpu.cc new file mode 100644 index 000000000..3a52b25fa --- /dev/null +++ b/test/syscalls/linux/getcpu.cc @@ -0,0 +1,40 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(GetcpuTest, IsValidCpuStress) { + const int num_cpus = NumCPUs(); + absl::Time deadline = absl::Now() + absl::Seconds(10); + while (absl::Now() < deadline) { + int cpu; + ASSERT_THAT(cpu = sched_getcpu(), SyscallSucceeds()); + ASSERT_LT(cpu, num_cpus); + } +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc new file mode 100644 index 000000000..5db580aa0 --- /dev/null +++ b/test/syscalls/linux/getdents.cc @@ -0,0 +1,485 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +using ::testing::IsEmpty; +using ::testing::IsSupersetOf; +using ::testing::Not; +using ::testing::NotNull; + +namespace gvisor { +namespace testing { + +namespace { + +// New Linux dirent format. +struct linux_dirent64 { + uint64_t d_ino; // Inode number + int64_t d_off; // Offset to next linux_dirent64 + unsigned short d_reclen; // NOLINT, Length of this linux_dirent64 + unsigned char d_type; // NOLINT, File type + char d_name[0]; // Filename (null-terminated) +}; + +// Old Linux dirent format. 
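+// (In the legacy layout the file type, when present, is stored in the last
+// byte of the record rather than in a named d_type field, so it is not
+// declared here; see getdents(2).)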
+struct linux_dirent { + unsigned long d_ino; // NOLINT + unsigned long d_off; // NOLINT + unsigned short d_reclen; // NOLINT + char d_name[0]; +}; + +// Wraps a buffer to provide a set of dirents. +// T is the underlying dirent type. +template +class DirentBuffer { + public: + // DirentBuffer manages the buffer. + explicit DirentBuffer(size_t size) + : managed_(true), actual_size_(size), reported_size_(size) { + data_ = new char[actual_size_]; + } + + // The buffer is managed externally. + DirentBuffer(char* data, size_t actual_size, size_t reported_size) + : managed_(false), + data_(data), + actual_size_(actual_size), + reported_size_(reported_size) {} + + ~DirentBuffer() { + if (managed_) { + delete[] data_; + } + } + + T* Data() { return reinterpret_cast(data_); } + + T* Start(size_t read) { + read_ = read; + if (read_) { + return Data(); + } else { + return nullptr; + } + } + + T* Current() { return reinterpret_cast(&data_[off_]); } + + T* Next() { + size_t new_off = off_ + Current()->d_reclen; + if (new_off >= read_ || new_off >= actual_size_) { + return nullptr; + } + + off_ = new_off; + return Current(); + } + + size_t Size() { return reported_size_; } + + void Reset() { + off_ = 0; + read_ = 0; + memset(data_, 0, actual_size_); + } + + private: + bool managed_; + char* data_; + size_t actual_size_; + size_t reported_size_; + + size_t off_ = 0; + + size_t read_ = 0; +}; + +// Test for getdents/getdents64. +// T is the Linux dirent type. +template +class GetdentsTest : public ::testing::Test { + public: + using LinuxDirentType = T; + using DirentBufferType = DirentBuffer; + + protected: + void SetUp() override { + dir_ = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + fd_ = ASSERT_NO_ERRNO_AND_VALUE(Open(dir_.path(), O_RDONLY | O_DIRECTORY)); + } + + // Must be overridden with explicit specialization. See below. + int SyscallNum(); + + int Getdents(LinuxDirentType* dirp, unsigned int count) { + return RetryEINTR(syscall)(SyscallNum(), fd_.get(), dirp, count); + } + + // Fill directory with num files, named by number starting at 0. + void FillDirectory(size_t num) { + for (size_t i = 0; i < num; i++) { + auto name = JoinPath(dir_.path(), absl::StrCat(i)); + TEST_CHECK(CreateWithContents(name, "").ok()); + } + } + + // Fill directory with a given list of filenames. + void FillDirectoryWithFiles(const std::vector& filenames) { + for (const auto& filename : filenames) { + auto name = JoinPath(dir_.path(), filename); + TEST_CHECK(CreateWithContents(name, "").ok()); + } + } + + // Seek to the start of the directory. + PosixError SeekStart() { + constexpr off_t kStartOfFile = 0; + off_t offset = lseek(fd_.get(), kStartOfFile, SEEK_SET); + if (offset < 0) { + return PosixError(errno, absl::StrCat("error seeking to ", kStartOfFile)); + } + if (offset != kStartOfFile) { + return PosixError(EINVAL, absl::StrCat("tried to seek to ", kStartOfFile, + " but got ", offset)); + } + return NoError(); + } + + // Call getdents multiple times, reading all dirents and calling f on each. + // f has the type signature PosixError f(T*). + // If f returns a non-OK error, so does ReadDirents. 
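+  // For example, ReadAndCountAllEntries below counts entries with:
+  //   ReadDirents(dirents, [&](LinuxDirentType* d) {
+  //     found++;
+  //     return NoError();
+  //   });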
+ template + PosixError ReadDirents(DirentBufferType* dirents, F const& f) { + int n; + do { + dirents->Reset(); + + n = Getdents(dirents->Data(), dirents->Size()); + MaybeSave(); + if (n < 0) { + return PosixError(errno, "getdents"); + } + + for (auto d = dirents->Start(n); d; d = dirents->Next()) { + RETURN_IF_ERRNO(f(d)); + } + } while (n > 0); + + return NoError(); + } + + // Call Getdents successively and count all entries. + int ReadAndCountAllEntries(DirentBufferType* dirents) { + int found = 0; + + EXPECT_NO_ERRNO(ReadDirents(dirents, [&](LinuxDirentType* d) { + found++; + return NoError(); + })); + + return found; + } + + private: + TempPath dir_; + FileDescriptor fd_; +}; + +// GUnit TYPED_TEST_CASE does not allow multiple template parameters, so we +// must use explicit template specialization to set the syscall number. +template <> +int GetdentsTest::SyscallNum() { + return SYS_getdents; +} + +template <> +int GetdentsTest::SyscallNum() { + return SYS_getdents64; +} + +// Test both legacy getdents and getdents64. +typedef ::testing::Types + GetdentsTypes; +TYPED_TEST_CASE(GetdentsTest, GetdentsTypes); + +// N.B. TYPED_TESTs require explicitly using this-> to access members of +// GetdentsTest, since we are inside of a derived class template. + +TYPED_TEST(GetdentsTest, VerifyEntries) { + typename TestFixture::DirentBufferType dirents(1024); + + this->FillDirectory(2); + + // Map of all the entries we expect to find. + std::map found; + found["."] = false; + found[".."] = false; + found["0"] = false; + found["1"] = false; + + EXPECT_NO_ERRNO(this->ReadDirents( + &dirents, [&](typename TestFixture::LinuxDirentType* d) { + auto kv = found.find(d->d_name); + EXPECT_NE(kv, found.end()) << "Unexpected file: " << d->d_name; + if (kv != found.end()) { + EXPECT_FALSE(kv->second); + } + found[d->d_name] = true; + return NoError(); + })); + + for (auto& kv : found) { + EXPECT_TRUE(kv.second) << "File not found: " << kv.first; + } +} + +TYPED_TEST(GetdentsTest, VerifyPadding) { + typename TestFixture::DirentBufferType dirents(1024); + + // Create files with names of length 1 through 16. + std::vector files; + std::string filename; + for (int i = 0; i < 16; ++i) { + absl::StrAppend(&filename, "a"); + files.push_back(filename); + } + this->FillDirectoryWithFiles(files); + + // We expect to find all the files, plus '.' and '..'. + const int expect_found = 2 + files.size(); + int found = 0; + + EXPECT_NO_ERRNO(this->ReadDirents( + &dirents, [&](typename TestFixture::LinuxDirentType* d) { + EXPECT_EQ(d->d_reclen % 8, 0) + << "Dirent " << d->d_name + << " had reclen that was not byte aligned: " << d->d_name; + found++; + return NoError(); + })); + + // Make sure we found all the files. + EXPECT_EQ(found, expect_found); +} + +// For a small directory, the provided buffer should be large enough +// for all entries. +TYPED_TEST(GetdentsTest, SmallDir) { + // . and .. should be in an otherwise empty directory. + int expect = 2; + + // Add some actual files. + this->FillDirectory(2); + expect += 2; + + typename TestFixture::DirentBufferType dirents(256); + + EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents)); +} + +// A directory with lots of files requires calling getdents multiple times. +TYPED_TEST(GetdentsTest, LargeDir) { + // . and .. should be in an otherwise empty directory. + int expect = 2; + + // Add some actual files. 
+ this->FillDirectory(100); + expect += 100; + + typename TestFixture::DirentBufferType dirents(256); + + EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents)); +} + +// If we lie about the size of the buffer, we should still be able to read the +// entries with the available space. +TYPED_TEST(GetdentsTest, PartialBuffer) { + // . and .. should be in an otherwise empty directory. + int expect = 2; + + // Add some actual files. + this->FillDirectory(100); + expect += 100; + + void* addr = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ASSERT_NE(addr, MAP_FAILED); + + char* buf = reinterpret_cast(addr); + + // Guard page + EXPECT_THAT( + mprotect(reinterpret_cast(buf + kPageSize), kPageSize, PROT_NONE), + SyscallSucceeds()); + + // Limit space in buf to 256 bytes. + buf += kPageSize - 256; + + // Lie about the buffer. Even though we claim the buffer is 1 page, + // we should still get all of the dirents in the first 256 bytes. + typename TestFixture::DirentBufferType dirents(buf, 256, kPageSize); + + EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents)); + + EXPECT_THAT(munmap(addr, 2 * kPageSize), SyscallSucceeds()); +} + +// Open many file descriptors, then scan through /proc/self/fd to find and close +// them all. (The latter is commonly used to handle races betweek fork/execve +// and the creation of unwanted non-O_CLOEXEC file descriptors.) This tests that +// getdents iterates correctly despite mutation of /proc/self/fd. +TYPED_TEST(GetdentsTest, ProcSelfFd) { + constexpr size_t kNfds = 10; + std::unordered_set fds; + std::vector fd_closers; + fd_closers.reserve(fds.size()); + for (int fd : fds) { + fd_closers.emplace_back(fd); + } + for (size_t i = 0; i < kNfds; i++) { + int fd; + ASSERT_THAT(fd = eventfd(0, 0), SyscallSucceeds()); + fds.insert(fd); + } + + const FileDescriptor proc_self_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/fd", O_RDONLY | O_DIRECTORY)); + + // Make the buffer very small since we want to iterate. + typename TestFixture::DirentBufferType dirents( + 2 * sizeof(typename TestFixture::LinuxDirentType)); + std::unordered_set prev_fds; + while (true) { + dirents.Reset(); + int rv; + ASSERT_THAT(rv = RetryEINTR(syscall)(this->SyscallNum(), proc_self_fd.get(), + dirents.Data(), dirents.Size()), + SyscallSucceeds()); + if (rv == 0) { + break; + } + for (auto* d = dirents.Start(rv); d; d = dirents.Next()) { + int dfd; + if (!absl::SimpleAtoi(d->d_name, &dfd)) continue; + EXPECT_TRUE(prev_fds.insert(dfd).second) + << "Repeated observation of /proc/self/fd/" << dfd; + auto it = fds.find(dfd); + if (it != fds.end()) { + fds.erase(it); + EXPECT_THAT(close(dfd), SyscallSucceeds()); + } + } + } + + // Check that we closed every fd. + EXPECT_THAT(fds, ::testing::IsEmpty()); +} + +// Test that getdents returns ENOTDIR when called on a file. +TYPED_TEST(GetdentsTest, NotDir) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + typename TestFixture::DirentBufferType dirents(256); + EXPECT_THAT(RetryEINTR(syscall)(this->SyscallNum(), fd.get(), dirents.Data(), + dirents.Size()), + SyscallFailsWithErrno(ENOTDIR)); +} + +// Test that SEEK_SET to 0 causes getdents to re-read the entries. +TYPED_TEST(GetdentsTest, SeekResetsCursor) { + // . and .. should be in an otherwise empty directory. + int expect = 2; + + // Add some files to the directory. 
+ this->FillDirectory(10); + expect += 10; + + typename TestFixture::DirentBufferType dirents(256); + + // We should get all the expected entries. + EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents)); + + // Seek back to 0. + ASSERT_NO_ERRNO(this->SeekStart()); + + // We should get all the expected entries again. + EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents)); +} + +// Some tests using the glibc readdir interface. +TEST(ReaddirTest, OpenDir) { + DIR* dev; + ASSERT_THAT(dev = opendir("/dev"), NotNull()); + EXPECT_THAT(closedir(dev), SyscallSucceeds()); +} + +TEST(ReaddirTest, RootContainsBasicDirectories) { + EXPECT_THAT(ListDir("/", true), + IsPosixErrorOkAndHolds(IsSupersetOf( + {"bin", "dev", "etc", "lib", "proc", "sbin", "usr"}))); +} + +TEST(ReaddirTest, Bug24096713Dev) { + auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/dev", true)); + EXPECT_THAT(contents, Not(IsEmpty())); +} + +TEST(ReaddirTest, Bug24096713ProcTid) { + auto contents = ASSERT_NO_ERRNO_AND_VALUE( + ListDir(absl::StrCat("/proc/", syscall(SYS_gettid), "/"), true)); + EXPECT_THAT(contents, Not(IsEmpty())); +} + +TEST(ReaddirTest, Bug33429925Proc) { + auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc", true)); + EXPECT_THAT(contents, Not(IsEmpty())); +} + +TEST(ReaddirTest, Bug35110122Root) { + auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/", true)); + EXPECT_THAT(contents, Not(IsEmpty())); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/getrandom.cc b/test/syscalls/linux/getrandom.cc new file mode 100644 index 000000000..be5325497 --- /dev/null +++ b/test/syscalls/linux/getrandom.cc @@ -0,0 +1,61 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +#ifndef SYS_getrandom +#if defined(__x86_64__) +#define SYS_getrandom 318 +#elif defined(__i386__) +#define SYS_getrandom 355 +#else +#error "Unknown architecture" +#endif +#endif // SYS_getrandom + +bool SomeByteIsNonZero(char* random_bytes, int length) { + for (int i = 0; i < length; i++) { + if (random_bytes[i] != 0) { + return true; + } + } + return false; +} + +TEST(GetrandomTest, IsRandom) { + // This test calls get_random and makes sure that the array is filled in with + // something that is non-zero. Perhaps we get back \x00\x00\x00\x00\x00.... as + // a random result, but it's so unlikely that we'll just ignore this. + char random_bytes[64] = {}; + int n = syscall(SYS_getrandom, random_bytes, 64, 0); + SKIP_IF(!IsRunningOnGvisor() && n < 0 && errno == ENOSYS); + EXPECT_THAT(n, SyscallSucceeds()); + EXPECT_GT(n, 0); // Some bytes should be returned. 
+ EXPECT_TRUE(SomeByteIsNonZero(random_bytes, n)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/getrusage.cc b/test/syscalls/linux/getrusage.cc new file mode 100644 index 000000000..1ae603858 --- /dev/null +++ b/test/syscalls/linux/getrusage.cc @@ -0,0 +1,177 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/logging.h" +#include "test/util/memory_util.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(GetrusageTest, BasicFork) { + pid_t pid = fork(); + if (pid == 0) { + struct rusage rusage_self; + TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0); + struct rusage rusage_children; + TEST_PCHECK(getrusage(RUSAGE_CHILDREN, &rusage_children) == 0); + // The child has consumed some memory. + TEST_CHECK(rusage_self.ru_maxrss != 0); + // The child has no children of its own. + TEST_CHECK(rusage_children.ru_maxrss == 0); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), SyscallSucceeds()); + struct rusage rusage_self; + ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds()); + struct rusage rusage_children; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds()); + // The parent has consumed some memory. + EXPECT_GT(rusage_self.ru_maxrss, 0); + // The child has consumed some memory, and because it has exited we can get + // its max RSS. + EXPECT_GT(rusage_children.ru_maxrss, 0); +} + +// Verifies that a process can get the max resident set size of its grandchild, +// i.e. that maxrss propagates correctly from children to waiting parents. +TEST(GetrusageTest, Grandchild) { + constexpr int kGrandchildSizeKb = 1024; + pid_t pid = fork(); + if (pid == 0) { + pid = fork(); + if (pid == 0) { + int flags = MAP_ANONYMOUS | MAP_POPULATE | MAP_PRIVATE; + void *addr = + mmap(nullptr, kGrandchildSizeKb * 1024, PROT_WRITE, flags, -1, 0); + TEST_PCHECK(addr != MAP_FAILED); + } else { + int status; + TEST_PCHECK(RetryEINTR(waitpid)(pid, &status, 0) == pid); + } + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), SyscallSucceeds()); + struct rusage rusage_self; + ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds()); + struct rusage rusage_children; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds()); + // The parent has consumed some memory. + EXPECT_GT(rusage_self.ru_maxrss, 0); + // The child should consume next to no memory, but the grandchild will + // consume at least 1MB. Verify that usage bubbles up to the grandparent. 
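+  // ru_maxrss is reported in kilobytes, so it can be compared directly
+  // against kGrandchildSizeKb.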
+ EXPECT_GT(rusage_children.ru_maxrss, kGrandchildSizeKb); +} + +// Verifies that processes ignoring SIGCHLD do not have updated child maxrss +// updated. +TEST(GetrusageTest, IgnoreSIGCHLD) { + struct sigaction sa; + sa.sa_handler = SIG_IGN; + sa.sa_flags = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGCHLD, sa)); + pid_t pid = fork(); + if (pid == 0) { + struct rusage rusage_self; + TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0); + // The child has consumed some memory. + TEST_CHECK(rusage_self.ru_maxrss != 0); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), + SyscallFailsWithErrno(ECHILD)); + struct rusage rusage_self; + ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds()); + struct rusage rusage_children; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds()); + // The parent has consumed some memory. + EXPECT_GT(rusage_self.ru_maxrss, 0); + // The child's maxrss should not have propagated up. + EXPECT_EQ(rusage_children.ru_maxrss, 0); +} + +// Verifies that zombie processes do not update their parent's maxrss. Only +// reaped processes should do this. +TEST(GetrusageTest, IgnoreZombie) { + pid_t pid = fork(); + if (pid == 0) { + struct rusage rusage_self; + TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0); + struct rusage rusage_children; + TEST_PCHECK(getrusage(RUSAGE_CHILDREN, &rusage_children) == 0); + // The child has consumed some memory. + TEST_CHECK(rusage_self.ru_maxrss != 0); + // The child has no children of its own. + TEST_CHECK(rusage_children.ru_maxrss == 0); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + // Give the child time to exit. Because we don't call wait, the child should + // remain a zombie. + absl::SleepFor(absl::Seconds(5)); + struct rusage rusage_self; + ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds()); + struct rusage rusage_children; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds()); + // The parent has consumed some memory. + EXPECT_GT(rusage_self.ru_maxrss, 0); + // The child has consumed some memory, but hasn't been reaped. + EXPECT_EQ(rusage_children.ru_maxrss, 0); +} + +TEST(GetrusageTest, Wait4) { + pid_t pid = fork(); + if (pid == 0) { + struct rusage rusage_self; + TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0); + struct rusage rusage_children; + TEST_PCHECK(getrusage(RUSAGE_CHILDREN, &rusage_children) == 0); + // The child has consumed some memory. + TEST_CHECK(rusage_self.ru_maxrss != 0); + // The child has no children of its own. + TEST_CHECK(rusage_children.ru_maxrss == 0); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + struct rusage rusage_children; + int status; + ASSERT_THAT(RetryEINTR(wait4)(pid, &status, 0, &rusage_children), + SyscallSucceeds()); + // The child has consumed some memory, and because it has exited we can get + // its max RSS. + EXPECT_GT(rusage_children.ru_maxrss, 0); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc new file mode 100644 index 000000000..62fc55c72 --- /dev/null +++ b/test/syscalls/linux/inotify.cc @@ -0,0 +1,1489 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { +namespace { + +using ::absl::StreamFormat; +using ::absl::StrFormat; + +constexpr int kBufSize = 1024; + +// C++-friendly version of struct inotify_event. +struct Event { + int32_t wd; + uint32_t mask; + uint32_t cookie; + uint32_t len; + std::string name; + + Event(uint32_t mask, int32_t wd, absl::string_view name, uint32_t cookie) + : wd(wd), + mask(mask), + cookie(cookie), + len(name.size()), + name(std::string(name)) {} + Event(uint32_t mask, int32_t wd, absl::string_view name) + : Event(mask, wd, name, 0) {} + Event(uint32_t mask, int32_t wd) : Event(mask, wd, "", 0) {} + Event() : Event(0, 0, "", 0) {} +}; + +// Prints the symbolic name for a struct inotify_event's 'mask' field. +std::string FlagString(uint32_t flags) { + std::vector names; + +#define EMIT(target) \ + if (flags & target) { \ + names.push_back(#target); \ + flags &= ~target; \ + } + + EMIT(IN_ACCESS); + EMIT(IN_ATTRIB); + EMIT(IN_CLOSE_WRITE); + EMIT(IN_CLOSE_NOWRITE); + EMIT(IN_CREATE); + EMIT(IN_DELETE); + EMIT(IN_DELETE_SELF); + EMIT(IN_MODIFY); + EMIT(IN_MOVE_SELF); + EMIT(IN_MOVED_FROM); + EMIT(IN_MOVED_TO); + EMIT(IN_OPEN); + + EMIT(IN_DONT_FOLLOW); + EMIT(IN_EXCL_UNLINK); + EMIT(IN_ONESHOT); + EMIT(IN_ONLYDIR); + + EMIT(IN_IGNORED); + EMIT(IN_ISDIR); + EMIT(IN_Q_OVERFLOW); + EMIT(IN_UNMOUNT); + +#undef EMIT + + // If we have anything left over at the end, print it as a hex value. + if (flags) { + names.push_back(absl::StrCat("0x", absl::Hex(flags))); + } + + return absl::StrJoin(names, "|"); +} + +std::string DumpEvent(const Event& event) { + return StrFormat( + "%s, wd=%d%s%s", FlagString(event.mask), event.wd, + (event.len > 0) ? StrFormat(", name=%s", event.name) : "", + (event.cookie > 0) ? StrFormat(", cookie=%ud", event.cookie) : ""); +} + +std::string DumpEvents(const std::vector& events, int indent_level) { + std::stringstream ss; + ss << StreamFormat("%d event%s:\n", events.size(), + (events.size() > 1) ? "s" : ""); + int i = 0; + for (const Event& ev : events) { + ss << StreamFormat("%sevents[%d]: %s\n", std::string(indent_level, '\t'), i++, + DumpEvent(ev)); + } + return ss.str(); +} + +// A matcher which takes an expected list of events to match against another +// list of inotify events, in order. This is similar to the ElementsAre matcher, +// but displays more informative messages on mismatch. 
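+// Typical use, as in the tests below:
+//   EXPECT_THAT(events, Are({Event(IN_ACCESS, wd, Basename(file.path()))}));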
+class EventsAreMatcher + : public ::testing::MatcherInterface> { + public: + explicit EventsAreMatcher(std::vector references) + : references_(std::move(references)) {} + + bool MatchAndExplain( + std::vector events, + ::testing::MatchResultListener* const listener) const override { + if (references_.size() != events.size()) { + *listener << StreamFormat("\n\tCount mismatch, got %s", + DumpEvents(events, 2)); + return false; + } + + bool success = true; + for (unsigned int i = 0; i < references_.size(); ++i) { + const Event& reference = references_[i]; + const Event& target = events[i]; + + if (target.mask != reference.mask || target.wd != reference.wd || + target.name != reference.name || target.cookie != reference.cookie) { + *listener << StreamFormat("\n\tMismatch at index %d, want %s, got %s,", + i, DumpEvent(reference), DumpEvent(target)); + success = false; + } + } + + if (!success) { + *listener << StreamFormat("\n\tIn total of %s", DumpEvents(events, 2)); + } + return success; + } + + void DescribeTo(::std::ostream* const os) const override { + *os << StreamFormat("%s", DumpEvents(references_, 1)); + } + + void DescribeNegationTo(::std::ostream* const os) const override { + *os << StreamFormat("mismatch from %s", DumpEvents(references_, 1)); + } + + private: + std::vector references_; +}; + +::testing::Matcher> Are(std::vector events) { + return MakeMatcher(new EventsAreMatcher(std::move(events))); +} + +// Similar to the EventsAre matcher, but the order of events are ignored. +class UnorderedEventsAreMatcher + : public ::testing::MatcherInterface> { + public: + explicit UnorderedEventsAreMatcher(std::vector references) + : references_(std::move(references)) {} + + bool MatchAndExplain( + std::vector events, + ::testing::MatchResultListener* const listener) const override { + if (references_.size() != events.size()) { + *listener << StreamFormat("\n\tCount mismatch, got %s", + DumpEvents(events, 2)); + return false; + } + + std::vector unmatched(references_); + + for (const Event& candidate : events) { + for (auto it = unmatched.begin(); it != unmatched.end();) { + const Event& reference = *it; + if (candidate.mask == reference.mask && candidate.wd == reference.wd && + candidate.name == reference.name && + candidate.cookie == reference.cookie) { + it = unmatched.erase(it); + break; + } else { + ++it; + } + } + } + + // Anything left unmatched? If so, the matcher fails. + if (!unmatched.empty()) { + *listener << StreamFormat("\n\tFailed to match %s", + DumpEvents(unmatched, 2)); + *listener << StreamFormat("\n\tIn total of %s", DumpEvents(events, 2)); + return false; + } + + return true; + } + + void DescribeTo(::std::ostream* const os) const override { + *os << StreamFormat("unordered %s", DumpEvents(references_, 1)); + } + + void DescribeNegationTo(::std::ostream* const os) const override { + *os << StreamFormat("mismatch from unordered %s", + DumpEvents(references_, 1)); + } + + private: + std::vector references_; +}; + +::testing::Matcher> AreUnordered(std::vector events) { + return MakeMatcher(new UnorderedEventsAreMatcher(std::move(events))); +} + +// Reads events from an inotify fd until either EOF, or read returns EAGAIN. +PosixErrorOr> DrainEvents(int fd) { + std::vector events; + while (true) { + int events_size = 0; + if (ioctl(fd, FIONREAD, &events_size) < 0) { + return PosixError(errno, "ioctl(FIONREAD) failed on inotify fd"); + } + // Deliberately use a buffer that is larger than necessary, expecting to + // only read events_size bytes. 
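+    // (FIONREAD on an inotify descriptor reports the number of bytes of
+    // queued events, so events_size already covers everything pending.)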
+ std::vector buf(events_size + kBufSize, 0); + const ssize_t readlen = read(fd, buf.data(), buf.size()); + MaybeSave(); + // Read error? + if (readlen < 0) { + if (errno == EAGAIN) { + // If EAGAIN, no more events at the moment. Return what we have so far. + return events; + } + // Some other read error. Return an error. Right now if we encounter this + // after already reading some events, they get lost. However, we don't + // expect to see any error, and the calling test will fail immediately if + // we signal an error anyways, so this is acceptable. + return PosixError(errno, "read() failed on inotify fd"); + } + if (readlen < static_cast(sizeof(struct inotify_event))) { + // Impossibly short read. + return PosixError( + EIO, + "read() didn't return enough data represent even a single event"); + } + if (readlen != events_size) { + return PosixError(EINVAL, absl::StrCat("read ", readlen, + " bytes, expected ", events_size)); + } + if (readlen == 0) { + // EOF. + return events; + } + + // Normal read. + const char* cursor = buf.data(); + while (cursor < (buf.data() + readlen)) { + struct inotify_event event = {}; + memcpy(&event, cursor, sizeof(struct inotify_event)); + + Event ev; + ev.wd = event.wd; + ev.mask = event.mask; + ev.cookie = event.cookie; + ev.len = event.len; + if (event.len > 0) { + TEST_CHECK(static_cast(sizeof(struct inotify_event) + event.len) <= + readlen); + ev.name = + std::string(cursor + offsetof(struct inotify_event, name)); // NOLINT + // Name field should always be smaller than event.len, otherwise we have + // a buffer overflow. The two sizes aren't equal because the std::string + // constructor will stop at the first null byte, while event.name may be + // padded up to event.len using multiple null bytes. + TEST_CHECK(ev.name.size() <= event.len); + } + + events.push_back(ev); + cursor += sizeof(struct inotify_event) + event.len; + } + } +} + +PosixErrorOr InotifyInit1(int flags) { + int fd; + EXPECT_THAT(fd = inotify_init1(flags), SyscallSucceeds()); + if (fd < 0) { + return PosixError(errno, "inotify_init1() failed"); + } + return FileDescriptor(fd); +} + +PosixErrorOr InotifyAddWatch(int fd, const std::string& path, uint32_t mask) { + int wd; + EXPECT_THAT(wd = inotify_add_watch(fd, path.c_str(), mask), + SyscallSucceeds()); + if (wd < 0) { + return PosixError(errno, "inotify_add_watch() failed"); + } + return wd; +} + +TEST(Inotify, InotifyFdNotWritable) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(0)); + EXPECT_THAT(write(fd.get(), "x", 1), SyscallFailsWithErrno(EBADF)); +} + +TEST(Inotify, NonBlockingReadReturnsEagain) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + std::vector buf(kBufSize, 0); + + // The read below should return fail with EAGAIN because there is no data to + // read and we've specified IN_NONBLOCK. We're guaranteed that there is no + // data to read because we haven't registered any watches yet. + EXPECT_THAT(read(fd.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(Inotify, AddWatchOnInvalidFdFails) { + // Garbage fd. + EXPECT_THAT(inotify_add_watch(-1, "/tmp", IN_ALL_EVENTS), + SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(inotify_add_watch(1337, "/tmp", IN_ALL_EVENTS), + SyscallFailsWithErrno(EBADF)); + + // Non-inotify fds. 
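+  // (These are valid descriptors but not inotify instances, so the expected
+  // error is EINVAL rather than EBADF.)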
+ EXPECT_THAT(inotify_add_watch(0, "/tmp", IN_ALL_EVENTS), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(inotify_add_watch(1, "/tmp", IN_ALL_EVENTS), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(inotify_add_watch(2, "/tmp", IN_ALL_EVENTS), + SyscallFailsWithErrno(EINVAL)); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open("/tmp", O_RDONLY)); + EXPECT_THAT(inotify_add_watch(fd.get(), "/tmp", IN_ALL_EVENTS), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(Inotify, RemovingWatchGeneratesEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds()); + + // Read events, ensure the first event is IN_IGNORED. + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_THAT(events, Are({Event(IN_IGNORED, wd)})); +} + +TEST(Inotify, CanDeleteFileAfterRemovingWatch) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds()); + file1.reset(); +} + +TEST(Inotify, CanRemoveWatchAfterDeletingFile) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + file1.reset(); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd), Event(IN_DELETE_SELF, wd), + Event(IN_IGNORED, wd)})); + + EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallFailsWithErrno(EINVAL)); +} + +TEST(Inotify, DuplicateWatchRemovalFails) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds()); + EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallFailsWithErrno(EINVAL)); +} + +TEST(Inotify, ConcurrentFileDeletionAndWatchRemoval) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const std::string filename = NewTempAbsPathInDir(root.path()); + + auto file_create_delete = [filename]() { + const DisableSave ds; // Too expensive. + for (int i = 0; i < 100; ++i) { + FileDescriptor file_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT, S_IRUSR | S_IWUSR)); + file_fd.reset(); // Close before unlinking (although save is disabled). + EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds()); + } + }; + + const int shared_fd = fd.get(); // We need to pass it to the thread. 
+ auto add_remove_watch = [shared_fd, filename]() { + for (int i = 0; i < 100; ++i) { + int wd = inotify_add_watch(shared_fd, filename.c_str(), IN_ALL_EVENTS); + MaybeSave(); + if (wd != -1) { + // Watch added successfully, try removal. + if (inotify_rm_watch(shared_fd, wd)) { + // If removal fails, the only acceptable reason is if the wd + // is invalid, which will be the case if we try to remove + // the watch after the file has been deleted. + EXPECT_EQ(errno, EINVAL); + } + } else { + // Add watch failed, this should only fail if the target file doesn't + // exist. + EXPECT_EQ(errno, ENOENT); + } + } + }; + + ScopedThread t1(file_create_delete); + ScopedThread t2(add_remove_watch); +} + +TEST(Inotify, DeletingChildGeneratesEvents) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + const std::string file1_path = file1.reset(); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT( + events, + AreUnordered({Event(IN_ATTRIB, file1_wd), Event(IN_DELETE_SELF, file1_wd), + Event(IN_IGNORED, file1_wd), + Event(IN_DELETE, root_wd, Basename(file1_path))})); +} + +TEST(Inotify, CreatingFileGeneratesEvents) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + // Create a new file in the directory. + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + + // The library function we use to create the new file opens it for writing to + // create it and sets permissions on it, so we expect the three extra events. 
+ ASSERT_THAT(events, Are({Event(IN_CREATE, wd, Basename(file1.path())), + Event(IN_OPEN, wd, Basename(file1.path())), + Event(IN_CLOSE_WRITE, wd, Basename(file1.path())), + Event(IN_ATTRIB, wd, Basename(file1.path()))})); +} + +TEST(Inotify, ReadingFileGeneratesAccessEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), "some content", TempPath::kDefaultFileMode)); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + char buf; + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_ACCESS, wd, Basename(file1.path()))})); +} + +TEST(Inotify, WritingFileGeneratesModifyEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + const std::string data = "some content"; + EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()), + SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_MODIFY, wd, Basename(file1.path()))})); +} + +TEST(Inotify, WatchSetAfterOpenReportsCloseFdEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + FileDescriptor file1_fd_writable = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); + FileDescriptor file1_fd_not_writable = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + file1_fd_writable.reset(); // Close file1_fd_writable. + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_CLOSE_WRITE, wd, Basename(file1.path()))})); + + file1_fd_not_writable.reset(); // Close file1_fd_not_writable. 
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, + Are({Event(IN_CLOSE_NOWRITE, wd, Basename(file1.path()))})); +} + +TEST(Inotify, ChildrenDeletionInWatchedDirGeneratesEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + const std::string file1_path = file1.reset(); + const std::string dir1_path = dir1.release(); + EXPECT_THAT(rmdir(dir1_path.c_str()), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + + ASSERT_THAT(events, + Are({Event(IN_DELETE, wd, Basename(file1_path)), + Event(IN_DELETE | IN_ISDIR, wd, Basename(dir1_path))})); +} + +TEST(Inotify, WatchTargetDeletionGeneratesEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + EXPECT_THAT(rmdir(root.path().c_str()), SyscallSucceeds()); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_DELETE_SELF, wd), Event(IN_IGNORED, wd)})); +} + +TEST(Inotify, MoveGeneratesEvents) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const TempPath dir1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + const TempPath dir2 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int dir1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), dir1.path(), IN_ALL_EVENTS)); + const int dir2_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), dir2.path(), IN_ALL_EVENTS)); + // Test move from root -> root. + std::string newpath = NewTempAbsPathInDir(root.path()); + std::string oldpath = file1.release(); + EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds()); + file1.reset(newpath); + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT( + events, + Are({Event(IN_MOVED_FROM, root_wd, Basename(oldpath), events[0].cookie), + Event(IN_MOVED_TO, root_wd, Basename(newpath), events[1].cookie)})); + EXPECT_NE(events[0].cookie, 0); + EXPECT_EQ(events[0].cookie, events[1].cookie); + uint32_t last_cookie = events[0].cookie; + + // Test move from root -> root/dir1. + newpath = NewTempAbsPathInDir(dir1.path()); + oldpath = file1.release(); + EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds()); + file1.reset(newpath); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT( + events, + Are({Event(IN_MOVED_FROM, root_wd, Basename(oldpath), events[0].cookie), + Event(IN_MOVED_TO, dir1_wd, Basename(newpath), events[1].cookie)})); + // Cookies should be distinct between distinct rename events. 
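+  // (Each rename produces one cookie, shared only by its IN_MOVED_FROM /
+  // IN_MOVED_TO pair so that readers can correlate the two halves.)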
+  EXPECT_NE(events[0].cookie, last_cookie);
+  EXPECT_EQ(events[0].cookie, events[1].cookie);
+  last_cookie = events[0].cookie;
+
+  // Test move from root/dir1 -> root/dir2.
+  newpath = NewTempAbsPathInDir(dir2.path());
+  oldpath = file1.release();
+  EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds());
+  file1.reset(newpath);
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(
+      events,
+      Are({Event(IN_MOVED_FROM, dir1_wd, Basename(oldpath), events[0].cookie),
+           Event(IN_MOVED_TO, dir2_wd, Basename(newpath), events[1].cookie)}));
+  EXPECT_NE(events[0].cookie, last_cookie);
+  EXPECT_EQ(events[0].cookie, events[1].cookie);
+  last_cookie = events[0].cookie;
+}
+
+TEST(Inotify, MoveWatchedTargetGeneratesEvents) {
+  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+  TempPath file1 =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+  const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+  const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+  const std::string newpath = NewTempAbsPathInDir(root.path());
+  const std::string oldpath = file1.release();
+  EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds());
+  file1.reset(newpath);
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(
+      events,
+      Are({Event(IN_MOVED_FROM, root_wd, Basename(oldpath), events[0].cookie),
+           Event(IN_MOVED_TO, root_wd, Basename(newpath), events[1].cookie),
+           // Self move events do not have a cookie.
+           Event(IN_MOVE_SELF, file1_wd)}));
+  EXPECT_NE(events[0].cookie, 0);
+  EXPECT_EQ(events[0].cookie, events[1].cookie);
+}
+
+TEST(Inotify, CoalesceEvents) {
+  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+  const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      root.path(), "some content", TempPath::kDefaultFileMode));
+
+  FileDescriptor file1_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+  // Read the file a few times. This would generate multiple IN_ACCESS
+  // events, but they should get coalesced to a single event.
+  char buf;
+  EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+  EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+  EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+  EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+  // Use the close event to verify that we haven't simply left the additional
+  // IN_ACCESS events unread.
+  file1_fd.reset();  // Close file1_fd.
+
+  const std::string file1_name = std::string(Basename(file1.path()));
+  std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(events, Are({Event(IN_ACCESS, wd, file1_name),
+                           Event(IN_CLOSE_NOWRITE, wd, file1_name)}));
+
+  // Now let's try interleaving other events into a stream of repeated events.
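+  // Only identical, consecutive events coalesce; the IN_MODIFY events
+  // interleaved below should keep the surrounding IN_ACCESS events separate.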
+ file1_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR)); + + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + EXPECT_THAT(write(file1_fd.get(), "x", 1), SyscallSucceeds()); + EXPECT_THAT(write(file1_fd.get(), "x", 1), SyscallSucceeds()); + EXPECT_THAT(write(file1_fd.get(), "x", 1), SyscallSucceeds()); + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + + file1_fd.reset(); // Close the file. + + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT( + events, + Are({Event(IN_OPEN, wd, file1_name), Event(IN_ACCESS, wd, file1_name), + Event(IN_MODIFY, wd, file1_name), Event(IN_ACCESS, wd, file1_name), + Event(IN_CLOSE_WRITE, wd, file1_name)})); + + // Ensure events aren't coalesced if they are from different files. + const TempPath file2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), "some content", TempPath::kDefaultFileMode)); + // Discard events resulting from creation of file2. + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + + file1_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + FileDescriptor file2_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file2.path(), O_RDONLY)); + + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + EXPECT_THAT(read(file2_fd.get(), &buf, 1), SyscallSucceeds()); + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + + // Close both files. + file1_fd.reset(); + file2_fd.reset(); + + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + const std::string file2_name = std::string(Basename(file2.path())); + ASSERT_THAT( + events, + Are({Event(IN_OPEN, wd, file1_name), Event(IN_OPEN, wd, file2_name), + Event(IN_ACCESS, wd, file1_name), Event(IN_ACCESS, wd, file2_name), + Event(IN_ACCESS, wd, file1_name), + Event(IN_CLOSE_NOWRITE, wd, file1_name), + Event(IN_CLOSE_NOWRITE, wd, file2_name)})); +} + +TEST(Inotify, ClosingInotifyFdWithoutRemovingWatchesWorks) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + // Note: The check on close will happen in FileDescriptor::~FileDescriptor(). +} + +TEST(Inotify, NestedWatches) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), "some content", TempPath::kDefaultFileMode)); + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + // Read from file1. This should generate an event for both watches. 
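+  // The directory watch reports the child's name, while the watch on the
+  // file itself reports an unnamed event, as asserted below.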
+ char buf; + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_ACCESS, root_wd, Basename(file1.path())), + Event(IN_ACCESS, file1_wd)})); +} + +TEST(Inotify, ConcurrentThreadsGeneratingEvents) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + std::vector files; + files.reserve(10); + for (int i = 0; i < 10; i++) { + files.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), "some content", TempPath::kDefaultFileMode))); + } + + auto test_thread = [&files]() { + uint32_t seed = time(nullptr); + for (int i = 0; i < 20; i++) { + const TempPath& file = files[rand_r(&seed) % files.size()]; + const FileDescriptor file_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY)); + TEST_PCHECK(write(file_fd.get(), "x", 1) == 1); + } + }; + + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + std::list threads; + for (int i = 0; i < 3; i++) { + threads.emplace_back(test_thread); + } + for (auto& t : threads) { + t.Join(); + } + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + // 3 threads doing 20 iterations, 3 events per iteration (open, write, + // close). However, some events may be coalesced, and we can't reliably + // predict how they'll be coalesced since the test threads aren't + // synchronized. We can only check that we aren't getting unexpected events. + for (const Event& ev : events) { + EXPECT_NE(ev.mask & (IN_OPEN | IN_MODIFY | IN_CLOSE_WRITE), 0); + } +} + +TEST(Inotify, ReadWithTooSmallBufferFails) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + // Open the file to queue an event. This event will not have a filename, so + // reading from the inotify fd should return sizeof(struct inotify_event) + // bytes of data. + FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + std::vector buf(kBufSize, 0); + ssize_t readlen; + + // Try a buffer too small to hold any potential event. This is rejected + // outright without the event being dequeued. + EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event) - 1), + SyscallFailsWithErrno(EINVAL)); + // Try a buffer just large enough. This should succeeed. + EXPECT_THAT( + readlen = read(fd.get(), buf.data(), sizeof(struct inotify_event)), + SyscallSucceeds()); + EXPECT_EQ(readlen, sizeof(struct inotify_event)); + // Event queue is now empty, the next read should return EAGAIN. + EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event)), + SyscallFailsWithErrno(EAGAIN)); + + // Now put a watch on the directory, so that generated events contain a name. + EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds()); + + // Drain the event generated from the watch removal. + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + file1_fd.reset(); // Close file to generate an event. 
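+  // This close event comes from the directory watch, so it carries the file's
+  // name; a successful read now needs room for sizeof(struct inotify_event)
+  // plus the NUL-padded name.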
+ + // Try a buffer too small to hold any event and one too small to hold an event + // with a name. These should both fail without consuming the event. + EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event) - 1), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event)), + SyscallFailsWithErrno(EINVAL)); + // Now try with a large enough buffer. This should return the one event. + EXPECT_THAT(readlen = read(fd.get(), buf.data(), buf.size()), + SyscallSucceeds()); + EXPECT_GE(readlen, + sizeof(struct inotify_event) + Basename(file1.path()).size()); + // With the single event read, the queue should once again be empty. + EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(Inotify, BlockingReadOnInotifyFd) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(0)); + const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), "some content", TempPath::kDefaultFileMode)); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + // Spawn a thread performing a blocking read for new events on the inotify fd. + std::vector buf(kBufSize, 0); + const int shared_fd = fd.get(); // The thread needs it. + ScopedThread t([shared_fd, &buf]() { + ssize_t readlen; + EXPECT_THAT(readlen = read(shared_fd, buf.data(), buf.size()), + SyscallSucceeds()); + }); + + // Perform a read on the watched file, which should generate an IN_ACCESS + // event, unblocking the event_reader thread. + char c; + EXPECT_THAT(read(file1_fd.get(), &c, 1), SyscallSucceeds()); + + // Wait for the thread to read the event and exit. + t.Join(); + + // Make sure the event we got back is sane. + uint32_t event_mask; + memcpy(&event_mask, buf.data() + offsetof(struct inotify_event, mask), + sizeof(event_mask)); + EXPECT_EQ(event_mask, IN_ACCESS); +} + +TEST(Inotify, WatchOnRelativePath) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), "some content", TempPath::kDefaultFileMode)); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + + // Change working directory to root. + const char* old_working_dir = get_current_dir_name(); + EXPECT_THAT(chdir(root.path().c_str()), SyscallSucceeds()); + + // Add a watch on file1 with a relative path. + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), std::string(Basename(file1.path())), IN_ALL_EVENTS)); + + // Perform a read on file1, this should generate an IN_ACCESS event. + char c; + EXPECT_THAT(read(file1_fd.get(), &c, 1), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_THAT(events, Are({Event(IN_ACCESS, wd)})); + + // Explicitly reset the working directory so that we don't continue to + // reference "root". Once the test ends, "root" will get unlinked. If we + // continue to hold a reference, random save/restore tests can fail if a save + // is triggered after "root" is unlinked; we can't save deleted fs objects + // with active references. 
+ EXPECT_THAT(chdir(old_working_dir), SyscallSucceeds()); +} + +TEST(Inotify, ZeroLengthReadWriteDoesNotGenerateEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const char kContent[] = "some content"; + TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), kContent, TempPath::kDefaultFileMode)); + const int kContentSize = sizeof(kContent) - 1; + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + std::vector buf(kContentSize, 0); + // Read all available data. + ssize_t readlen; + EXPECT_THAT(readlen = read(file1_fd.get(), buf.data(), kContentSize), + SyscallSucceeds()); + EXPECT_EQ(readlen, kContentSize); + // Drain all events and make sure we got the IN_ACCESS for the read. + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_THAT(events, Are({Event(IN_ACCESS, wd, Basename(file1.path()))})); + + // Now try read again. This should be a 0-length read, since we're at EOF. + char c; + EXPECT_THAT(readlen = read(file1_fd.get(), &c, 1), SyscallSucceeds()); + EXPECT_EQ(readlen, 0); + // We should have no new events. + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_TRUE(events.empty()); + + // Try issuing a zero-length read. + EXPECT_THAT(readlen = read(file1_fd.get(), &c, 0), SyscallSucceeds()); + EXPECT_EQ(readlen, 0); + // We should have no new events. + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_TRUE(events.empty()); + + // Try issuing a zero-length write. + ssize_t writelen; + EXPECT_THAT(writelen = write(file1_fd.get(), &c, 0), SyscallSucceeds()); + EXPECT_EQ(writelen, 0); + // We should have no new events. + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_TRUE(events.empty()); +} + +TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor root_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(root.path(), O_RDONLY)); + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR)); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + auto verify_chmod_events = [&]() { + std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_ATTRIB, root_wd, Basename(file1.path())), + Event(IN_ATTRIB, file1_wd)})); + }; + + // Don't do cooperative S/R tests for any of the {f}chmod* syscalls below, the + // test will always fail because nodes cannot be saved when they have stricted + // permissions than the original host node. + const DisableSave ds; + + // Chmod. + ASSERT_THAT(chmod(file1.path().c_str(), S_IWGRP), SyscallSucceeds()); + verify_chmod_events(); + + // Fchmod. + ASSERT_THAT(fchmod(file1_fd.get(), S_IRGRP | S_IWGRP), SyscallSucceeds()); + verify_chmod_events(); + + // Fchmodat. 
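+  // fchmodat() addresses the file relative to root_fd but should produce the
+  // same IN_ATTRIB pair as the absolute-path variants above.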
+ const std::string file1_basename = std::string(Basename(file1.path())); + ASSERT_THAT(fchmodat(root_fd.get(), file1_basename.c_str(), S_IWGRP, 0), + SyscallSucceeds()); + verify_chmod_events(); +} + +TEST(Inotify, TruncateGeneratesModifyEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + auto verify_truncate_events = [&]() { + std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_MODIFY, root_wd, Basename(file1.path())), + Event(IN_MODIFY, file1_wd)})); + }; + + // Truncate. + EXPECT_THAT(truncate(file1.path().c_str(), 4096), SyscallSucceeds()); + verify_truncate_events(); + + // Ftruncate. + EXPECT_THAT(ftruncate(file1_fd.get(), 8192), SyscallSucceeds()); + verify_truncate_events(); + + // No events if truncate fails. + EXPECT_THAT(ftruncate(file1_fd.get(), -1), SyscallFailsWithErrno(EINVAL)); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({})); +} + +TEST(Inotify, GetdentsGeneratesAccessEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + // This internally calls getdents(2). We also expect to see an open/close + // event for the dirfd. + ASSERT_NO_ERRNO_AND_VALUE(ListDir(root.path(), false)); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + + // Linux only seems to generate access events on getdents() on some + // calls. Allow the test to pass even if it isn't generated. gVisor will + // always generate the IN_ACCESS event so the test will at least ensure gVisor + // behaves reasonably. + int i = 0; + EXPECT_EQ(events[i].mask, IN_OPEN | IN_ISDIR); + ++i; + if (IsRunningOnGvisor()) { + EXPECT_EQ(events[i].mask, IN_ACCESS | IN_ISDIR); + ++i; + } else { + if (events[i].mask == (IN_ACCESS | IN_ISDIR)) { + // Skip over the IN_ACCESS event on Linux, it only shows up some of the + // time so we can't assert its existence. + ++i; + } + } + EXPECT_EQ(events[i].mask, IN_CLOSE_NOWRITE | IN_ISDIR); +} + +TEST(Inotify, MknodGeneratesCreateEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + const TempPath file1(root.path() + "/file1"); + const int rc = mknod(file1.path().c_str(), S_IFREG, 0); + // mknod(2) is only supported on tmpfs in the sandbox. 
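+  // Skip rather than fail if the sandboxed filesystem rejects mknod(2);
+  // otherwise creating a regular file should be reported as a plain IN_CREATE
+  // without IN_ISDIR.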
+ SKIP_IF(IsRunningOnGvisor() && rc != 0); + ASSERT_THAT(rc, SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_CREATE, wd, Basename(file1.path()))})); +} + +TEST(Inotify, SymlinkGeneratesCreateEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const TempPath link1(NewTempAbsPathInDir(root.path())); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + ASSERT_THAT(symlink(file1.path().c_str(), link1.path().c_str()), + SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + + ASSERT_THAT(events, Are({Event(IN_CREATE, root_wd, Basename(link1.path()))})); +} + +TEST(Inotify, LinkGeneratesAttribAndCreateEvents) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const TempPath link1(root.path() + "/link1"); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + const int rc = link(file1.path().c_str(), link1.path().c_str()); + // link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(IsRunningOnGvisor() && rc != 0 && errno == EPERM); + ASSERT_THAT(rc, SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_ATTRIB, file1_wd), + Event(IN_CREATE, root_wd, Basename(link1.path()))})); +} + +TEST(Inotify, HardlinksReuseSameWatch) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + TempPath link1(root.path() + "/link1"); + const int rc = link(file1.path().c_str(), link1.path().c_str()); + // link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(IsRunningOnGvisor() && rc != 0 && errno == EPERM); + ASSERT_THAT(rc, SyscallSucceeds()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + const int link1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), link1.path(), IN_ALL_EVENTS)); + + // The watch descriptors for watches on different links to the same file + // should be identical. + EXPECT_NE(root_wd, file1_wd); + EXPECT_EQ(file1_wd, link1_wd); + + FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); + + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, + AreUnordered({Event(IN_OPEN, root_wd, Basename(file1.path())), + Event(IN_OPEN, file1_wd)})); + + // For the next step, we want to ensure all fds to the file are closed. Do + // that now and drain the resulting events. 
+ file1_fd.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, + Are({Event(IN_CLOSE_WRITE, root_wd, Basename(file1.path())), + Event(IN_CLOSE_WRITE, file1_wd)})); + + // Try removing the link and let's see what events show up. Note that after + // this, we still have a link to the file so the watch shouldn't be + // automatically removed. + const std::string link1_path = link1.reset(); + + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_ATTRIB, link1_wd), + Event(IN_DELETE, root_wd, Basename(link1_path))})); + + // Now remove the other link. Since this is the last link to the file, the + // watch should be automatically removed. + const std::string file1_path = file1.reset(); + + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT( + events, + AreUnordered({Event(IN_ATTRIB, file1_wd), Event(IN_DELETE_SELF, file1_wd), + Event(IN_IGNORED, file1_wd), + Event(IN_DELETE, root_wd, Basename(file1_path))})); +} + +TEST(Inotify, MkdirGeneratesCreateEventWithDirFlag) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + const TempPath dir1(NewTempAbsPathInDir(root.path())); + ASSERT_THAT(mkdir(dir1.path().c_str(), 0777), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT( + events, + Are({Event(IN_CREATE | IN_ISDIR, root_wd, Basename(dir1.path()))})); +} + +TEST(Inotify, MultipleInotifyInstancesAndWatchesAllGetEvents) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); + constexpr int kNumFds = 30; + std::vector inotify_fds; + + for (int i = 0; i < kNumFds; ++i) { + const DisableSave ds; // Too expensive. + inotify_fds.emplace_back( + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK))); + const FileDescriptor& fd = inotify_fds[inotify_fds.size() - 1]; // Back. + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + } + + const std::string data = "some content"; + EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()), + SyscallSucceeds()); + + for (const FileDescriptor& fd : inotify_fds) { + const DisableSave ds; // Too expensive. 
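+    // Watch descriptors are per inotify instance and are handed out in the
+    // order the watches were added, so on every instance the directory watch
+    // is wd 1 and the file watch is wd 2, as checked below.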
+ const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + if (events.size() >= 2) { + EXPECT_EQ(events[0].mask, IN_MODIFY); + EXPECT_EQ(events[0].wd, 1); + EXPECT_EQ(events[0].name, Basename(file1.path())); + EXPECT_EQ(events[1].mask, IN_MODIFY); + EXPECT_EQ(events[1].wd, 2); + EXPECT_EQ(events[1].name, ""); + } + } +} + +TEST(Inotify, EventsGoUpAtMostOneLevel) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath dir1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + const int dir1_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), dir1.path(), IN_ALL_EVENTS)); + + const std::string file1_path = file1.reset(); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_DELETE, dir1_wd, Basename(file1_path))})); +} + +TEST(Inotify, DuplicateWatchReturnsSameWatchDescriptor) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd1 = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + const int wd2 = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + EXPECT_EQ(wd1, wd2); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + // The watch shouldn't be duplicated, we only expect one event. + ASSERT_THAT(events, Are({Event(IN_OPEN, wd1)})); +} + +TEST(Inotify, UnmatchedEventsAreDiscarded) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(fd.get(), file1.path(), IN_ACCESS)); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + // We only asked for access events, the open event should be discarded. + ASSERT_THAT(events, Are({})); +} + +TEST(Inotify, AddWatchWithInvalidEventMaskFails) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + EXPECT_THAT(inotify_add_watch(fd.get(), root.path().c_str(), 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(Inotify, AddWatchOnInvalidPathFails) { + const TempPath nonexistent(NewTempAbsPath()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + // Non-existent path. + EXPECT_THAT( + inotify_add_watch(fd.get(), nonexistent.path().c_str(), IN_CREATE), + SyscallFailsWithErrno(ENOENT)); + + // Garbage path pointer. 
+ EXPECT_THAT(inotify_add_watch(fd.get(), nullptr, IN_CREATE), + SyscallFailsWithErrno(EFAULT)); +} + +TEST(Inotify, InOnlyDirFlagRespected) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + EXPECT_THAT( + inotify_add_watch(fd.get(), root.path().c_str(), IN_ACCESS | IN_ONLYDIR), + SyscallSucceeds()); + + EXPECT_THAT( + inotify_add_watch(fd.get(), file1.path().c_str(), IN_ACCESS | IN_ONLYDIR), + SyscallFailsWithErrno(ENOTDIR)); +} + +TEST(Inotify, MaskAddMergesWithExistingEventMask) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_OPEN | IN_CLOSE_WRITE)); + + const std::string data = "some content"; + EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()), + SyscallSucceeds()); + + // We shouldn't get any events, since IN_MODIFY wasn't in the event mask. + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({})); + + // Add IN_MODIFY to event mask. + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_MODIFY | IN_MASK_ADD)); + + EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()), + SyscallSucceeds()); + + // This time we should get the modify event. + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_MODIFY, wd)})); + + // Now close the fd. If the modify event was added to the event mask rather + // than replacing the event mask we won't get the close event. + file1_fd.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_CLOSE_WRITE, wd)})); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc new file mode 100644 index 000000000..bee0ba1b3 --- /dev/null +++ b/test/syscalls/linux/ioctl.cc @@ -0,0 +1,375 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
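+// Illustrative sketch (not part of the original change): the inotify tests
+// above read raw bytes from the inotify fd and decode them by hand. A helper
+// along these lines shows the wire layout those tests rely on -- a fixed
+// struct inotify_event header followed by an optional NUL-padded name of
+// ev.len bytes. ParsedEvent and ParseInotifyBuffer are hypothetical names;
+// this is not the implementation of the DrainEvents/Event helpers the tests
+// actually use.
+//
+//   #include <sys/inotify.h>
+//   #include <cstdint>
+//   #include <cstring>
+//   #include <string>
+//   #include <vector>
+//
+//   struct ParsedEvent {
+//     int wd;
+//     uint32_t mask;
+//     uint32_t cookie;
+//     std::string name;
+//   };
+//
+//   std::vector<ParsedEvent> ParseInotifyBuffer(const char* buf, size_t len) {
+//     std::vector<ParsedEvent> out;
+//     size_t off = 0;
+//     while (off + sizeof(struct inotify_event) <= len) {
+//       struct inotify_event ev;
+//       memcpy(&ev, buf + off, sizeof(ev));  // Header may be unaligned in buf.
+//       ParsedEvent p{ev.wd, ev.mask, ev.cookie, {}};
+//       if (ev.len > 0) {
+//         // ev.len includes NUL padding; constructing from a C string trims it.
+//         p.name = std::string(buf + off + sizeof(ev));
+//       }
+//       out.push_back(p);
+//       off += sizeof(ev) + ev.len;
+//     }
+//     return out;
+//   }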
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +bool CheckNonBlocking(int fd) { + int ret = fcntl(fd, F_GETFL, 0); + TEST_CHECK(ret != -1); + return (ret & O_NONBLOCK) == O_NONBLOCK; +} + +bool CheckCloExec(int fd) { + int ret = fcntl(fd, F_GETFD, 0); + TEST_CHECK(ret != -1); + return (ret & FD_CLOEXEC) == FD_CLOEXEC; +} + +class IoctlTest : public ::testing::Test { + protected: + void SetUp() override { + ASSERT_THAT(fd_ = open("/dev/null", O_RDONLY), SyscallSucceeds()); + } + + void TearDown() override { + if (fd_ >= 0) { + ASSERT_THAT(close(fd_), SyscallSucceeds()); + fd_ = -1; + } + } + + int fd() const { return fd_; } + + private: + int fd_ = -1; +}; + +TEST_F(IoctlTest, BadFileDescriptor) { + EXPECT_THAT(ioctl(-1 /* fd */, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(IoctlTest, InvalidControlNumber) { + EXPECT_THAT(ioctl(STDOUT_FILENO, 0), SyscallFailsWithErrno(ENOTTY)); +} + +TEST_F(IoctlTest, FIONBIOSucceeds) { + EXPECT_FALSE(CheckNonBlocking(fd())); + int set = 1; + EXPECT_THAT(ioctl(fd(), FIONBIO, &set), SyscallSucceeds()); + EXPECT_TRUE(CheckNonBlocking(fd())); + set = 0; + EXPECT_THAT(ioctl(fd(), FIONBIO, &set), SyscallSucceeds()); + EXPECT_FALSE(CheckNonBlocking(fd())); +} + +TEST_F(IoctlTest, FIONBIOFails) { + EXPECT_THAT(ioctl(fd(), FIONBIO, nullptr), SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(IoctlTest, FIONCLEXSucceeds) { + EXPECT_THAT(ioctl(fd(), FIONCLEX), SyscallSucceeds()); + EXPECT_FALSE(CheckCloExec(fd())); +} + +TEST_F(IoctlTest, FIOCLEXSucceeds) { + EXPECT_THAT(ioctl(fd(), FIOCLEX), SyscallSucceeds()); + EXPECT_TRUE(CheckCloExec(fd())); +} + +TEST_F(IoctlTest, FIOASYNCFails) { + EXPECT_THAT(ioctl(fd(), FIOASYNC, nullptr), SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(IoctlTest, FIOASYNCSucceeds) { + // Not all FDs support FIOASYNC. + const FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + int before = -1; + ASSERT_THAT(before = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + + int set = 1; + EXPECT_THAT(ioctl(s.get(), FIOASYNC, &set), SyscallSucceeds()); + + int after_set = -1; + ASSERT_THAT(after_set = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + EXPECT_EQ(after_set, before | O_ASYNC) << "before was " << before; + + set = 0; + EXPECT_THAT(ioctl(s.get(), FIOASYNC, &set), SyscallSucceeds()); + + ASSERT_THAT(fcntl(s.get(), F_GETFL), SyscallSucceedsWithValue(before)); +} + +/* Count of the number of SIGIOs handled. */ +static volatile int io_received = 0; + +void inc_io_handler(int sig, siginfo_t* siginfo, void* arg) { io_received++; } + +TEST_F(IoctlTest, FIOASYNCNoTarget) { + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + // Count SIGIOs received. + io_received = 0; + struct sigaction sa; + sa.sa_sigaction = inc_io_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa)); + + // Actually allow SIGIO delivery. 
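+  //
+  // No owner is ever set with FIOSETOWN/F_SETOWN here, so even with O_ASYNC
+  // enabled the write below is not expected to raise SIGIO.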
+ auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO)); + + int set = 1; + EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds()); + + constexpr char kData[] = "abc"; + ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)), + SyscallSucceedsWithValue(sizeof(kData))); + + EXPECT_EQ(io_received, 0); +} + +TEST_F(IoctlTest, FIOASYNCSelfTarget) { + // FIXME: gVisor erroneously sends SIGIO on close(2), which would + // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that + // that the close signal is ignored. + struct sigaction sa; + sa.sa_handler = SIG_IGN; + auto early_sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + // Count SIGIOs received. + io_received = 0; + sa.sa_sigaction = inc_io_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa)); + + // Actually allow SIGIO delivery. + auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO)); + + int set = 1; + EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds()); + + pid_t pid = getpid(); + EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds()); + + constexpr char kData[] = "abc"; + ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)), + SyscallSucceedsWithValue(sizeof(kData))); + + EXPECT_EQ(io_received, 1); +} + +// Equivalent to FIOASYNCSelfTarget except that FIOSETOWN is called before +// FIOASYNC. +TEST_F(IoctlTest, FIOASYNCSelfTarget2) { + // FIXME: gVisor erroneously sends SIGIO on close(2), which would + // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that + // that the close signal is ignored. + struct sigaction sa; + sa.sa_handler = SIG_IGN; + auto early_sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + // Count SIGIOs received. + io_received = 0; + sa.sa_sigaction = inc_io_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa)); + + // Actually allow SIGIO delivery. + auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO)); + + pid_t pid = getpid(); + EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds()); + + int set = 1; + EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds()); + + constexpr char kData[] = "abc"; + ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)), + SyscallSucceedsWithValue(sizeof(kData))); + + EXPECT_EQ(io_received, 1); +} + +TEST_F(IoctlTest, FIOASYNCInvalidPID) { + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + int set = 1; + ASSERT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds()); + pid_t pid = INT_MAX; + // This succeeds (with behavior equivalent to a pid of 0) in Linux prior to + // f73127356f34 "fs/fcntl: return -ESRCH in f_setown when pid/pgid can't be + // found", and fails with EPERM after that commit. + EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), + AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ESRCH))); +} + +TEST_F(IoctlTest, FIOASYNCUnsetTarget) { + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + // Count SIGIOs received. 
+ io_received = 0; + struct sigaction sa; + sa.sa_sigaction = inc_io_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa)); + + // Actually allow SIGIO delivery. + auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO)); + + int set = 1; + EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds()); + + pid_t pid = getpid(); + EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds()); + + // Passing a PID of 0 unsets the target. + pid = 0; + EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds()); + + constexpr char kData[] = "abc"; + ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)), + SyscallSucceedsWithValue(sizeof(kData))); + + EXPECT_EQ(io_received, 0); +} + +using IoctlTestSIOCGIFCONF = SimpleSocketTest; + +TEST_P(IoctlTestSIOCGIFCONF, ValidateNoArrayGetsLength) { + auto fd = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Validate that no array can be used to get the length required. + struct ifconf ifconf = {}; + ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds()); + ASSERT_GT(ifconf.ifc_len, 0); +} + +// This test validates that we will only return a partial array list and not +// partial ifrreq structs. +TEST_P(IoctlTestSIOCGIFCONF, ValidateNoPartialIfrsReturned) { + auto fd = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + struct ifreq ifr = {}; + struct ifconf ifconf = {}; + ifconf.ifc_len = sizeof(ifr) - 1; // One byte too few. + ifconf.ifc_ifcu.ifcu_req = 𝔦 + + ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds()); + ASSERT_EQ(ifconf.ifc_len, 0); + ASSERT_EQ(ifr.ifr_name[0], '\0'); // Nothing is returned. + + ifconf.ifc_len = sizeof(ifreq); + ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds()); + ASSERT_GT(ifconf.ifc_len, 0); + ASSERT_NE(ifr.ifr_name[0], '\0'); // An interface can now be returned. +} + +TEST_P(IoctlTestSIOCGIFCONF, ValidateLoopbackIsPresent) { + auto fd = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + struct ifconf ifconf = {}; + struct ifreq ifr[10] = {}; // Storage for up to 10 interfaces. + + ifconf.ifc_req = ifr; + ifconf.ifc_len = sizeof(ifr); + + ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds()); + size_t num_if = ifconf.ifc_len / sizeof(struct ifreq); + + // We should have at least one interface. + ASSERT_GE(num_if, 1); + + // One of the interfaces should be a loopback. + bool found_loopback = false; + for (size_t i = 0; i < num_if; ++i) { + if (strcmp(ifr[i].ifr_name, "lo") == 0) { + // SIOCGIFCONF returns the ipv4 address of the interface, let's check it. + ASSERT_EQ(ifr[i].ifr_addr.sa_family, AF_INET); + + // Validate the address is correct for loopback. 
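+      // SIOCGIFCONF only reports IPv4 addresses, so the entry's sockaddr can
+      // safely be treated as a sockaddr_in holding 127.0.0.1.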
+ sockaddr_in* sin = reinterpret_cast(&ifr[i].ifr_addr); + ASSERT_EQ(htonl(sin->sin_addr.s_addr), INADDR_LOOPBACK); + + found_loopback = true; + break; + } + } + ASSERT_TRUE(found_loopback); +} + +std::vector IoctlSocketTypes() { + return {SimpleSocket(AF_UNIX, SOCK_STREAM, 0), + SimpleSocket(AF_UNIX, SOCK_DGRAM, 0), + SimpleSocket(AF_INET, SOCK_STREAM, 0), + SimpleSocket(AF_INET6, SOCK_STREAM, 0), + SimpleSocket(AF_INET, SOCK_DGRAM, 0), + SimpleSocket(AF_INET6, SOCK_DGRAM, 0)}; +} + +INSTANTIATE_TEST_CASE_P(IoctlTest, IoctlTestSIOCGIFCONF, + ::testing::ValuesIn(IoctlSocketTypes())); + +} // namespace + +TEST_F(IoctlTest, FIOGETOWNSucceeds) { + const FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + int get = -1; + ASSERT_THAT(ioctl(s.get(), FIOGETOWN, &get), SyscallSucceeds()); + EXPECT_EQ(get, 0); +} + +TEST_F(IoctlTest, SIOCGPGRPSucceeds) { + const FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + int get = -1; + ASSERT_THAT(ioctl(s.get(), SIOCGPGRP, &get), SyscallSucceeds()); + EXPECT_EQ(get, 0); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc new file mode 100644 index 000000000..1659d3d83 --- /dev/null +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -0,0 +1,78 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/ip_socket_test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +std::string DescribeSocketType(int type) { + return absl::StrCat(((type & SOCK_NONBLOCK) != 0) ? "non-blocking " : "", + ((type & SOCK_CLOEXEC) != 0) ? 
"close-on-exec " : ""); +} + +} // namespace + +SocketPairKind IPv6TCPAcceptBindSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv6 TCP socket"); + return SocketPairKind{ + description, TCPAcceptBindSocketPairCreator(AF_INET6, type | SOCK_STREAM, + 0, /* dual_stack = */ false)}; +} + +SocketPairKind IPv4TCPAcceptBindSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv4 TCP socket"); + return SocketPairKind{ + description, TCPAcceptBindSocketPairCreator(AF_INET, type | SOCK_STREAM, + 0, /* dual_stack = */ false)}; +} + +SocketPairKind DualStackTCPAcceptBindSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "dual stack TCP socket"); + return SocketPairKind{ + description, TCPAcceptBindSocketPairCreator(AF_INET6, type | SOCK_STREAM, + 0, /* dual_stack = */ true)}; +} + +SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv6 UDP socket"); + return SocketPairKind{description, UDPBidirectionalBindSocketPairCreator( + AF_INET6, type | SOCK_DGRAM, 0, + /* dual_stack = */ false)}; +} + +SocketPairKind IPv4UDPBidirectionalBindSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv4 UDP socket"); + return SocketPairKind{description, UDPBidirectionalBindSocketPairCreator( + AF_INET, type | SOCK_DGRAM, 0, + /* dual_stack = */ false)}; +} + +SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "dual stack UDP socket"); + return SocketPairKind{description, UDPBidirectionalBindSocketPairCreator( + AF_INET6, type | SOCK_DGRAM, 0, + /* dual_stack = */ true)}; +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h new file mode 100644 index 000000000..1e1400ecd --- /dev/null +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -0,0 +1,57 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_IP_SOCKET_TEST_UTIL_H_ +#define GVISOR_TEST_SYSCALLS_IP_SOCKET_TEST_UTIL_H_ + +#include +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// IPv6TCPAcceptBindSocketPair returns a SocketPairKind that represents +// SocketPairs created with bind() and accept() syscalls with AF_INET6 and the +// given type bound to the IPv6 loopback. +SocketPairKind IPv6TCPAcceptBindSocketPair(int type); + +// IPv4TCPAcceptBindSocketPair returns a SocketPairKind that represents +// SocketPairs created with bind() and accept() syscalls with AF_INET and the +// given type bound to the IPv4 loopback. 
+SocketPairKind IPv4TCPAcceptBindSocketPair(int type); + +// DualStackTCPAcceptBindSocketPair returns a SocketPairKind that represents +// SocketPairs created with bind() and accept() syscalls with AF_INET6 and the +// given type bound to the IPv4 loopback. +SocketPairKind DualStackTCPAcceptBindSocketPair(int type); + +// IPv6UDPBidirectionalBindSocketPair returns a SocketPairKind that represents +// SocketPairs created with bind() and connect() syscalls with AF_INET6 and the +// given type bound to the IPv6 loopback. +SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type); + +// IPv4UDPBidirectionalBindSocketPair returns a SocketPairKind that represents +// SocketPairs created with bind() and connect() syscalls with AF_INET and the +// given type bound to the IPv4 loopback. +SocketPairKind IPv4UDPBidirectionalBindSocketPair(int type); + +// DualStackUDPBidirectionalBindSocketPair returns a SocketPairKind that +// represents SocketPairs created with bind() and connect() syscalls with +// AF_INET6 and the given type bound to the IPv4 loopback. +SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_IP_SOCKET_TEST_UTIL_H_ diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc new file mode 100644 index 000000000..ee5871cbe --- /dev/null +++ b/test/syscalls/linux/itimer.cc @@ -0,0 +1,342 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { +namespace { + +constexpr char kSIGALRMToMainThread[] = "--itimer_sigarlm_to_main_thread"; +constexpr char kSIGPROFFairnessActive[] = "--itimer_sigprof_fairness_active"; +constexpr char kSIGPROFFairnessIdle[] = "--itimer_sigprof_fairness_idle"; + +// Time period to be set for the itimers. +constexpr absl::Duration kPeriod = absl::Milliseconds(25); +// Total amount of time to spend per thread. +constexpr absl::Duration kTestDuration = absl::Seconds(20); +// Amount of spin iterations to perform as the minimum work item per thread. +// Chosen to be sub-millisecond range. +constexpr int kIterations = 10000000; +// Allow deviation in the number of samples. 
+constexpr double kNumSamplesDeviationRatio = 0.2; +constexpr double kNumSamplesMinRatio = 0.5; + +TEST(ItimerTest, ItimervalUpdatedBeforeExpiration) { + constexpr int kSleepSecs = 10; + constexpr int kAlarmSecs = 15; + static_assert( + kSleepSecs < kAlarmSecs, + "kSleepSecs must be less than kAlarmSecs for the test to be meaningful"); + constexpr int kMaxRemainingSecs = kAlarmSecs - kSleepSecs; + + // Install a no-op handler for SIGALRM. + struct sigaction sa = {}; + sigfillset(&sa.sa_mask); + sa.sa_handler = +[](int signo) {}; + auto const cleanup_sa = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + // Set an itimer-based alarm for kAlarmSecs from now. + struct itimerval itv = {}; + itv.it_value.tv_sec = kAlarmSecs; + auto const cleanup_itimer = + ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, itv)); + + // After sleeping for kSleepSecs, the itimer value should reflect the elapsed + // time even if it hasn't expired. + absl::SleepFor(absl::Seconds(kSleepSecs)); + ASSERT_THAT(getitimer(ITIMER_REAL, &itv), SyscallSucceeds()); + EXPECT_TRUE( + itv.it_value.tv_sec < kMaxRemainingSecs || + (itv.it_value.tv_sec == kMaxRemainingSecs && itv.it_value.tv_usec == 0)) + << "Remaining time: " << itv.it_value.tv_sec << " seconds + " + << itv.it_value.tv_usec << " microseconds"; +} + +ABSL_CONST_INIT static thread_local std::atomic_int signal_test_num_samples = + ATOMIC_VAR_INIT(0); + +void SignalTestSignalHandler(int /*signum*/) { signal_test_num_samples++; } + +struct SignalTestResult { + int expected_total; + int main_thread_samples; + std::vector worker_samples; +}; + +std::ostream& operator<<(std::ostream& os, const SignalTestResult& r) { + os << "{expected_total: " << r.expected_total + << ", main_thread_samples: " << r.main_thread_samples + << ", worker_samples: ["; + bool first = true; + for (int sample : r.worker_samples) { + if (!first) { + os << ", "; + } + os << sample; + first = false; + } + os << "]}"; + return os; +} + +// Starts two worker threads and itimer id and measures the number of signal +// delivered to each thread. +SignalTestResult ItimerSignalTest(int id, clock_t main_clock, + clock_t worker_clock, int signal, + absl::Duration sleep) { + signal_test_num_samples = 0; + + struct sigaction sa = {}; + sa.sa_handler = &SignalTestSignalHandler; + sa.sa_flags = SA_RESTART; + sigemptyset(&sa.sa_mask); + auto sigaction_cleanup = std::move(ScopedSigaction(signal, sa).ValueOrDie()); + + int socketfds[2]; + TEST_PCHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, socketfds) == 0); + + // Do the spinning in the workers. + std::function work = [&](int socket_fd) { + FileDescriptor fd(socket_fd); + + absl::Time finish = Now(worker_clock) + kTestDuration; + while (Now(worker_clock) < finish) { + // Blocked on read. + char c; + RetryEINTR(read)(fd.get(), &c, 1); + for (int i = 0; i < kIterations; i++) { + // Ensure compiler won't optimize this loop away. + asm(""); + } + + if (sleep != absl::ZeroDuration()) { + // Sleep so that the entire process is idle for a while. + absl::SleepFor(sleep); + } + + // Unblock the other thread. + RetryEINTR(write)(fd.get(), &c, 1); + } + + return reinterpret_cast(signal_test_num_samples.load()); + }; + + ScopedThread th1( + static_cast>(std::bind(work, socketfds[0]))); + ScopedThread th2( + static_cast>(std::bind(work, socketfds[1]))); + + absl::Time start = Now(main_clock); + // Start the timer. 
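+  // it_value arms the first expiration and it_interval re-arms it, so with
+  // both set to kPeriod the chosen signal fires roughly every kPeriod until
+  // the itimer is torn down.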
+ struct itimerval timer = {}; + timer.it_value = absl::ToTimeval(kPeriod); + timer.it_interval = absl::ToTimeval(kPeriod); + auto cleanup_itimer = std::move(ScopedItimer(id, timer).ValueOrDie()); + + // Unblock th1. + // + // N.B. th2 owns socketfds[1] but can't close it until it unblocks. + char c = 0; + TEST_CHECK(write(socketfds[1], &c, 1) == 1); + + SignalTestResult result; + + // Wait for the workers to be done and collect their sample counts. + result.worker_samples.push_back(reinterpret_cast(th1.Join())); + result.worker_samples.push_back(reinterpret_cast(th2.Join())); + cleanup_itimer.Release()(); + result.expected_total = (Now(main_clock) - start) / kPeriod; + result.main_thread_samples = signal_test_num_samples.load(); + + return result; +} + +int TestSIGALRMToMainThread() { + SignalTestResult result = + ItimerSignalTest(ITIMER_REAL, CLOCK_REALTIME, CLOCK_REALTIME, SIGALRM, + absl::ZeroDuration()); + + std::cerr << "result: " << result << std::endl; + + // ITIMER_REAL-generated SIGALRMs prefer to deliver to the thread group leader + // (but don't guarantee it), so we expect to see most samples on the main + // thread. + // + // Linux only guarantees timers will never expire before the requested time. + // Thus, we only check the upper bound and also it at least have one sample. + TEST_CHECK(result.main_thread_samples <= result.expected_total); + TEST_CHECK(result.main_thread_samples > 0); + for (int num : result.worker_samples) { + TEST_CHECK_MSG(num <= 50, "worker received too many samples"); + } + + return 0; +} + +// Random save/restore is disabled as it introduces additional latency and +// unpredictable distribution patterns. +TEST(ItimerTest, DeliversSIGALRMToMainThread_NoRandomSave) { + pid_t child; + int execve_errno; + auto kill = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec("/proc/self/exe", {"/proc/self/exe", kSIGALRMToMainThread}, + {}, &child, &execve_errno)); + EXPECT_EQ(0, execve_errno); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + + // Not required anymore. + kill.Release(); + + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status; +} + +// Signals are delivered to threads fairly. +// +// sleep indicates how long to sleep worker threads each iteration to make the +// entire process idle. +int TestSIGPROFFairness(absl::Duration sleep) { + SignalTestResult result = + ItimerSignalTest(ITIMER_PROF, CLOCK_PROCESS_CPUTIME_ID, + CLOCK_THREAD_CPUTIME_ID, SIGPROF, sleep); + + std::cerr << "result: " << result << std::endl; + + // The number of samples on the main thread should be very low as it did + // nothing. + TEST_CHECK(result.main_thread_samples < 60); + + // Both workers should get roughly equal number of samples. + TEST_CHECK(result.worker_samples.size() == 2); + + TEST_CHECK(result.expected_total > 0); + + // In an ideal world each thread would get exactly 50% of the signals, + // but since that's unlikely to happen we allow for them to get no less than + // kNumSamplesDeviationRatio of the total observed samples. + TEST_CHECK_MSG(std::abs(result.worker_samples[0] - result.worker_samples[1]) < + ((result.worker_samples[0] + result.worker_samples[1]) * + kNumSamplesDeviationRatio), + "one worker received disproportionate share of samples"); + + return 0; +} + +// Random save/restore is disabled as it introduces additional latency and +// unpredictable distribution patterns. 
+TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) { + pid_t child; + int execve_errno; + auto kill = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec("/proc/self/exe", {"/proc/self/exe", kSIGPROFFairnessActive}, + {}, &child, &execve_errno)); + EXPECT_EQ(0, execve_errno); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + + // Not required anymore. + kill.Release(); + + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +// Random save/restore is disabled as it introduces additional latency and +// unpredictable distribution patterns. +TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle_NoRandomSave) { + pid_t child; + int execve_errno; + auto kill = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec("/proc/self/exe", {"/proc/self/exe", kSIGPROFFairnessIdle}, + {}, &child, &execve_errno)); + EXPECT_EQ(0, execve_errno); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + + // Not required anymore. + kill.Release(); + + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "Exited with code: " << status; +} + +} // namespace +} // namespace testing +} // namespace gvisor + +namespace { +void MaskSIGPIPE() { + // Always mask SIGPIPE as it's common and tests aren't expected to handle it. + // We don't take the TestInit() path so we must do this manually. + struct sigaction sa = {}; + sa.sa_handler = SIG_IGN; + TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0); +} +} // namespace + +int main(int argc, char** argv) { + // These tests require no background threads, so check for them before + // TestInit. + for (int i = 0; i < argc; i++) { + absl::string_view arg(argv[i]); + + if (arg == gvisor::testing::kSIGALRMToMainThread) { + MaskSIGPIPE(); + return gvisor::testing::TestSIGALRMToMainThread(); + } + if (arg == gvisor::testing::kSIGPROFFairnessActive) { + MaskSIGPIPE(); + return gvisor::testing::TestSIGPROFFairness(absl::ZeroDuration()); + } + if (arg == gvisor::testing::kSIGPROFFairnessIdle) { + MaskSIGPIPE(); + return gvisor::testing::TestSIGPROFFairness(absl::Milliseconds(10)); + } + } + + gvisor::testing::TestInit(&argc, &argv); + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/kill.cc b/test/syscalls/linux/kill.cc new file mode 100644 index 000000000..18ba8fb16 --- /dev/null +++ b/test/syscalls/linux/kill.cc @@ -0,0 +1,380 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/logging.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_int32(scratch_uid, 65534, "scratch UID"); +DEFINE_int32(scratch_gid, 65534, "scratch GID"); + +using ::testing::Ge; + +namespace gvisor { +namespace testing { + +namespace { + +TEST(KillTest, CanKillValidPid) { + // If pid is positive, then signal sig is sent to the process with the ID + // specified by pid. + EXPECT_THAT(kill(getpid(), 0), SyscallSucceeds()); + // If pid equals 0, then sig is sent to every process in the process group of + // the calling process. + EXPECT_THAT(kill(0, 0), SyscallSucceeds()); + + ScopedThread([] { EXPECT_THAT(kill(gettid(), 0), SyscallSucceeds()); }); +} + +void SigHandler(int sig, siginfo_t* info, void* context) { _exit(0); } + +// If pid equals -1, then sig is sent to every process for which the calling +// process has permission to send signals, except for process 1 (init). +TEST(KillTest, CanKillAllPIDs) { + int pipe_fds[2]; + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + FileDescriptor read_fd(pipe_fds[0]); + FileDescriptor write_fd(pipe_fds[1]); + + pid_t pid = fork(); + if (pid == 0) { + read_fd.reset(); + + struct sigaction sa; + sa.sa_sigaction = SigHandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGWINCH, sa)); + + // Indicate to the parent that we're ready. + write_fd.reset(); + + // Wait until we get the signal from the parent. + while (true) { + pause(); + } + } + + EXPECT_THAT(pid, SyscallSucceeds()); + + write_fd.reset(); + + // Wait for the child to indicate that it's unmasked the signal by closing + // the write end. + char buf; + ASSERT_THAT(ReadFd(read_fd.get(), &buf, 1), SyscallSucceedsWithValue(0)); + + // Signal the child and wait for it to die with status 0, indicating that + // it got the expected signal. + EXPECT_THAT(kill(-1, SIGWINCH), SyscallSucceeds()); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), + SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(KillTest, CannotKillInvalidPID) { + // We need an unused pid to verify that kill fails when given one. + // + // There is no way to guarantee that a PID is unused, but the PID of a + // recently exited process likely won't be reused soon. 
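+  // Linux hands out PIDs in increasing order and only wraps around once
+  // /proc/sys/kernel/pid_max is reached, so the PID of a child that is forked
+  // and immediately reaped is very unlikely to be reused right away.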
+ pid_t fake_pid = fork(); + if (fake_pid == 0) { + _exit(0); + } + + EXPECT_THAT(fake_pid, SyscallSucceeds()); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(fake_pid, &status, 0), + SyscallSucceedsWithValue(fake_pid)); + EXPECT_TRUE(WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + EXPECT_THAT(kill(fake_pid, 0), SyscallFailsWithErrno(ESRCH)); +} + +TEST(KillTest, CannotUseInvalidSignal) { + EXPECT_THAT(kill(getpid(), 200), SyscallFailsWithErrno(EINVAL)); +} + +TEST(KillTest, CanKillRemoteProcess) { + pid_t pid = fork(); + if (pid == 0) { + while (true) { + pause(); + } + } + + EXPECT_THAT(pid, SyscallSucceeds()); + + EXPECT_THAT(kill(pid, SIGKILL), SyscallSucceeds()); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), + SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status)); + EXPECT_EQ(SIGKILL, WTERMSIG(status)); +} + +TEST(KillTest, CanKillOwnProcess) { + EXPECT_THAT(kill(getpid(), 0), SyscallSucceeds()); +} + +// Verify that you can kill a process even using a tid from a thread other than +// the group leader. +TEST(KillTest, CannotKillTid) { + pid_t tid; + bool tid_available = false; + bool finished = false; + absl::Mutex mu; + ScopedThread t([&] { + mu.Lock(); + tid = gettid(); + tid_available = true; + mu.Await(absl::Condition(&finished)); + mu.Unlock(); + }); + mu.LockWhen(absl::Condition(&tid_available)); + EXPECT_THAT(kill(tid, 0), SyscallSucceeds()); + finished = true; + mu.Unlock(); +} + +TEST(KillTest, SetPgid) { + for (int i = 0; i < 10; i++) { + // The following in the normal pattern for creating a new process group. + // Both the parent and child process will call setpgid in order to avoid any + // race conditions. We do this ten times to catch races. + pid_t pid = fork(); + if (pid == 0) { + setpgid(0, 0); + while (true) { + pause(); + } + } + + EXPECT_THAT(pid, SyscallSucceeds()); + + // Set the child's group and exit. + ASSERT_THAT(setpgid(pid, pid), SyscallSucceeds()); + EXPECT_THAT(kill(pid, SIGKILL), SyscallSucceeds()); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(-pid, &status, 0), + SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status)); + EXPECT_EQ(SIGKILL, WTERMSIG(status)); + } +} + +TEST(KillTest, ProcessGroups) { + // Fork a new child. + // + // other_child is used as a placeholder process. We use this PID as our "does + // not exist" process group to ensure some amount of safety. (It is still + // possible to violate this assumption, but extremely unlikely.) + pid_t child = fork(); + if (child == 0) { + while (true) { + pause(); + } + } + EXPECT_THAT(child, SyscallSucceeds()); + + pid_t other_child = fork(); + if (other_child == 0) { + while (true) { + pause(); + } + } + + // Ensure the kill does not succeed without the new group. + EXPECT_THAT(kill(-child, SIGKILL), SyscallFailsWithErrno(ESRCH)); + + // Put the child in its own process group. + ASSERT_THAT(setpgid(child, child), SyscallSucceeds()); + + // This should be not allowed: you can only create a new group with the same + // id or join an existing one. The other_child group should not exist. + ASSERT_THAT(setpgid(child, other_child), SyscallFailsWithErrno(EPERM)); + + // Done with other_child; kill it. + EXPECT_THAT(kill(other_child, SIGKILL), SyscallSucceeds()); + int status; + EXPECT_THAT(RetryEINTR(waitpid)(other_child, &status, 0), SyscallSucceeds()); + + // Linux returns success for the no-op call. + ASSERT_THAT(setpgid(child, child), SyscallSucceeds()); + + // Kill the child's process group. 
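+  // A negative pid argument means "every member of process group -pid", i.e.
+  // kill(-pgid, sig) is the process-group form of kill(2).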
+ ASSERT_THAT(kill(-child, SIGKILL), SyscallSucceeds()); + + // Wait on the process group; ensure that the signal was as expected. + EXPECT_THAT(RetryEINTR(waitpid)(-child, &status, 0), + SyscallSucceedsWithValue(child)); + EXPECT_TRUE(WIFSIGNALED(status)); + EXPECT_EQ(SIGKILL, WTERMSIG(status)); + + // Try to kill the process group again; ensure that the wait fails. + EXPECT_THAT(kill(-child, SIGKILL), SyscallFailsWithErrno(ESRCH)); + EXPECT_THAT(RetryEINTR(waitpid)(-child, &status, 0), + SyscallFailsWithErrno(ECHILD)); +} + +TEST(KillTest, ChildDropsPrivsCannotKill) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); + + int uid = FLAGS_scratch_uid; + int gid = FLAGS_scratch_gid; + + // Create the child that drops privileges and tries to kill the parent. + pid_t pid = fork(); + if (pid == 0) { + TEST_PCHECK(setresgid(gid, gid, gid) == 0); + MaybeSave(); + + TEST_PCHECK(setresuid(uid, uid, uid) == 0); + MaybeSave(); + + // setresuid should have dropped CAP_KILL. Make sure. + TEST_CHECK(!HaveCapability(CAP_KILL).ValueOrDie()); + + // Try to kill parent with every signal-sending syscall possible. + pid_t parent = getppid(); + + TEST_CHECK(kill(parent, SIGKILL) < 0); + TEST_PCHECK_MSG(errno == EPERM, "kill failed with wrong errno"); + MaybeSave(); + + TEST_CHECK(tgkill(parent, parent, SIGKILL) < 0); + TEST_PCHECK_MSG(errno == EPERM, "tgkill failed with wrong errno"); + MaybeSave(); + + TEST_CHECK(syscall(SYS_tkill, parent, SIGKILL) < 0); + TEST_PCHECK_MSG(errno == EPERM, "tkill failed with wrong errno"); + MaybeSave(); + + siginfo_t uinfo; + uinfo.si_code = -1; // SI_QUEUE (allowed). + + TEST_CHECK(syscall(SYS_rt_sigqueueinfo, parent, SIGKILL, &uinfo) < 0); + TEST_PCHECK_MSG(errno == EPERM, "rt_sigqueueinfo failed with wrong errno"); + MaybeSave(); + + TEST_CHECK(syscall(SYS_rt_tgsigqueueinfo, parent, parent, SIGKILL, &uinfo) < + 0); + TEST_PCHECK_MSG(errno == EPERM, "rt_sigqueueinfo failed with wrong errno"); + MaybeSave(); + + _exit(0); + } + + EXPECT_THAT(pid, SyscallSucceeds()); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(pid, &status, 0), + SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status = " << status; +} + +TEST(KillTest, CanSIGCONTSameSession) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); + + pid_t stopped_child = fork(); + if (stopped_child == 0) { + raise(SIGSTOP); + _exit(0); + } + + EXPECT_THAT(stopped_child, SyscallSucceeds()); + + // Put the child in its own process group. The child and parent process + // groups also share a session. + ASSERT_THAT(setpgid(stopped_child, stopped_child), SyscallSucceeds()); + + // Make sure child stopped. + int status; + EXPECT_THAT(RetryEINTR(waitpid)(stopped_child, &status, WUNTRACED), + SyscallSucceedsWithValue(stopped_child)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << "status " << status; + + int uid = FLAGS_scratch_uid; + int gid = FLAGS_scratch_gid; + + // Drop privileges only in child process, or else this parent process won't be + // able to open some log files after the test ends. + pid_t other_child = fork(); + if (other_child == 0) { + // Drop privileges. + TEST_PCHECK(setresgid(gid, gid, gid) == 0); + MaybeSave(); + + TEST_PCHECK(setresuid(uid, uid, uid) == 0); + MaybeSave(); + + // setresuid should have dropped CAP_KILL. + TEST_CHECK(!HaveCapability(CAP_KILL).ValueOrDie()); + + // Child 2 and child should now not share a thread group and any UIDs. + // Child 2 should have no privileges. 
That means any signal other than + // SIGCONT should fail. + TEST_CHECK(kill(stopped_child, SIGKILL) < 0); + TEST_PCHECK_MSG(errno == EPERM, "kill failed with wrong errno"); + MaybeSave(); + + TEST_PCHECK(kill(stopped_child, SIGCONT) == 0); + MaybeSave(); + + _exit(0); + } + + EXPECT_THAT(stopped_child, SyscallSucceeds()); + + // Make sure child exited normally. + EXPECT_THAT(RetryEINTR(waitpid)(stopped_child, &status, 0), + SyscallSucceedsWithValue(stopped_child)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; + + // Make sure other_child exited normally. + EXPECT_THAT(RetryEINTR(waitpid)(other_child, &status, 0), + SyscallSucceedsWithValue(other_child)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc new file mode 100644 index 000000000..ed74437bc --- /dev/null +++ b/test/syscalls/linux/link.cc @@ -0,0 +1,291 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_int32(scratch_uid, 65534, "scratch UID"); + +namespace gvisor { +namespace testing { + +namespace { + +// IsSameFile returns true if both filenames have the same device and inode. +bool IsSameFile(const std::string& f1, const std::string& f2) { + // Use lstat rather than stat, so that symlinks are not followed. + struct stat stat1 = {}; + EXPECT_THAT(lstat(f1.c_str(), &stat1), SyscallSucceeds()); + struct stat stat2 = {}; + EXPECT_THAT(lstat(f2.c_str(), &stat2), SyscallSucceeds()); + + return stat1.st_dev == stat2.st_dev && stat1.st_ino == stat2.st_ino; +} + +TEST(LinkTest, CanCreateLinkFile) { + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string newname = NewTempAbsPath(); + + // Get the initial link count. + uint64_t initial_link_count = ASSERT_NO_ERRNO_AND_VALUE(Links(oldfile.path())); + + EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()), SyscallSucceeds()); + + EXPECT_TRUE(IsSameFile(oldfile.path(), newname)); + + // Link count should be incremented. + EXPECT_THAT(Links(oldfile.path()), + IsPosixErrorOkAndHolds(initial_link_count + 1)); + + // Delete the link. + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); + + // Link count should be back to initial. + EXPECT_THAT(Links(oldfile.path()), + IsPosixErrorOkAndHolds(initial_link_count)); +} + +TEST(LinkTest, PermissionDenied) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER))); + + // Make the file "unsafe" to link by making it only readable, but not + // writable. 
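+  // With the fs.protected_hardlinks sysctl enabled (the default on most
+  // modern systems), an unprivileged user may only hard link a file it owns
+  // or has both read and write access to, so the link below fails with EPERM.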
+ const auto oldfile = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0400)); + const std::string newname = NewTempAbsPath(); + + // Do setuid in a separate thread so that after finishing this test, the + // process can still open files the test harness created before starting this + // test. Otherwise, the files are created by root (UID before the test), but + // cannot be opened by the `uid` set below after the test. After calling + // setuid(non-zero-UID), there is no way to get root privileges back. + ScopedThread([&] { + // Use syscall instead of glibc setuid wrapper because we want this setuid + // call to only apply to this task. POSIX threads, however, require that all + // threads have the same UIDs, so using the setuid wrapper sets all threads' + // real UID. + // Also drops capabilities. + EXPECT_THAT(syscall(SYS_setuid, FLAGS_scratch_uid), SyscallSucceeds()); + + EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()), + SyscallFailsWithErrno(EPERM)); + }); +} + +TEST(LinkTest, CannotLinkDirectory) { + auto olddir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string newdir = NewTempAbsPath(); + + EXPECT_THAT(link(olddir.path().c_str(), newdir.c_str()), + SyscallFailsWithErrno(EPERM)); + + EXPECT_THAT(rmdir(olddir.path().c_str()), SyscallSucceeds()); +} + +TEST(LinkTest, CannotLinkWithSlash) { + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + // Put a final "/" on newname. + const std::string newname = absl::StrCat(NewTempAbsPath(), "/"); + + EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()), + SyscallFailsWithErrno(ENOENT)); +} + +TEST(LinkTest, OldnameIsEmpty) { + const std::string newname = NewTempAbsPath(); + EXPECT_THAT(link("", newname.c_str()), SyscallFailsWithErrno(ENOENT)); +} + +TEST(LinkTest, OldnameDoesNotExist) { + const std::string oldname = NewTempAbsPath(); + const std::string newname = NewTempAbsPath(); + EXPECT_THAT(link("", newname.c_str()), SyscallFailsWithErrno(ENOENT)); +} + +TEST(LinkTest, NewnameCannotExist) { + const std::string newname = + JoinPath(GetAbsoluteTestTmpdir(), "thisdoesnotexist", "foo"); + EXPECT_THAT(link("/thisdoesnotmatter", newname.c_str()), + SyscallFailsWithErrno(ENOENT)); +} + +TEST(LinkTest, WithOldDirFD) { + const std::string oldname_parent = NewTempAbsPath(); + const std::string oldname_base = "child"; + const std::string oldname = JoinPath(oldname_parent, oldname_base); + const std::string newname = NewTempAbsPath(); + + // Create oldname_parent directory, and get an FD. + ASSERT_THAT(mkdir(oldname_parent.c_str(), 0777), SyscallSucceeds()); + const FileDescriptor oldname_parent_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(oldname_parent, O_DIRECTORY | O_RDONLY)); + + // Create oldname file. + const FileDescriptor oldname_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(oldname, O_CREAT | O_RDWR, 0666)); + + // Link oldname to newname, using oldname_parent_fd. 
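+  // linkat(olddirfd, oldpath, newdirfd, newpath, flags) resolves a relative
+  // oldpath against olddirfd, so "child" is looked up inside oldname_parent
+  // rather than the current working directory.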
+ EXPECT_THAT(linkat(oldname_parent_fd.get(), oldname_base.c_str(), AT_FDCWD, + newname.c_str(), 0), + SyscallSucceeds()); + + EXPECT_TRUE(IsSameFile(oldname, newname)); + + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds()); + EXPECT_THAT(rmdir(oldname_parent.c_str()), SyscallSucceeds()); +} + +TEST(LinkTest, BogusFlags) { + ASSERT_THAT(linkat(1, "foo", 2, "bar", 3), SyscallFailsWithErrno(EINVAL)); +} + +TEST(LinkTest, WithNewDirFD) { + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string newname_parent = NewTempAbsPath(); + const std::string newname_base = "child"; + const std::string newname = JoinPath(newname_parent, newname_base); + + // Create newname_parent directory, and get an FD. + EXPECT_THAT(mkdir(newname_parent.c_str(), 0777), SyscallSucceeds()); + const FileDescriptor newname_parent_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(newname_parent, O_DIRECTORY | O_RDONLY)); + + // Link newname to oldfile, using newname_parent_fd. + EXPECT_THAT(linkat(AT_FDCWD, oldfile.path().c_str(), newname_parent_fd.get(), + newname.c_str(), 0), + SyscallSucceeds()); + + EXPECT_TRUE(IsSameFile(oldfile.path(), newname)); + + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); + EXPECT_THAT(rmdir(newname_parent.c_str()), SyscallSucceeds()); +} + +TEST(LinkTest, RelPathsWithNonDirFDs) { + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Create a file that will be passed as the directory fd for old/new names. + const std::string filename = NewTempAbsPath(); + const FileDescriptor file_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT | O_RDWR, 0666)); + + // Using file_fd as olddirfd will fail. + EXPECT_THAT(linkat(file_fd.get(), "foo", AT_FDCWD, "bar", 0), + SyscallFailsWithErrno(ENOTDIR)); + + // Using file_fd as newdirfd will fail. + EXPECT_THAT(linkat(AT_FDCWD, oldfile.path().c_str(), file_fd.get(), "bar", 0), + SyscallFailsWithErrno(ENOTDIR)); +} + +TEST(LinkTest, AbsPathsWithNonDirFDs) { + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string newname = NewTempAbsPath(); + + // Create a file that will be passed as the directory fd for old/new names. + const std::string filename = NewTempAbsPath(); + const FileDescriptor file_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT | O_RDWR, 0666)); + + // Using file_fd as the dirfds is OK as long as paths are absolute. + EXPECT_THAT(linkat(file_fd.get(), oldfile.path().c_str(), file_fd.get(), + newname.c_str(), 0), + SyscallSucceeds()); +} + +TEST(LinkTest, LinkDoesNotFollowSymlinks) { + // Create oldfile, and oldsymlink which points to it. + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string oldsymlink = NewTempAbsPath(); + EXPECT_THAT(symlink(oldfile.path().c_str(), oldsymlink.c_str()), + SyscallSucceeds()); + + // Now hard link newname to oldsymlink. + const std::string newname = NewTempAbsPath(); + EXPECT_THAT(link(oldsymlink.c_str(), newname.c_str()), SyscallSucceeds()); + + // The link should not have resolved the symlink, so newname and oldsymlink + // are the same. + EXPECT_TRUE(IsSameFile(oldsymlink, newname)); + EXPECT_FALSE(IsSameFile(oldfile.path(), newname)); + + EXPECT_THAT(unlink(oldsymlink.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +} + +TEST(LinkTest, LinkatDoesNotFollowSymlinkByDefault) { + // Create oldfile, and oldsymlink which points to it. 
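+  // By default linkat(2) does not dereference a trailing symlink in oldpath;
+  // that only happens when AT_SYMLINK_FOLLOW is passed (exercised in
+  // LinkatWithSymlinkFollow below).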
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string oldsymlink = NewTempAbsPath(); + EXPECT_THAT(symlink(oldfile.path().c_str(), oldsymlink.c_str()), + SyscallSucceeds()); + + // Now hard link newname to oldsymlink. + const std::string newname = NewTempAbsPath(); + EXPECT_THAT( + linkat(AT_FDCWD, oldsymlink.c_str(), AT_FDCWD, newname.c_str(), 0), + SyscallSucceeds()); + + // The link should not have resolved the symlink, so newname and oldsymlink + // are the same. + EXPECT_TRUE(IsSameFile(oldsymlink, newname)); + EXPECT_FALSE(IsSameFile(oldfile.path(), newname)); + + EXPECT_THAT(unlink(oldsymlink.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +} + +TEST(LinkTest, LinkatWithSymlinkFollow) { + // Create oldfile, and oldsymlink which points to it. + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string oldsymlink = NewTempAbsPath(); + ASSERT_THAT(symlink(oldfile.path().c_str(), oldsymlink.c_str()), + SyscallSucceeds()); + + // Now hard link newname to oldsymlink, and pass AT_SYMLINK_FOLLOW flag. + const std::string newname = NewTempAbsPath(); + ASSERT_THAT(linkat(AT_FDCWD, oldsymlink.c_str(), AT_FDCWD, newname.c_str(), + AT_SYMLINK_FOLLOW), + SyscallSucceeds()); + + // The link should have resolved the symlink, so oldfile and newname are the + // same. + EXPECT_TRUE(IsSameFile(oldfile.path(), newname)); + EXPECT_FALSE(IsSameFile(oldsymlink, newname)); + + EXPECT_THAT(unlink(oldsymlink.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc new file mode 100644 index 000000000..fb6a1546e --- /dev/null +++ b/test/syscalls/linux/lseek.cc @@ -0,0 +1,202 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
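+
+// Tests for lseek(2): the SEEK_SET/SEEK_CUR/SEEK_END whence values, error
+// cases (bad whence, negative or overflowing offsets, bad fds, unseekable
+// proc files), and offset sharing between dup()ed descriptors.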
+ +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(LseekTest, InvalidWhence) { + const std::string kFileData = "hello world\n"; + const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644)); + + ASSERT_THAT(lseek(fd.get(), 0, -1), SyscallFailsWithErrno(EINVAL)); +} + +TEST(LseekTest, NegativeOffset) { + const std::string kFileData = "hello world\n"; + const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644)); + + EXPECT_THAT(lseek(fd.get(), -(kFileData.length() + 1), SEEK_CUR), + SyscallFailsWithErrno(EINVAL)); +} + +// A 32-bit off_t is not large enough to represent an offset larger than +// maximum file size on standard file systems, so it isn't possible to cause +// overflow. +#ifdef __x86_64__ +TEST(LseekTest, Overflow) { + // HA! Classic Linux. We really should have an EOVERFLOW + // here, since we're seeking to something that cannot be + // represented.. but instead we are given an EINVAL. + const std::string kFileData = "hello world\n"; + const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644)); + EXPECT_THAT(lseek(fd.get(), 0x7fffffffffffffff, SEEK_END), + SyscallFailsWithErrno(EINVAL)); +} +#endif + +TEST(LseekTest, Set) { + const std::string kFileData = "hello world\n"; + const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644)); + + char buf = '\0'; + EXPECT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, kFileData.c_str()[0]); + EXPECT_THAT(lseek(fd.get(), 6, SEEK_SET), SyscallSucceedsWithValue(6)); + ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, kFileData.c_str()[6]); +} + +TEST(LseekTest, Cur) { + const std::string kFileData = "hello world\n"; + const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644)); + + char buf = '\0'; + EXPECT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, kFileData.c_str()[0]); + EXPECT_THAT(lseek(fd.get(), 3, SEEK_CUR), SyscallSucceedsWithValue(4)); + ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, kFileData.c_str()[4]); +} + +TEST(LseekTest, End) { + const std::string kFileData = "hello world\n"; + const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 
0644)); + + char buf = '\0'; + EXPECT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, kFileData.c_str()[0]); + EXPECT_THAT(lseek(fd.get(), -2, SEEK_END), SyscallSucceedsWithValue(10)); + ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, kFileData.c_str()[kFileData.length() - 2]); +} + +TEST(LseekTest, InvalidFD) { + EXPECT_THAT(lseek(-1, 0, SEEK_SET), SyscallFailsWithErrno(EBADF)); +} + +TEST(LseekTest, DirCurEnd) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open("/tmp", O_RDONLY)); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); +} + +TEST(LseekTest, ProcDir) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self", O_RDONLY)); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds()); +} + +TEST(LseekTest, ProcFile) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/meminfo", O_RDONLY)); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallFailsWithErrno(EINVAL)); +} + +TEST(LseekTest, SysDir) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/sys/devices", O_RDONLY)); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds()); +} + +TEST(LseekTest, SeekCurrentDir) { + // From include/linux/fs.h. + constexpr loff_t MAX_LFS_FILESIZE = 0x7fffffffffffffff; + + char* dir = get_current_dir_name(); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir, O_RDONLY)); + + ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); + ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), + // Some filesystems (like ext4) allow lseek(SEEK_END) on a + // directory and return MAX_LFS_FILESIZE, others return EINVAL. + AnyOf(SyscallSucceedsWithValue(MAX_LFS_FILESIZE), + SyscallFailsWithErrno(EINVAL))); + free(dir); +} + +TEST(LseekTest, ProcStatTwice) { + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/stat", O_RDONLY)); + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/stat", O_RDONLY)); + + ASSERT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + ASSERT_THAT(lseek(fd1.get(), 0, SEEK_END), SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(lseek(fd1.get(), 1000, SEEK_CUR), SyscallSucceeds()); + // Check that just because we moved fd1, fd2 doesn't move. + ASSERT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + + const FileDescriptor fd3 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/stat", O_RDONLY)); + ASSERT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); +} + +TEST(LseekTest, EtcPasswdDup) { + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/etc/passwd", O_RDONLY)); + const FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(fd1.Dup()); + + ASSERT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + ASSERT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + ASSERT_THAT(lseek(fd1.get(), 1000, SEEK_CUR), SyscallSucceeds()); + // Check that just because we moved fd1, fd2 doesn't move. + ASSERT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(1000)); + + const FileDescriptor fd3 = ASSERT_NO_ERRNO_AND_VALUE(fd1.Dup()); + ASSERT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(1000)); +} + +// TODO: Add tests where we have donated in sockets. 
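+
+// Note on the *Twice and *Dup cases above: descriptors from separate open()
+// calls have independent file offsets, while dup() returns a new descriptor
+// referring to the same open file description, so the offset is shared and an
+// lseek through one fd is observable through the other.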
+ +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc new file mode 100644 index 000000000..a79c8c75d --- /dev/null +++ b/test/syscalls/linux/madvise.cc @@ -0,0 +1,142 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/logging.h" +#include "test/util/memory_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +void ExpectAllMappingBytes(Mapping const& m, char c) { + auto const v = m.view(); + for (size_t i = 0; i < kPageSize; i++) { + ASSERT_EQ(v[i], c) << "at offset " << i; + } +} + +// Equivalent to ExpectAllMappingBytes but async-signal-safe and with less +// helpful failure messages. +void CheckAllMappingBytes(Mapping const& m, char c) { + auto const v = m.view(); + for (size_t i = 0; i < kPageSize; i++) { + TEST_CHECK_MSG(v[i] == c, "mapping contains wrong value"); + } +} + +TEST(MadviseDontneedTest, ZerosPrivateAnonPage) { + auto m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ExpectAllMappingBytes(m, 0); + memset(m.ptr(), 1, m.len()); + ExpectAllMappingBytes(m, 1); + ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds()); + ExpectAllMappingBytes(m, 0); +} + +TEST(MadviseDontneedTest, ZerosCOWAnonPageInCallerOnly) { + auto m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ExpectAllMappingBytes(m, 0); + memset(m.ptr(), 2, m.len()); + ExpectAllMappingBytes(m, 2); + + // Do madvise in a child process. + pid_t pid = fork(); + CheckAllMappingBytes(m, 2); + if (pid == 0) { + TEST_PCHECK(madvise(m.ptr(), m.len(), MADV_DONTNEED) == 0); + CheckAllMappingBytes(m, 0); + _exit(0); + } + + ASSERT_THAT(pid, SyscallSucceeds()); + + int status = 0; + ASSERT_THAT(waitpid(-1, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status)); + EXPECT_EQ(WEXITSTATUS(status), 0); + // The child's madvise should not have affected the parent. 
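+  // For MAP_PRIVATE anonymous memory, MADV_DONTNEED only drops the caller's
+  // copy-on-write reference: the next access in the child sees zero-filled
+  // pages, while the parent keeps its own copy of the data.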
+ ExpectAllMappingBytes(m, 2); +} + +TEST(MadviseDontneedTest, DoesNotModifySharedAnonPage) { + auto m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED)); + ExpectAllMappingBytes(m, 0); + memset(m.ptr(), 3, m.len()); + ExpectAllMappingBytes(m, 3); + ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds()); + ExpectAllMappingBytes(m, 3); +} + +TEST(MadviseDontneedTest, CleansPrivateFilePage) { + TempPath f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + /* parent = */ GetAbsoluteTestTmpdir(), + /* content = */ std::string(kPageSize, 4), TempPath::kDefaultFileMode)); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); + + Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd.get(), 0)); + ExpectAllMappingBytes(m, 4); + memset(m.ptr(), 5, m.len()); + ExpectAllMappingBytes(m, 5); + ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds()); + ExpectAllMappingBytes(m, 4); +} + +TEST(MadviseDontneedTest, DoesNotModifySharedFilePage) { + TempPath f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + /* parent = */ GetAbsoluteTestTmpdir(), + /* content = */ std::string(kPageSize, 6), TempPath::kDefaultFileMode)); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); + + Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0)); + ExpectAllMappingBytes(m, 6); + memset(m.ptr(), 7, m.len()); + ExpectAllMappingBytes(m, 7); + ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds()); + ExpectAllMappingBytes(m, 7); +} + +TEST(MadviseDontneedTest, IgnoresPermissions) { + auto m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); + EXPECT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc new file mode 100644 index 000000000..b4b680c34 --- /dev/null +++ b/test/syscalls/linux/memory_accounting.cc @@ -0,0 +1,99 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_split.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +using ::absl::StrFormat; + +// AnonUsageFromMeminfo scrapes the current anonymous memory usage from +// /proc/meminfo and returns it in bytes. +PosixErrorOr AnonUsageFromMeminfo() { + ASSIGN_OR_RETURN_ERRNO(auto meminfo, GetContents("/proc/meminfo")); + std::vector lines(absl::StrSplit(meminfo, '\n')); + + // Try to find AnonPages line, the format is AnonPages:\\s+(\\d+) kB\n. 
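+  // e.g. "AnonPages:      745192 kB" splits (empty tokens skipped) into
+  // {"AnonPages:", "745192", "kB"}, so the value is parts[1], in kB.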
+ for (const auto& line : lines) { + if (!absl::StartsWith(line, "AnonPages:")) { + continue; + } + + std::vector parts( + absl::StrSplit(line, ' ', absl::SkipEmpty())); + if (parts.size() == 3) { + // The size is the second field, let's try to parse it as a number. + ASSIGN_OR_RETURN_ERRNO(auto anon_kb, Atoi(parts[1])); + return anon_kb * 1024; + } + + return PosixError(EINVAL, "AnonPages field in /proc/meminfo was malformed"); + } + + return PosixError(EINVAL, "AnonPages field not found in /proc/meminfo"); +} + +TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) { + // This test isn't meaningful on Linux. /proc/meminfo reports system-wide + // memory usage, which can change arbitrarily in Linux from other activity on + // the machine. In gvisor, this test is the only thing running on the + // "machine", so values in /proc/meminfo accurately reflect the memory used by + // the test. + SKIP_IF(!IsRunningOnGvisor()); + + uint64_t anon_initial = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo()); + + // Cause some anonymous memory usage. + uint64_t map_bytes = Megabytes(512); + char* mem = + static_cast(mmap(nullptr, map_bytes, PROT_READ | PROT_WRITE, + MAP_POPULATE | MAP_ANON | MAP_PRIVATE, -1, 0)); + ASSERT_NE(mem, MAP_FAILED) + << "Map failed, errno: " << errno << " (" << strerror(errno) << ")."; + + // Write something to each page to prevent them from being decommited on + // S/R. Zero pages are dropped on save. + for (uint64_t i = 0; i < map_bytes; i += kPageSize) { + mem[i] = 'a'; + } + + uint64_t anon_after_alloc = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo()); + EXPECT_THAT(anon_after_alloc, + EquivalentWithin(anon_initial + map_bytes, 0.03)); + + // We have many implicit S/R cycles from scraping /proc/meminfo throughout the + // test, but throw an explicit S/R in here as well. + MaybeSave(); + + // Usage should remain the same across S/R. + uint64_t anon_after_sr = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo()); + EXPECT_THAT(anon_after_sr, EquivalentWithin(anon_after_alloc, 0.03)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc new file mode 100644 index 000000000..9f8033bdf --- /dev/null +++ b/test/syscalls/linux/mempolicy.cc @@ -0,0 +1,258 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
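+
+// Tests for get_mempolicy(2)/set_mempolicy(2). The MPOL_* and MPOL_F_*
+// constants are redefined below to match the Linux UAPI values, presumably
+// because <numaif.h> is not available in the test environment.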
+ +#include +#include + +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "test/util/cleanup.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +#define BITS_PER_BYTE 8 + +#define MPOL_F_STATIC_NODES (1 << 15) +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_DEFAULT 0 +#define MPOL_PREFERRED 1 +#define MPOL_BIND 2 +#define MPOL_INTERLEAVE 3 +#define MPOL_MAX MPOL_INTERLEAVE +#define MPOL_F_NODE (1 << 0) +#define MPOL_F_ADDR (1 << 1) +#define MPOL_F_MEMS_ALLOWED (1 << 2) +#define MPOL_MF_STRICT (1 << 0) +#define MPOL_MF_MOVE (1 << 1) +#define MPOL_MF_MOVE_ALL (1 << 2) + +int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr, + int flags) { + return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags); +} + +int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) { + return syscall(__NR_set_mempolicy, mode, nmask, maxnode); +} + +// Creates a cleanup object that resets the calling thread's mempolicy to the +// system default when the calling scope ends. +Cleanup ScopedMempolicy() { + return Cleanup([] { + EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, nullptr, 0), SyscallSucceeds()); + }); +} + +// Temporarily change the memory policy for the calling thread within the +// caller's scope. +PosixErrorOr ScopedSetMempolicy(int mode, uint64_t *nmask, + uint64_t maxnode) { + if (set_mempolicy(mode, nmask, maxnode)) { + return PosixError(errno, "set_mempolicy"); + } + return ScopedMempolicy(); +} + +TEST(MempolicyTest, CheckDefaultPolicy) { + int mode = 0; + uint64_t nodemask = 0; + ASSERT_THAT(get_mempolicy(&mode, &nodemask, sizeof(nodemask) * BITS_PER_BYTE, + nullptr, 0), + SyscallSucceeds()); + + EXPECT_EQ(MPOL_DEFAULT, mode); + EXPECT_EQ(0x0, nodemask); +} + +TEST(MempolicyTest, PolicyPreservedAfterSetMempolicy) { + uint64_t nodemask = 0x1; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy( + MPOL_BIND, &nodemask, sizeof(nodemask) * BITS_PER_BYTE)); + + int mode = 0; + uint64_t nodemask_after = 0x0; + ASSERT_THAT(get_mempolicy(&mode, &nodemask_after, + sizeof(nodemask_after) * BITS_PER_BYTE, nullptr, 0), + SyscallSucceeds()); + EXPECT_EQ(MPOL_BIND, mode); + EXPECT_EQ(0x1, nodemask_after); + + // Try throw in some mode flags. + for (auto mode_flag : {MPOL_F_STATIC_NODES, MPOL_F_RELATIVE_NODES}) { + auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE( + ScopedSetMempolicy(MPOL_INTERLEAVE | mode_flag, &nodemask, + sizeof(nodemask) * BITS_PER_BYTE)); + mode = 0; + nodemask_after = 0x0; + ASSERT_THAT( + get_mempolicy(&mode, &nodemask_after, + sizeof(nodemask_after) * BITS_PER_BYTE, nullptr, 0), + SyscallSucceeds()); + EXPECT_EQ(MPOL_INTERLEAVE | mode_flag, mode); + EXPECT_EQ(0x1, nodemask_after); + } +} + +TEST(MempolicyTest, SetMempolicyRejectsInvalidInputs) { + auto cleanup = ScopedMempolicy(); + uint64_t nodemask; + + if (IsRunningOnGvisor()) { + // Invalid nodemask, we only support a single node on gvisor. + nodemask = 0x4; + ASSERT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask, + sizeof(nodemask) * BITS_PER_BYTE), + SyscallFailsWithErrno(EINVAL)); + } + + nodemask = 0x1; + + // Invalid mode. + ASSERT_THAT(set_mempolicy(7439, &nodemask, sizeof(nodemask) * BITS_PER_BYTE), + SyscallFailsWithErrno(EINVAL)); + + // Invalid nodemask size. + ASSERT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask, 0), + SyscallFailsWithErrno(EINVAL)); + + // Invalid mode flag. 
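+  // MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are mutually exclusive, so
+  // passing both is rejected with EINVAL (see set_mempolicy(2)).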
+ ASSERT_THAT( + set_mempolicy(MPOL_DEFAULT | MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES, + &nodemask, sizeof(nodemask) * BITS_PER_BYTE), + SyscallFailsWithErrno(EINVAL)); + + // MPOL_INTERLEAVE with empty nodemask. + nodemask = 0x0; + ASSERT_THAT(set_mempolicy(MPOL_INTERLEAVE, &nodemask, + sizeof(nodemask) * BITS_PER_BYTE), + SyscallFailsWithErrno(EINVAL)); +} + +// The manpages specify that the nodemask provided to set_mempolicy are +// considered empty if the nodemask pointer is null, or if the nodemask size is +// 0. We use a policy which accepts both empty and non-empty nodemasks +// (MPOL_PREFERRED), a policy which requires a non-empty nodemask (MPOL_BIND), +// and a policy which completely ignores the nodemask (MPOL_DEFAULT) to verify +// argument checking around nodemasks. +TEST(MempolicyTest, EmptyNodemaskOnSet) { + auto cleanup = ScopedMempolicy(); + + EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, nullptr, 1), SyscallSucceeds()); + EXPECT_THAT(set_mempolicy(MPOL_BIND, nullptr, 1), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(set_mempolicy(MPOL_PREFERRED, nullptr, 1), SyscallSucceeds()); + + uint64_t nodemask = 0x1; + EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(set_mempolicy(MPOL_BIND, &nodemask, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(set_mempolicy(MPOL_PREFERRED, &nodemask, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MempolicyTest, QueryAvailableNodes) { + uint64_t nodemask = 0; + ASSERT_THAT( + get_mempolicy(nullptr, &nodemask, sizeof(nodemask) * BITS_PER_BYTE, + nullptr, MPOL_F_MEMS_ALLOWED), + SyscallSucceeds()); + // We can only be sure there is a single node if running on gvisor. + if (IsRunningOnGvisor()) { + EXPECT_EQ(0x1, nodemask); + } + + // MPOL_F_ADDR and MPOL_F_NODE flags may not be combined with + // MPOL_F_MEMS_ALLLOWED. + for (auto flags : + {MPOL_F_MEMS_ALLOWED | MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED | MPOL_F_NODE, + MPOL_F_MEMS_ALLOWED | MPOL_F_ADDR | MPOL_F_NODE}) { + ASSERT_THAT(get_mempolicy(nullptr, &nodemask, + sizeof(nodemask) * BITS_PER_BYTE, nullptr, flags), + SyscallFailsWithErrno(EINVAL)); + } +} + +TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) { + uint64_t dummy_stack_address; + auto dummy_heap_address = absl::make_unique(); + int mode; + + for (auto ptr : {&dummy_stack_address, dummy_heap_address.get()}) { + mode = -1; + ASSERT_THAT( + get_mempolicy(&mode, nullptr, 0, ptr, MPOL_F_ADDR | MPOL_F_NODE), + SyscallSucceeds()); + // If we're not running on gvisor, the address may be allocated on a + // different numa node. + if (IsRunningOnGvisor()) { + EXPECT_EQ(0, mode); + } + } + + void* invalid_address = reinterpret_cast(-1); + + // Invalid address. + ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, invalid_address, + MPOL_F_ADDR | MPOL_F_NODE), + SyscallFailsWithErrno(EFAULT)); + + // Invalid mode pointer. + ASSERT_THAT(get_mempolicy(reinterpret_cast(invalid_address), nullptr, 0, + &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE), + SyscallFailsWithErrno(EFAULT)); +} + +TEST(MempolicyTest, GetMempolicyCanOmitPointers) { + int mode; + uint64_t nodemask; + + // Omit nodemask pointer. + ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, 0), SyscallSucceeds()); + // Omit mode pointer. + ASSERT_THAT(get_mempolicy(nullptr, &nodemask, + sizeof(nodemask) * BITS_PER_BYTE, nullptr, 0), + SyscallSucceeds()); + // Omit both pointers. 
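+  // A null mode or nodemask pointer simply means "don't report that piece of
+  // information"; it is not an error.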
+ ASSERT_THAT(get_mempolicy(nullptr, nullptr, 0, nullptr, 0), + SyscallSucceeds()); +} + +TEST(MempolicyTest, GetMempolicyNextInterleaveNode) { + int mode; + // Policy for thread not yet set to MPOL_INTERLEAVE, can't query for + // the next node which will be used for allocation. + ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, MPOL_F_NODE), + SyscallFailsWithErrno(EINVAL)); + + // Set default policy for thread to MPOL_INTERLEAVE. + uint64_t nodemask = 0x1; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy( + MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * BITS_PER_BYTE)); + + mode = -1; + ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, MPOL_F_NODE), + SyscallSucceeds()); + EXPECT_EQ(0, mode); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/mincore.cc b/test/syscalls/linux/mincore.cc new file mode 100644 index 000000000..c572bf5ec --- /dev/null +++ b/test/syscalls/linux/mincore.cc @@ -0,0 +1,96 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/memory_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +size_t CountSetLSBs(std::vector const& vec) { + return std::count_if(begin(vec), end(vec), + [](unsigned char c) { return (c & 1) != 0; }); +} + +TEST(MincoreTest, DirtyAnonPagesAreResident) { + constexpr size_t kTestPageCount = 10; + auto const kTestMappingBytes = kTestPageCount * kPageSize; + auto m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kTestMappingBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + memset(m.ptr(), 0, m.len()); + + std::vector vec(kTestPageCount, 0); + ASSERT_THAT(mincore(m.ptr(), kTestMappingBytes, vec.data()), + SyscallSucceeds()); + EXPECT_EQ(kTestPageCount, CountSetLSBs(vec)); +} + +TEST(MincoreTest, UnalignedAddressFails) { + // Map and touch two pages, then try to mincore the second half of the first + // page + the first half of the second page. Both pages are mapped, but + // mincore should return EINVAL due to the misaligned start address. + constexpr size_t kTestPageCount = 2; + auto const kTestMappingBytes = kTestPageCount * kPageSize; + auto m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kTestMappingBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + memset(m.ptr(), 0, m.len()); + + std::vector vec(kTestPageCount, 0); + EXPECT_THAT(mincore(reinterpret_cast(m.addr() + kPageSize / 2), + kPageSize, vec.data()), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MincoreTest, UnalignedLengthSucceedsAndIsRoundedUp) { + // Map and touch two pages, then try to mincore the first page + the first + // half of the second page. mincore should silently round up the length to + // include both pages. 
+ constexpr size_t kTestPageCount = 2; + auto const kTestMappingBytes = kTestPageCount * kPageSize; + auto m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kTestMappingBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + memset(m.ptr(), 0, m.len()); + + std::vector vec(kTestPageCount, 0); + ASSERT_THAT(mincore(m.ptr(), kPageSize + kPageSize / 2, vec.data()), + SyscallSucceeds()); + EXPECT_EQ(kTestPageCount, CountSetLSBs(vec)); +} + +TEST(MincoreTest, ZeroLengthSucceedsAndAllowsAnyVecBelowTaskSize) { + EXPECT_THAT(mincore(nullptr, 0, nullptr), SyscallSucceeds()); +} + +TEST(MincoreTest, InvalidLengthFails) { + EXPECT_THAT(mincore(nullptr, -1, nullptr), SyscallFailsWithErrno(ENOMEM)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc new file mode 100644 index 000000000..84db45eb3 --- /dev/null +++ b/test/syscalls/linux/mkdir.cc @@ -0,0 +1,96 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/temp_umask.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class MkdirTest : public ::testing::Test { + protected: + // SetUp creates various configurations of files. + void SetUp() override { dirname_ = NewTempAbsPath(); } + + // TearDown unlinks created files. + void TearDown() override { + // FIXME: We don't currently implement rmdir. + // We do this unconditionally because there's no harm in trying. 
+ rmdir(dirname_.c_str()); + } + + std::string dirname_; +}; + +TEST_F(MkdirTest, DISABLED_CanCreateReadbleDir) { + ASSERT_THAT(mkdir(dirname_.c_str(), 0444), SyscallSucceeds()); + ASSERT_THAT( + open(JoinPath(dirname_, "anything").c_str(), O_RDWR | O_CREAT, 0666), + SyscallFailsWithErrno(EACCES)); +} + +TEST_F(MkdirTest, CanCreateWritableDir) { + ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds()); + std::string filename = JoinPath(dirname_, "anything"); + int fd; + ASSERT_THAT(fd = open(filename.c_str(), O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + ASSERT_THAT(unlink(filename.c_str()), SyscallSucceeds()); +} + +TEST_F(MkdirTest, HonorsUmask) { + constexpr mode_t kMask = 0111; + TempUmask mask(kMask); + ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds()); + struct stat statbuf; + ASSERT_THAT(stat(dirname_.c_str(), &statbuf), SyscallSucceeds()); + EXPECT_EQ(0777 & ~kMask, statbuf.st_mode & 0777); +} + +TEST_F(MkdirTest, HonorsUmask2) { + constexpr mode_t kMask = 0142; + TempUmask mask(kMask); + ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds()); + struct stat statbuf; + ASSERT_THAT(stat(dirname_.c_str(), &statbuf), SyscallSucceeds()); + EXPECT_EQ(0777 & ~kMask, statbuf.st_mode & 0777); +} + +TEST_F(MkdirTest, FailsOnDirWithoutWritePerms) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto parent = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555)); + auto dir = JoinPath(parent.path(), "foo"); + ASSERT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc new file mode 100644 index 000000000..361ca299b --- /dev/null +++ b/test/syscalls/linux/mknod.cc @@ -0,0 +1,173 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(MknodTest, RegularFile) { + std::string const node0 = NewTempAbsPathInDir("/tmp"); + std::string const node1 = NewTempAbsPathInDir("/tmp"); + ASSERT_THAT(mknod(node0.c_str(), S_IFREG, 0), SyscallSucceeds()); + ASSERT_THAT(mknod(node1.c_str(), 0, 0), SyscallSucceeds()); +} + +TEST(MknodTest, MknodAtRegularFile) { + std::string const fifo_relpath = NewTempRelPath(); + std::string const fifo = JoinPath("/tmp", fifo_relpath); + int dirfd; + ASSERT_THAT(dirfd = open("/tmp", O_RDONLY), SyscallSucceeds()); + ASSERT_THAT(mknodat(dirfd, fifo_relpath.c_str(), S_IFIFO | S_IRUSR, 0), + SyscallSucceeds()); + EXPECT_THAT(close(dirfd), SyscallSucceeds()); + + struct stat st; + ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds()); + EXPECT_TRUE(S_ISFIFO(st.st_mode)); +} + +TEST(MknodTest, MknodOnExistingPathFails) { + std::string const file = NewTempAbsPathInDir("/tmp"); + std::string const slink = NewTempAbsPathInDir("/tmp"); + int fd; + ASSERT_THAT(fd = open(file.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + ASSERT_THAT(symlink(file.c_str(), slink.c_str()), SyscallSucceeds()); + + EXPECT_THAT(mknod(file.c_str(), S_IFREG, 0), SyscallFailsWithErrno(EEXIST)); + EXPECT_THAT(mknod(file.c_str(), S_IFIFO, 0), SyscallFailsWithErrno(EEXIST)); + EXPECT_THAT(mknod(slink.c_str(), S_IFREG, 0), SyscallFailsWithErrno(EEXIST)); + EXPECT_THAT(mknod(slink.c_str(), S_IFIFO, 0), SyscallFailsWithErrno(EEXIST)); +} + +TEST(MknodTest, UnimplementedTypesReturnError) { + if (IsRunningOnGvisor()) { + ASSERT_THAT(mknod("/tmp/a_socket", S_IFSOCK, 0), + SyscallFailsWithErrno(EOPNOTSUPP)); + } + // These will fail on linux as well since we don't have CAP_MKNOD. + ASSERT_THAT(mknod("/tmp/a_chardev", S_IFCHR, 0), + SyscallFailsWithErrno(EPERM)); + ASSERT_THAT(mknod("/tmp/a_blkdev", S_IFBLK, 0), SyscallFailsWithErrno(EPERM)); +} + +TEST(MknodTest, Fifo) { + std::string const fifo = NewTempAbsPathInDir("/tmp"); + ASSERT_THAT(mknod(fifo.c_str(), S_IFIFO | S_IRUSR | S_IWUSR, 0), + SyscallSucceeds()); + + struct stat st; + ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds()); + EXPECT_TRUE(S_ISFIFO(st.st_mode)); + + std::string msg = "some string"; + std::vector buf(512); + + // Read-end of the pipe. + ScopedThread t([&fifo, &buf, &msg]() { + int fd; + ASSERT_THAT(fd = open(fifo.c_str(), O_RDONLY), SyscallSucceeds()); + EXPECT_THAT(read(fd, buf.data(), buf.size()), + SyscallSucceedsWithValue(msg.length())); + EXPECT_EQ(msg, std::string(buf.data())); + EXPECT_THAT(close(fd), SyscallSucceeds()); + }); + + // Write-end of the pipe. + int wfd; + ASSERT_THAT(wfd = open(fifo.c_str(), O_WRONLY), SyscallSucceeds()); + EXPECT_THAT(write(wfd, msg.c_str(), msg.length()), + SyscallSucceedsWithValue(msg.length())); + EXPECT_THAT(close(wfd), SyscallSucceeds()); +} + +TEST(MknodTest, FifoOtrunc) { + std::string const fifo = NewTempAbsPathInDir("/tmp"); + ASSERT_THAT(mknod(fifo.c_str(), S_IFIFO | S_IRUSR | S_IWUSR, 0), + SyscallSucceeds()); + + struct stat st = {}; + ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds()); + EXPECT_TRUE(S_ISFIFO(st.st_mode)); + + std::string msg = "some string"; + std::vector buf(512); + // Read-end of the pipe. 
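+  // Opening a FIFO for reading blocks until a writer opens it (and vice
+  // versa), so the read end runs in its own thread while the main thread
+  // opens the write end.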
+ ScopedThread t([&fifo, &buf, &msg]() { + int fd; + ASSERT_THAT(fd = open(fifo.c_str(), O_RDONLY), SyscallSucceeds()); + EXPECT_THAT(read(fd, buf.data(), buf.size()), + SyscallSucceedsWithValue(msg.length())); + EXPECT_EQ(msg, std::string(buf.data())); + EXPECT_THAT(close(fd), SyscallSucceeds()); + }); + + int wfd; + ASSERT_THAT(wfd = open(fifo.c_str(), O_TRUNC | O_WRONLY), SyscallSucceeds()); + EXPECT_THAT(write(wfd, msg.c_str(), msg.length()), + SyscallSucceedsWithValue(msg.length())); + EXPECT_THAT(close(wfd), SyscallSucceeds()); +} + +TEST(MknodTest, FifoTruncNoOp) { + std::string const fifo = NewTempAbsPathInDir("/tmp"); + ASSERT_THAT(mknod(fifo.c_str(), S_IFIFO | S_IRUSR | S_IWUSR, 0), + SyscallSucceeds()); + + EXPECT_THAT(truncate(fifo.c_str(), 0), SyscallFailsWithErrno(EINVAL)); + + struct stat st = {}; + ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds()); + EXPECT_TRUE(S_ISFIFO(st.st_mode)); + + std::string msg = "some string"; + std::vector buf(512); + // Read-end of the pipe. + ScopedThread t([&fifo, &buf, &msg]() { + int rfd = 0; + ASSERT_THAT(rfd = open(fifo.c_str(), O_RDONLY), SyscallSucceeds()); + EXPECT_THAT(ReadFd(rfd, buf.data(), buf.size()), + SyscallSucceedsWithValue(msg.length())); + EXPECT_EQ(msg, std::string(buf.data())); + EXPECT_THAT(close(rfd), SyscallSucceeds()); + }); + + int wfd = 0; + ASSERT_THAT(wfd = open(fifo.c_str(), O_TRUNC | O_WRONLY), SyscallSucceeds()); + EXPECT_THAT(ftruncate(wfd, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(WriteFd(wfd, msg.c_str(), msg.length()), + SyscallSucceedsWithValue(msg.length())); + EXPECT_THAT(ftruncate(wfd, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(close(wfd), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc new file mode 100644 index 000000000..afe060d33 --- /dev/null +++ b/test/syscalls/linux/mmap.cc @@ -0,0 +1,1714 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/escaping.h" +#include "absl/strings/str_split.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/memory_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +using ::testing::Gt; + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr VirtualMemorySize() { + ASSIGN_OR_RETURN_ERRNO(auto contents, GetContents("/proc/self/statm")); + std::vector parts = absl::StrSplit(contents, ' '); + if (parts.empty()) { + return PosixError(EINVAL, "Unable to parse /proc/self/statm"); + } + ASSIGN_OR_RETURN_ERRNO(auto pages, Atoi(parts[0])); + return pages * getpagesize(); +} + +class MMapTest : public ::testing::Test { + protected: + // Unmap mapping, if one was made. + void TearDown() override { + if (addr_) { + EXPECT_THAT(Unmap(), SyscallSucceeds()); + } + } + + // Remembers mapping, so it can be automatically unmapped. + uintptr_t Map(uintptr_t addr, size_t length, int prot, int flags, int fd, + off_t offset) { + void* ret = + mmap(reinterpret_cast(addr), length, prot, flags, fd, offset); + + if (ret != MAP_FAILED) { + addr_ = ret; + length_ = length; + } + + return reinterpret_cast(ret); + } + + // Unmap previous mapping + int Unmap() { + if (!addr_) { + return -1; + } + + int ret = munmap(addr_, length_); + + addr_ = nullptr; + length_ = 0; + + return ret; + } + + // Msync the mapping. + int Msync() { return msync(addr_, length_, MS_SYNC); } + + // Mlock the mapping. + int Mlock() { return mlock(addr_, length_); } + + // Munlock the mapping. + int Munlock() { return munlock(addr_, length_); } + + int Protect(uintptr_t addr, size_t length, int prot) { + return mprotect(reinterpret_cast(addr), length, prot); + } + + void* addr_ = nullptr; + size_t length_ = 0; +}; + +// Matches if arg contains the same contents as std::string str. +MATCHER_P(EqualsMemory, str, "") { + if (0 == memcmp(arg, str.c_str(), str.size())) { + return true; + } + + *result_listener << "Memory did not match. Got:\n" + << absl::BytesToHexString( + std::string(static_cast(arg), str.size())) + << "Want:\n" + << absl::BytesToHexString(str); + return false; +} + +// We can't map pipes, but for different reasons. +TEST_F(MMapTest, MapPipe) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fds[0], 0), + SyscallFailsWithErrno(ENODEV)); + EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fds[1], 0), + SyscallFailsWithErrno(EACCES)); + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +// It's very common to mmap /dev/zero because anonymous mappings aren't part +// of POSIX although they are widely supported. So a zero initialized memory +// region would actually come from a "file backed" /dev/zero mapping. +TEST_F(MMapTest, MapDevZeroShared) { + // This test will verify that we're able to map a page backed by /dev/zero + // as MAP_SHARED. + const FileDescriptor dev_zero = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + + // Test that we can create a RW SHARED mapping of /dev/zero. 
+ ASSERT_THAT( + Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero.get(), 0), + SyscallSucceeds()); +} + +TEST_F(MMapTest, MapDevZeroPrivate) { + // This test will verify that we're able to map a page backed by /dev/zero + // as MAP_PRIVATE. + const FileDescriptor dev_zero = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + + // Test that we can create a RW SHARED mapping of /dev/zero. + ASSERT_THAT( + Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, dev_zero.get(), 0), + SyscallSucceeds()); +} + +TEST_F(MMapTest, MapDevZeroNoPersistence) { + // This test will verify that two independent mappings of /dev/zero do not + // appear to reference the same "backed file." + + const FileDescriptor dev_zero1 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + const FileDescriptor dev_zero2 = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + + ASSERT_THAT( + Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero1.get(), 0), + SyscallSucceeds()); + + // Create a second mapping via the second /dev/zero fd. + void* psec_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + dev_zero2.get(), 0); + ASSERT_THAT(reinterpret_cast(psec_map), SyscallSucceeds()); + + // Always unmap. + auto cleanup_psec_map = Cleanup( + [&] { EXPECT_THAT(munmap(psec_map, kPageSize), SyscallSucceeds()); }); + + // Verify that we have independently addressed pages. + ASSERT_NE(psec_map, addr_); + + std::string buf_zero(kPageSize, 0x00); + std::string buf_ones(kPageSize, 0xFF); + + // Verify the first is actually all zeros after mmap. + EXPECT_THAT(addr_, EqualsMemory(buf_zero)); + + // Let's fill in the first mapping with 0xFF. + memcpy(addr_, buf_ones.data(), kPageSize); + + // Verify that the memcpy actually stuck in the page. + EXPECT_THAT(addr_, EqualsMemory(buf_ones)); + + // Verify that it didn't affect the second page which should be all zeros. + EXPECT_THAT(psec_map, EqualsMemory(buf_zero)); +} + +TEST_F(MMapTest, MapDevZeroSharedMultiplePages) { + // This will test that we're able to map /dev/zero over multiple pages. + const FileDescriptor dev_zero = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + + // Test that we can create a RW SHARED mapping of /dev/zero. + ASSERT_THAT(Map(0, kPageSize * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE, + dev_zero.get(), 0), + SyscallSucceeds()); + + std::string buf_zero(kPageSize * 2, 0x00); + std::string buf_ones(kPageSize * 2, 0xFF); + + // Verify the two pages are actually all zeros after mmap. + EXPECT_THAT(addr_, EqualsMemory(buf_zero)); + + // Fill out the pages with all ones. + memcpy(addr_, buf_ones.data(), kPageSize * 2); + + // Verify that the memcpy actually stuck in the pages. + EXPECT_THAT(addr_, EqualsMemory(buf_ones)); +} + +TEST_F(MMapTest, MapDevZeroSharedFdNoPersistence) { + // This test will verify that two independent mappings of /dev/zero do not + // appear to reference the same "backed file" even when mapped from the + // same initial fd. + const FileDescriptor dev_zero = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + + ASSERT_THAT( + Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero.get(), 0), + SyscallSucceeds()); + + // Create a second mapping via the same fd. + void* psec_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + dev_zero.get(), 0); + ASSERT_THAT(reinterpret_cast(psec_map), SyscallSucceeds()); + + // Always unmap. 
+ auto cleanup_psec_map = Cleanup( + [&] { ASSERT_THAT(munmap(psec_map, kPageSize), SyscallSucceeds()); }); + + // Verify that we have independently addressed pages. + ASSERT_NE(psec_map, addr_); + + std::string buf_zero(kPageSize, 0x00); + std::string buf_ones(kPageSize, 0xFF); + + // Verify the first is actually all zeros after mmap. + EXPECT_THAT(addr_, EqualsMemory(buf_zero)); + + // Let's fill in the first mapping with 0xFF. + memcpy(addr_, buf_ones.data(), kPageSize); + + // Verify that the memcpy actually stuck in the page. + EXPECT_THAT(addr_, EqualsMemory(buf_ones)); + + // Verify that it didn't affect the second page which should be all zeros. + EXPECT_THAT(psec_map, EqualsMemory(buf_zero)); +} + +TEST_F(MMapTest, MapDevZeroSegfaultAfterUnmap) { + SetupGvisorDeathTest(); + + // This test will verify that we're able to map a page backed by /dev/zero + // as MAP_SHARED and after it's unmapped any access results in a SIGSEGV. + // This test is redundant but given the special nature of /dev/zero mappings + // it doesn't hurt. + const FileDescriptor dev_zero = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + + const auto rest = [&] { + // Test that we can create a RW SHARED mapping of /dev/zero. + TEST_PCHECK(Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + dev_zero.get(), + 0) != reinterpret_cast(MAP_FAILED)); + + // Confirm that accesses after the unmap result in a SIGSEGV. + // + // N.B. We depend on this process being single-threaded to ensure there + // can't be another mmap to map addr before the dereference below. + void* addr_saved = addr_; // Unmap resets addr_. + TEST_PCHECK(Unmap() == 0); + *reinterpret_cast(addr_saved) = 0xFF; + }; + + EXPECT_THAT(InForkedProcess(rest), + IsPosixErrorOkAndHolds(W_EXITCODE(0, SIGSEGV))); +} + +TEST_F(MMapTest, MapDevZeroUnaligned) { + const FileDescriptor dev_zero = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR)); + const size_t size = kPageSize + kPageSize / 2; + const std::string buf_zero(size, 0x00); + + ASSERT_THAT( + Map(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero.get(), 0), + SyscallSucceeds()); + EXPECT_THAT(addr_, EqualsMemory(buf_zero)); + ASSERT_THAT(Unmap(), SyscallSucceeds()); + + ASSERT_THAT( + Map(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, dev_zero.get(), 0), + SyscallSucceeds()); + EXPECT_THAT(addr_, EqualsMemory(buf_zero)); +} + +// We can't map _some_ character devices. +TEST_F(MMapTest, MapCharDevice) { + const FileDescriptor cdevfd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/random", 0, 0)); + EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, cdevfd.get(), 0), + SyscallFailsWithErrno(ENODEV)); +} + +// We can't map directories. +TEST_F(MMapTest, MapDirectory) { + const FileDescriptor dirfd = + ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), 0, 0)); + EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, dirfd.get(), 0), + SyscallFailsWithErrno(ENODEV)); +} + +// We can map *something* +TEST_F(MMapTest, MapAnything) { + EXPECT_THAT(Map(0, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceedsWithValue(Gt(0))); +} + +// Map length < PageSize allowed +TEST_F(MMapTest, SmallMap) { + EXPECT_THAT(Map(0, 128, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); +} + +// Hint address doesn't break anything. 
+// Note: there is no requirement we actually get the hint address +TEST_F(MMapTest, HintAddress) { + EXPECT_THAT( + Map(0x30000000, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); +} + +// MAP_FIXED gives us exactly the requested address +TEST_F(MMapTest, MapFixed) { + EXPECT_THAT(Map(0x30000000, kPageSize, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + SyscallSucceedsWithValue(0x30000000)); +} + +// 64-bit addresses work too +#ifdef __x86_64__ +TEST_F(MMapTest, MapFixed64) { + EXPECT_THAT(Map(0x300000000000, kPageSize, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + SyscallSucceedsWithValue(0x300000000000)); +} +#endif + +// MAP_STACK allowed. +// There isn't a good way to verify it did anything. +TEST_F(MMapTest, MapStack) { + EXPECT_THAT(Map(0, kPageSize, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0), + SyscallSucceeds()); +} + +// MAP_LOCKED allowed. +// There isn't a good way to verify it did anything. +TEST_F(MMapTest, MapLocked) { + EXPECT_THAT(Map(0, kPageSize, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0), + SyscallSucceeds()); +} + +// MAP_PRIVATE or MAP_SHARED must be passed +TEST_F(MMapTest, NotPrivateOrShared) { + EXPECT_THAT(Map(0, kPageSize, PROT_NONE, MAP_ANONYMOUS, -1, 0), + SyscallFailsWithErrno(EINVAL)); +} + +// Only one of MAP_PRIVATE or MAP_SHARED may be passed +TEST_F(MMapTest, PrivateAndShared) { + EXPECT_THAT(Map(0, kPageSize, PROT_NONE, + MAP_PRIVATE | MAP_SHARED | MAP_ANONYMOUS, -1, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(MMapTest, FixedAlignment) { + // Addr must be page aligned (MAP_FIXED) + EXPECT_THAT(Map(0x30000001, kPageSize, PROT_NONE, + MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0), + SyscallFailsWithErrno(EINVAL)); +} + +// Non-MAP_FIXED address does not need to be page aligned +TEST_F(MMapTest, NonFixedAlignment) { + EXPECT_THAT( + Map(0x30000001, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); +} + +// Length = 0 results in EINVAL. +TEST_F(MMapTest, InvalidLength) { + EXPECT_THAT(Map(0, 0, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallFailsWithErrno(EINVAL)); +} + +// Bad fd not allowed. +TEST_F(MMapTest, BadFd) { + EXPECT_THAT(Map(0, kPageSize, PROT_NONE, MAP_PRIVATE, 999, 0), + SyscallFailsWithErrno(EBADF)); +} + +// Mappings are writable. +TEST_F(MMapTest, ProtWrite) { + uint64_t addr; + constexpr uint8_t kFirstWord[] = {42, 42, 42, 42}; + + EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + // This shouldn't cause a SIGSEGV. + memset(reinterpret_cast(addr), 42, kPageSize); + + // The written data should actually be there. + EXPECT_EQ( + 0, memcmp(reinterpret_cast(addr), kFirstWord, sizeof(kFirstWord))); +} + +// "Write-only" mappings are writable *and* readable. +TEST_F(MMapTest, ProtWriteOnly) { + uint64_t addr; + constexpr uint8_t kFirstWord[] = {42, 42, 42, 42}; + + EXPECT_THAT( + addr = Map(0, kPageSize, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + // This shouldn't cause a SIGSEGV. + memset(reinterpret_cast(addr), 42, kPageSize); + + // The written data should actually be there. + EXPECT_EQ( + 0, memcmp(reinterpret_cast(addr), kFirstWord, sizeof(kFirstWord))); +} + +// "Write-only" mappings are readable. +// +// This is distinct from above to ensure the page is accessible even if the +// initial fault is a write fault. 
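+//
+// (In practice the hardware page tables cannot express a write-only page, so
+// PROT_WRITE effectively implies PROT_READ; that is why the read of a freshly
+// write-faulted page below is expected to succeed.)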
+TEST_F(MMapTest, ProtWriteOnlyReadable) { + uint64_t addr; + constexpr uint64_t kFirstWord = 0; + + EXPECT_THAT( + addr = Map(0, kPageSize, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + EXPECT_EQ(0, memcmp(reinterpret_cast(addr), &kFirstWord, + sizeof(kFirstWord))); +} + +// Mappings are writable after mprotect from PROT_NONE to PROT_READ|PROT_WRITE. +TEST_F(MMapTest, ProtectProtWrite) { + uint64_t addr; + constexpr uint8_t kFirstWord[] = {42, 42, 42, 42}; + + EXPECT_THAT( + addr = Map(0, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + ASSERT_THAT(Protect(addr, kPageSize, PROT_READ | PROT_WRITE), + SyscallSucceeds()); + + // This shouldn't cause a SIGSEGV. + memset(reinterpret_cast(addr), 42, kPageSize); + + // The written data should actually be there. + EXPECT_EQ( + 0, memcmp(reinterpret_cast(addr), kFirstWord, sizeof(kFirstWord))); +} + +// SIGSEGV raised when reading PROT_NONE memory +TEST_F(MMapTest, ProtNoneDeath) { + SetupGvisorDeathTest(); + + uintptr_t addr; + + ASSERT_THAT( + addr = Map(0, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + EXPECT_EXIT(*reinterpret_cast(addr), + ::testing::KilledBySignal(SIGSEGV), ""); +} + +// SIGSEGV raised when writing PROT_READ only memory +TEST_F(MMapTest, ReadOnlyDeath) { + SetupGvisorDeathTest(); + + uintptr_t addr; + + ASSERT_THAT( + addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + EXPECT_EXIT(*reinterpret_cast(addr) = 42, + ::testing::KilledBySignal(SIGSEGV), ""); +} + +// Writable mapping mprotect'd to read-only should not be writable. +TEST_F(MMapTest, MprotectReadOnlyDeath) { + SetupGvisorDeathTest(); + + uintptr_t addr; + + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + volatile int* val = reinterpret_cast(addr); + + // Copy to ensure page is mapped in. + *val = 42; + + ASSERT_THAT(Protect(addr, kPageSize, PROT_READ), SyscallSucceeds()); + + // Now it shouldn't be writable. + EXPECT_EXIT(*val = 0, ::testing::KilledBySignal(SIGSEGV), ""); +} + +// Verify that calling mprotect an address that's not page aligned fails. +TEST_F(MMapTest, MprotectNotPageAligned) { + uintptr_t addr; + + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + ASSERT_THAT(Protect(addr + 1, kPageSize - 1, PROT_READ), + SyscallFailsWithErrno(EINVAL)); +} + +// Verify that calling mprotect with an absurdly huge length fails. 
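+// (The length is rounded up to whole pages, so a length of SIZE_MAX pushes the
+// end of the protected range past the end of the address space and Linux is
+// expected to report ENOMEM rather than protect anything.)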
+TEST_F(MMapTest, MprotectHugeLength) { + uintptr_t addr; + + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + ASSERT_THAT(Protect(addr, static_cast(-1), PROT_READ), + SyscallFailsWithErrno(ENOMEM)); +} + +#if defined(__x86_64__) || defined(__i386__) +// This code is equivalent in 32 and 64-bit mode +const uint8_t machine_code[] = { + 0xb8, 0x2a, 0x00, 0x00, 0x00, // movl $42, %eax + 0xc3, // retq +}; + +// PROT_EXEC allows code execution +TEST_F(MMapTest, ProtExec) { + uintptr_t addr; + uint32_t (*func)(void); + + EXPECT_THAT(addr = Map(0, kPageSize, PROT_EXEC | PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + memcpy(reinterpret_cast(addr), machine_code, sizeof(machine_code)); + + func = reinterpret_cast(addr); + + EXPECT_EQ(42, func()); +} + +// No PROT_EXEC disallows code execution +TEST_F(MMapTest, NoProtExecDeath) { + SetupGvisorDeathTest(); + + uintptr_t addr; + uint32_t (*func)(void); + + EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + + memcpy(reinterpret_cast(addr), machine_code, sizeof(machine_code)); + + func = reinterpret_cast(addr); + + EXPECT_EXIT(func(), ::testing::KilledBySignal(SIGSEGV), ""); +} +#endif + +TEST_F(MMapTest, NoExceedLimitData) { + void* prevbrk; + void* target_brk; + struct rlimit setlim; + + prevbrk = sbrk(0); + ASSERT_NE(-1, reinterpret_cast(prevbrk)); + target_brk = reinterpret_cast(prevbrk) + 1; + + setlim.rlim_cur = RLIM_INFINITY; + setlim.rlim_max = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds()); + EXPECT_THAT(brk(target_brk), SyscallSucceedsWithValue(0)); +} + +TEST_F(MMapTest, ExceedLimitData) { + // To unit test this more precisely, we'd need access to the mm's start_brk + // and end_brk, which we don't have direct access to :/ + void* prevbrk; + void* target_brk; + struct rlimit setlim; + + prevbrk = sbrk(0); + ASSERT_NE(-1, reinterpret_cast(prevbrk)); + target_brk = reinterpret_cast(prevbrk) + 8192; + + setlim.rlim_cur = 0; + setlim.rlim_max = RLIM_INFINITY; + // Set RLIMIT_DATA very low so any subsequent brk() calls fail. + // Reset RLIMIT_DATA during teardown step. + ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds()); + EXPECT_THAT(brk(target_brk), SyscallFailsWithErrno(ENOMEM)); + // Teardown step... + setlim.rlim_cur = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds()); +} + +TEST_F(MMapTest, ExceedLimitDataPrlimit) { + // To unit test this more precisely, we'd need access to the mm's start_brk + // and end_brk, which we don't have direct access to :/ + void* prevbrk; + void* target_brk; + struct rlimit setlim; + + prevbrk = sbrk(0); + ASSERT_NE(-1, reinterpret_cast(prevbrk)); + target_brk = reinterpret_cast(prevbrk) + 8192; + + setlim.rlim_cur = 0; + setlim.rlim_max = RLIM_INFINITY; + // Set RLIMIT_DATA very low so any subsequent brk() calls fail. + // Reset RLIMIT_DATA during teardown step. + ASSERT_THAT(prlimit(0, RLIMIT_DATA, &setlim, nullptr), SyscallSucceeds()); + EXPECT_THAT(brk(target_brk), SyscallFailsWithErrno(ENOMEM)); + // Teardown step... 
+ setlim.rlim_cur = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds()); +} + +TEST_F(MMapTest, ExceedLimitDataPrlimitPID) { + // To unit test this more precisely, we'd need access to the mm's start_brk + // and end_brk, which we don't have direct access to :/ + void* prevbrk; + void* target_brk; + struct rlimit setlim; + + prevbrk = sbrk(0); + ASSERT_NE(-1, reinterpret_cast(prevbrk)); + target_brk = reinterpret_cast(prevbrk) + 8192; + + setlim.rlim_cur = 0; + setlim.rlim_max = RLIM_INFINITY; + // Set RLIMIT_DATA very low so any subsequent brk() calls fail. + // Reset RLIMIT_DATA during teardown step. + ASSERT_THAT(prlimit(syscall(__NR_gettid), RLIMIT_DATA, &setlim, nullptr), + SyscallSucceeds()); + EXPECT_THAT(brk(target_brk), SyscallFailsWithErrno(ENOMEM)); + // Teardown step... + setlim.rlim_cur = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds()); +} + +TEST_F(MMapTest, NoExceedLimitAS) { + constexpr uint64_t kAllocBytes = 200 << 20; + // Add some headroom to the AS limit in case of e.g. unexpected stack + // expansion. + constexpr uint64_t kExtraASBytes = kAllocBytes + (20 << 20); + static_assert(kAllocBytes < kExtraASBytes, + "test depends on allocation not exceeding AS limit"); + + auto vss = ASSERT_NO_ERRNO_AND_VALUE(VirtualMemorySize()); + struct rlimit setlim; + setlim.rlim_cur = vss + kExtraASBytes; + setlim.rlim_max = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_AS, &setlim), SyscallSucceeds()); + EXPECT_THAT( + Map(0, kAllocBytes, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceedsWithValue(Gt(0))); +} + +TEST_F(MMapTest, ExceedLimitAS) { + constexpr uint64_t kAllocBytes = 200 << 20; + // Add some headroom to the AS limit in case of e.g. unexpected stack + // expansion. + constexpr uint64_t kExtraASBytes = 20 << 20; + static_assert(kAllocBytes > kExtraASBytes, + "test depends on allocation exceeding AS limit"); + + auto vss = ASSERT_NO_ERRNO_AND_VALUE(VirtualMemorySize()); + struct rlimit setlim; + setlim.rlim_cur = vss + kExtraASBytes; + setlim.rlim_max = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_AS, &setlim), SyscallSucceeds()); + EXPECT_THAT( + Map(0, kAllocBytes, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallFailsWithErrno(ENOMEM)); +} + +// Tests that setting an anonymous mmap to PROT_NONE doesn't free the memory. +TEST_F(MMapTest, SettingProtNoneDoesntFreeMemory) { + uintptr_t addr; + constexpr uint8_t kFirstWord[] = {42, 42, 42, 42}; + + EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceedsWithValue(Gt(0))); + + memset(reinterpret_cast(addr), 42, kPageSize); + + ASSERT_THAT(Protect(addr, kPageSize, PROT_NONE), SyscallSucceeds()); + ASSERT_THAT(Protect(addr, kPageSize, PROT_READ | PROT_WRITE), + SyscallSucceeds()); + + // The written data should still be there. + EXPECT_EQ( + 0, memcmp(reinterpret_cast(addr), kFirstWord, sizeof(kFirstWord))); +} + +constexpr char kFileContents[] = "Hello World!"; + +class MMapFileTest : public MMapTest { + protected: + FileDescriptor fd_; + std::string filename_; + + // Open a file for read/write + void SetUp() override { + MMapTest::SetUp(); + + filename_ = NewTempAbsPath(); + fd_ = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_CREAT | O_RDWR, 0644)); + + // Extend file so it can be written once mapped. Deliberately make the file + // only half a page in size, so we can test what happens when we access the + // second half. + // Use ftruncate(2) once the sentry supports it. 
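+ // A sketch of what that simpler setup would look like (not used yet):
+ //
+ //   ASSERT_THAT(ftruncate(fd_.get(), kPageSize / 2), SyscallSucceeds());
+ //
+ // Until then, extend the file by writing one byte at a time: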
+ char zero = 0; + size_t count = 0; + do { + const DisableSave ds; // saving 2048 times is slow and useless. + Write(&zero, 1), SyscallSucceedsWithValue(1); + } while (++count < (kPageSize / 2)); + ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + } + + // Close and delete file + void TearDown() override { + MMapTest::TearDown(); + fd_.reset(); // Make sure the files is closed before we unlink it. + ASSERT_THAT(unlink(filename_.c_str()), SyscallSucceeds()); + } + + ssize_t Read(char* buf, size_t count) { + ssize_t len = 0; + do { + ssize_t ret = read(fd_.get(), buf, count); + if (ret < 0) { + return ret; + } else if (ret == 0) { + return len; + } + + len += ret; + buf += ret; + } while (len < static_cast(count)); + + return len; + } + + ssize_t Write(const char* buf, size_t count) { + ssize_t len = 0; + do { + ssize_t ret = write(fd_.get(), buf, count); + if (ret < 0) { + return ret; + } else if (ret == 0) { + return len; + } + + len += ret; + buf += ret; + } while (len < static_cast(count)); + + return len; + } +}; + +// MAP_POPULATE allowed. +// There isn't a good way to verify it actually did anything. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, MapPopulate) { + ASSERT_THAT( + Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0), + SyscallSucceeds()); +} + +// MAP_POPULATE on a short file. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, MapPopulateShort) { + ASSERT_THAT(Map(0, 2 * kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, + fd_.get(), 0), + SyscallSucceeds()); +} + +// Read contents from mapped file. +TEST_F(MMapFileTest, Read) { + size_t len = strlen(kFileContents); + ASSERT_EQ(len, Write(kFileContents, len)); + + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd_.get(), 0), + SyscallSucceeds()); + + EXPECT_THAT(reinterpret_cast(addr), + EqualsMemory(std::string(kFileContents))); +} + +// Map at an offset. +TEST_F(MMapFileTest, MapOffset) { + ASSERT_THAT(lseek(fd_.get(), kPageSize, SEEK_SET), SyscallSucceeds()); + + size_t len = strlen(kFileContents); + ASSERT_EQ(len, Write(kFileContents, len)); + + uintptr_t addr; + ASSERT_THAT( + addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd_.get(), kPageSize), + SyscallSucceeds()); + + EXPECT_THAT(reinterpret_cast(addr), + EqualsMemory(std::string(kFileContents))); +} + +TEST_F(MMapFileTest, MapOffsetBeyondEnd) { + SetupGvisorDeathTest(); + + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd_.get(), 10 * kPageSize), + SyscallSucceeds()); + + // Touching the memory causes SIGBUS. + size_t len = strlen(kFileContents); + EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr)), + ::testing::KilledBySignal(SIGBUS), ""); +} + +// Verify mmap fails when sum of length and offset overflows. +TEST_F(MMapFileTest, MapLengthPlusOffsetOverflows) { + const size_t length = static_cast(-kPageSize); + const off_t offset = kPageSize; + ASSERT_THAT(Map(0, length, PROT_READ, MAP_PRIVATE, fd_.get(), offset), + SyscallFailsWithErrno(ENOMEM)); +} + +// MAP_PRIVATE PROT_WRITE is allowed on read-only FDs. +TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_RDONLY)); + + uintptr_t addr; + EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd.get(), 0), + SyscallSucceeds()); + + // Touch the page to ensure the kernel didn't lie about writability. 
+ size_t len = strlen(kFileContents); + std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr)); +} + +// MAP_PRIVATE PROT_READ is not allowed on write-only FDs. +TEST_F(MMapFileTest, ReadPrivateOnWriteOnlyFd) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); + + uintptr_t addr; + EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd.get(), 0), + SyscallFailsWithErrno(EACCES)); +} + +// MAP_SHARED PROT_WRITE not allowed on read-only FDs. +TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_RDONLY)); + + uintptr_t addr; + EXPECT_THAT( + addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0), + SyscallFailsWithErrno(EACCES)); +} + +// MAP_SHARED PROT_READ not allowed on write-only FDs. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); + + uintptr_t addr; + EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd.get(), 0), + SyscallFailsWithErrno(EACCES)); +} + +// MAP_SHARED PROT_WRITE not allowed on write-only FDs. +// The FD must always be readable. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, WriteSharedOnWriteOnlyFd) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); + + uintptr_t addr; + EXPECT_THAT(addr = Map(0, kPageSize, PROT_WRITE, MAP_SHARED, fd.get(), 0), + SyscallFailsWithErrno(EACCES)); +} + +// Overwriting the contents of a file mapped MAP_SHARED PROT_READ +// should cause the new data to be reflected in the mapping. +TEST_F(MMapFileTest, ReadSharedConsistentWithOverwrite) { + // Start from scratch. + EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); + + // Expand the file to two pages and dirty them. + std::string bufA(kPageSize, 'a'); + ASSERT_THAT(Write(bufA.c_str(), bufA.size()), + SyscallSucceedsWithValue(bufA.size())); + std::string bufB(kPageSize, 'b'); + ASSERT_THAT(Write(bufB.c_str(), bufB.size()), + SyscallSucceedsWithValue(bufB.size())); + + // Map the page. + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + + // Check that the mapping contains the right file data. + EXPECT_EQ(0, memcmp(reinterpret_cast(addr), bufA.c_str(), kPageSize)); + EXPECT_EQ(0, memcmp(reinterpret_cast(addr + kPageSize), bufB.c_str(), + kPageSize)); + + // Start at the beginning of the file. + ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + + // Swap the write pattern. + ASSERT_THAT(Write(bufB.c_str(), bufB.size()), + SyscallSucceedsWithValue(bufB.size())); + ASSERT_THAT(Write(bufA.c_str(), bufA.size()), + SyscallSucceedsWithValue(bufA.size())); + + // Check that the mapping got updated. + EXPECT_EQ(0, memcmp(reinterpret_cast(addr), bufB.c_str(), kPageSize)); + EXPECT_EQ(0, memcmp(reinterpret_cast(addr + kPageSize), bufA.c_str(), + kPageSize)); +} + +// Partially overwriting a file mapped MAP_SHARED PROT_READ should be reflected +// in the mapping. +TEST_F(MMapFileTest, ReadSharedConsistentWithPartialOverwrite) { + // Start from scratch. + EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); + + // Expand the file to two pages and dirty them. 
+ std::string bufA(kPageSize, 'a'); + ASSERT_THAT(Write(bufA.c_str(), bufA.size()), + SyscallSucceedsWithValue(bufA.size())); + std::string bufB(kPageSize, 'b'); + ASSERT_THAT(Write(bufB.c_str(), bufB.size()), + SyscallSucceedsWithValue(bufB.size())); + + // Map the page. + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + + // Check that the mapping contains the right file data. + EXPECT_EQ(0, memcmp(reinterpret_cast(addr), bufA.c_str(), kPageSize)); + EXPECT_EQ(0, memcmp(reinterpret_cast(addr + kPageSize), bufB.c_str(), + kPageSize)); + + // Start at the beginning of the file. + ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + + // Do a partial overwrite, spanning both pages. + std::string bufC(kPageSize + (kPageSize / 2), 'c'); + ASSERT_THAT(Write(bufC.c_str(), bufC.size()), + SyscallSucceedsWithValue(bufC.size())); + + // Check that the mapping got updated. + EXPECT_EQ(0, memcmp(reinterpret_cast(addr), bufC.c_str(), + kPageSize + (kPageSize / 2))); + EXPECT_EQ(0, + memcmp(reinterpret_cast(addr + kPageSize + (kPageSize / 2)), + bufB.c_str(), kPageSize / 2)); +} + +// Overwriting a file mapped MAP_SHARED PROT_READ should be reflected in the +// mapping and the file. +TEST_F(MMapFileTest, ReadSharedConsistentWithWriteAndFile) { + // Start from scratch. + EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); + + // Expand the file to two full pages and dirty it. + std::string bufA(2 * kPageSize, 'a'); + ASSERT_THAT(Write(bufA.c_str(), bufA.size()), + SyscallSucceedsWithValue(bufA.size())); + + // Map only the first page. + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + + // Prepare to overwrite the file contents. + ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + + // Overwrite everything, beyond the mapped portion. + std::string bufB(2 * kPageSize, 'b'); + ASSERT_THAT(Write(bufB.c_str(), bufB.size()), + SyscallSucceedsWithValue(bufB.size())); + + // What the mapped portion should now look like. + std::string bufMapped(kPageSize, 'b'); + + // Expect that the mapped portion is consistent. + EXPECT_EQ( + 0, memcmp(reinterpret_cast(addr), bufMapped.c_str(), kPageSize)); + + // Prepare to read the entire file contents. + ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + + // Expect that the file was fully updated. + std::vector bufFile(2 * kPageSize); + ASSERT_THAT(Read(bufFile.data(), bufFile.size()), + SyscallSucceedsWithValue(bufFile.size())); + // Cast to void* to avoid EXPECT_THAT assuming bufFile.data() is a + // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C + // std::string, possibly overruning the buffer. + EXPECT_THAT(reinterpret_cast(bufFile.data()), EqualsMemory(bufB)); +} + +// Write data to mapped file. +TEST_F(MMapFileTest, WriteShared) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + size_t len = strlen(kFileContents); + memcpy(reinterpret_cast(addr), kFileContents, len); + + // The file may not actually be updated until munmap is called. + ASSERT_THAT(Unmap(), SyscallSucceeds()); + + std::vector buf(len); + ASSERT_THAT(Read(buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a + // NUL-terminated C std::string. 
EXPECT_THAT will try to print a char* as a C + // std::string, possibly overruning the buffer. + EXPECT_THAT(reinterpret_cast(buf.data()), + EqualsMemory(std::string(kFileContents))); +} + +// Write data to portion of mapped page beyond the end of the file. +// These writes are not reflected in the file. +TEST_F(MMapFileTest, WriteSharedBeyondEnd) { + // The file is only half of a page. We map an entire page. Writes to the + // end of the mapping must not be reflected in the file. + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + // First half; this is reflected in the file. + std::string first(kPageSize / 2, 'A'); + memcpy(reinterpret_cast(addr), first.c_str(), first.size()); + + // Second half; this is not reflected in the file. + std::string second(kPageSize / 2, 'B'); + memcpy(reinterpret_cast(addr + kPageSize / 2), second.c_str(), + second.size()); + + // The file may not actually be updated until munmap is called. + ASSERT_THAT(Unmap(), SyscallSucceeds()); + + // Big enough to fit the entire page, if the writes are mistakenly written to + // the file. + std::vector buf(kPageSize); + + // Only the first half is in the file. + ASSERT_THAT(Read(buf.data(), buf.size()), + SyscallSucceedsWithValue(first.size())); + // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a + // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C + // std::string, possibly overruning the buffer. + EXPECT_THAT(reinterpret_cast(buf.data()), EqualsMemory(first)); +} + +// The portion of a mapped page that becomes part of the file after a truncate +// is reflected in the file. +TEST_F(MMapFileTest, WriteSharedTruncateUp) { + // The file is only half of a page. We map an entire page. Writes to the + // end of the mapping must not be reflected in the file. + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + // First half; this is reflected in the file. + std::string first(kPageSize / 2, 'A'); + memcpy(reinterpret_cast(addr), first.c_str(), first.size()); + + // Second half; this is not reflected in the file now (see + // WriteSharedBeyondEnd), but will be after the truncate. + std::string second(kPageSize / 2, 'B'); + memcpy(reinterpret_cast(addr + kPageSize / 2), second.c_str(), + second.size()); + + // Extend the file to a full page. The second half of the page will be + // reflected in the file. + EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds()); + + // The file may not actually be updated until munmap is called. + ASSERT_THAT(Unmap(), SyscallSucceeds()); + + // The whole page is in the file. + std::vector buf(kPageSize); + ASSERT_THAT(Read(buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a + // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C + // std::string, possibly overruning the buffer. + EXPECT_THAT(reinterpret_cast(buf.data()), EqualsMemory(first)); + EXPECT_THAT(reinterpret_cast(buf.data() + kPageSize / 2), + EqualsMemory(second)); +} + +TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) { + // Start from scratch. + EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); + + // Expand the file to a full page and dirty it. + std::string buf(kPageSize, 'a'); + ASSERT_THAT(Write(buf.c_str(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Map the page. 
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Check that the memory contains the file data.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize));
+
+ // Truncate down, then up.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds());
+
+ // Check that the memory was zeroed.
+ std::string zeroed(kPageSize, '\0');
+ EXPECT_EQ(0,
+ memcmp(reinterpret_cast<void*>(addr), zeroed.c_str(), kPageSize));
+
+ // The file may not actually be updated until msync is called.
+ ASSERT_THAT(Msync(), SyscallSucceeds());
+
+ // Prepare to read the entire file contents.
+ ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Expect that the file is fully updated.
+ std::vector<char> bufFile(kPageSize);
+ ASSERT_THAT(Read(bufFile.data(), bufFile.size()),
+ SyscallSucceedsWithValue(bufFile.size()));
+ EXPECT_EQ(0, memcmp(bufFile.data(), zeroed.c_str(), kPageSize));
+}
+
+TEST_F(MMapFileTest, WriteSharedTruncateDownThenUp) {
+ // The file is only half of a page. We map an entire page. Writes to the
+ // end of the mapping must not be reflected in the file.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // First half; this will be deleted by truncate(0).
+ std::string first(kPageSize / 2, 'A');
+ memcpy(reinterpret_cast<void*>(addr), first.c_str(), first.size());
+
+ // Truncate down, then up.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds());
+
+ // The whole page is zeroed in memory.
+ std::string zeroed(kPageSize, '\0');
+ EXPECT_EQ(0,
+ memcmp(reinterpret_cast<void*>(addr), zeroed.c_str(), kPageSize));
+
+ // The file may not actually be updated until munmap is called.
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ // The whole file is also zeroed.
+ std::vector<char> buf(kPageSize);
+ ASSERT_THAT(Read(buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a
+ // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C
+ // std::string, possibly overrunning the buffer.
+ EXPECT_THAT(reinterpret_cast<void*>(buf.data()), EqualsMemory(zeroed));
+}
+
+TEST_F(MMapFileTest, ReadSharedTruncateSIGBUS) {
+ SetupGvisorDeathTest();
+
+ // Start from scratch.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Expand the file to a full page and dirty it.
+ std::string buf(kPageSize, 'a');
+ ASSERT_THAT(Write(buf.c_str(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Map the page.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Check that the mapping contains the file data.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize));
+
+ // Truncate down.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Accessing the truncated region should cause a SIGBUS.
+ std::vector<char> in(kPageSize);
+ EXPECT_EXIT(
+ std::copy(reinterpret_cast<char*>(addr),
+ reinterpret_cast<char*>(addr) + kPageSize, in.data()),
+ ::testing::KilledBySignal(SIGBUS), "");
+}
+
+TEST_F(MMapFileTest, WriteSharedTruncateSIGBUS) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Touch the memory to be sure it really is mapped.
+ size_t len = strlen(kFileContents); + memcpy(reinterpret_cast(addr), kFileContents, len); + + // Truncate down. + EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); + + // Accessing the truncated file should cause a SIGBUS. + EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr)), + ::testing::KilledBySignal(SIGBUS), ""); +} + +TEST_F(MMapFileTest, ReadSharedTruncatePartialPage) { + // Start from scratch. + EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); + + // Dirty the file. + std::string buf(kPageSize, 'a'); + ASSERT_THAT(Write(buf.c_str(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Map a page. + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + + // Truncate to half of the page. + EXPECT_THAT(ftruncate(fd_.get(), kPageSize / 2), SyscallSucceeds()); + + // First half of the page untouched. + EXPECT_EQ(0, + memcmp(reinterpret_cast(addr), buf.data(), kPageSize / 2)); + + // Second half is zeroed. + std::string zeroed(kPageSize / 2, '\0'); + EXPECT_EQ(0, memcmp(reinterpret_cast(addr + kPageSize / 2), + zeroed.c_str(), kPageSize / 2)); +} + +// Page can still be accessed and contents are intact after truncating a partial +// page. +TEST_F(MMapFileTest, WriteSharedTruncatePartialPage) { + // Expand the file to a full page. + EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds()); + + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + // Fill the entire page. + std::string contents(kPageSize, 'A'); + memcpy(reinterpret_cast(addr), contents.c_str(), contents.size()); + + // Truncate half of the page. + EXPECT_THAT(ftruncate(fd_.get(), kPageSize / 2), SyscallSucceeds()); + + // First half of the page untouched. + EXPECT_EQ(0, memcmp(reinterpret_cast(addr), contents.c_str(), + kPageSize / 2)); + + // Second half zeroed. + std::string zeroed(kPageSize / 2, '\0'); + EXPECT_EQ(0, memcmp(reinterpret_cast(addr + kPageSize / 2), + zeroed.c_str(), kPageSize / 2)); +} + +// MAP_PRIVATE writes are not carried through to the underlying file. +TEST_F(MMapFileTest, WritePrivate) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd_.get(), 0), + SyscallSucceeds()); + + size_t len = strlen(kFileContents); + memcpy(reinterpret_cast(addr), kFileContents, len); + + // The file should not be updated, but if it mistakenly is, it may not be + // until after munmap is called. + ASSERT_THAT(Unmap(), SyscallSucceeds()); + + std::vector buf(len); + ASSERT_THAT(Read(buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a + // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C + // std::string, possibly overruning the buffer. + EXPECT_THAT(reinterpret_cast(buf.data()), + EqualsMemory(std::string(len, '\0'))); +} + +// SIGBUS raised when writing past end of file to a private mapping. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, SigBusDeathWritePrivate) { + SetupGvisorDeathTest(); + + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd_.get(), 0), + SyscallSucceeds()); + + // MMapFileTest makes a file kPageSize/2 long. The entire first page will be + // accessible. Write just beyond that. 
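+ // (addr + kPageSize is the first byte of the second mapped page, which lies
+ // entirely beyond EOF, so touching it is expected to raise SIGBUS.)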
+ size_t len = strlen(kFileContents); + EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr + kPageSize)), + ::testing::KilledBySignal(SIGBUS), ""); +} + +// SIGBUS raised when reading past end of file on a shared mapping. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, SigBusDeathReadShared) { + SetupGvisorDeathTest(); + + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + + // MMapFileTest makes a file kPageSize/2 long. The entire first page will be + // accessible. Read just beyond that. + std::vector in(kPageSize); + EXPECT_EXIT( + std::copy(reinterpret_cast(addr + kPageSize), + reinterpret_cast(addr + kPageSize) + kPageSize, + in.data()), + ::testing::KilledBySignal(SIGBUS), ""); +} + +// SIGBUS raised when reading past end of file on a shared mapping. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, SigBusDeathWriteShared) { + SetupGvisorDeathTest(); + + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + // MMapFileTest makes a file kPageSize/2 long. The entire first page will be + // accessible. Write just beyond that. + size_t len = strlen(kFileContents); + EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr + kPageSize)), + ::testing::KilledBySignal(SIGBUS), ""); +} + +// Tests that SIGBUS is not raised when writing to a file-mapped page before +// EOF, even if part of the mapping extends beyond EOF. +TEST_F(MMapFileTest, NoSigBusOnPagesBeforeEOF) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd_.get(), 0), + SyscallSucceeds()); + + // The test passes if this survives. + size_t len = strlen(kFileContents); + std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr)); +} + +// Tests that SIGBUS is not raised when writing to a file-mapped page containing +// EOF, *after* the EOF for a private mapping. +TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWritePrivate) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd_.get(), 0), + SyscallSucceeds()); + + // The test passes if this survives. (Technically addr+kPageSize/2 is already + // beyond EOF, but +1 to check for fencepost errors.) + size_t len = strlen(kFileContents); + std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr + (kPageSize / 2) + 1)); +} + +// Tests that SIGBUS is not raised when reading from a file-mapped page +// containing EOF, *after* the EOF for a shared mapping. +// +// FIXME: Parameterize. +TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + + // The test passes if this survives. (Technically addr+kPageSize/2 is already + // beyond EOF, but +1 to check for fencepost errors.) + auto* start = reinterpret_cast(addr + (kPageSize / 2) + 1); + size_t len = strlen(kFileContents); + std::vector in(len); + std::copy(start, start + len, in.data()); +} + +// Tests that SIGBUS is not raised when writing to a file-mapped page containing +// EOF, *after* the EOF for a shared mapping. +// +// FIXME: Parameterize. 
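+//
+// (Only pages that lie wholly beyond EOF fault; the page that straddles EOF is
+// mapped in full, so accesses past EOF that stay within that last page are
+// expected to succeed.)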
+TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWriteShared) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + // The test passes if this survives. (Technically addr+kPageSize/2 is already + // beyond EOF, but +1 to check for fencepost errors.) + size_t len = strlen(kFileContents); + std::copy(kFileContents, kFileContents + len, + reinterpret_cast(addr + (kPageSize / 2) + 1)); +} + +// Tests that reading from writable shared file-mapped pages succeeds. +// +// On most platforms this is trivial, but when the file is mapped via the sentry +// page cache (which does not yet support writing to shared mappings), a bug +// caused reads to fail unnecessarily on such mappings. +TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) { + uintptr_t addr; + size_t len = strlen(kFileContents); + + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + std::vector buf(kPageSize); + // The test passes if this survives. + std::copy(reinterpret_cast(addr), + reinterpret_cast(addr) + len, buf.data()); +} + +// Tests that EFAULT is returned when invoking a syscall that requires the OS to +// read past end of file (resulting in a fault in sentry context in the gVisor +// case). +TEST_F(MMapFileTest, InternalSigBus) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd_.get(), 0), + SyscallSucceeds()); + + // This depends on the fact that gVisor implements pipes internally. + int pipefd[2]; + ASSERT_THAT(pipe(pipefd), SyscallSucceeds()); + EXPECT_THAT( + write(pipefd[1], reinterpret_cast(addr + kPageSize), kPageSize), + SyscallFailsWithErrno(EFAULT)); + + EXPECT_THAT(close(pipefd[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipefd[1]), SyscallSucceeds()); +} + +// Like InternalSigBus, but test the WriteZerosAt path by reading from +// /dev/zero to a shared mapping (so that the SIGBUS isn't caught during +// copy-on-write breaking). +TEST_F(MMapFileTest, InternalSigBusZeroing) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + + const FileDescriptor dev_zero = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY)); + EXPECT_THAT(read(dev_zero.get(), reinterpret_cast(addr + kPageSize), + kPageSize), + SyscallFailsWithErrno(EFAULT)); +} + +// Checks that mmaps with a length of uint64_t(-PAGE_SIZE + 1) or greater do not +// induce a sentry panic (due to "rounding up" to 0). +TEST_F(MMapTest, HugeLength) { + EXPECT_THAT(Map(0, static_cast(-kPageSize + 1), PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallFailsWithErrno(ENOMEM)); +} + +// Tests for a specific gVisor MM caching bug. +TEST_F(MMapTest, AccessCOWInvalidatesCachedSegments) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); + auto zero_fd = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY)); + + // Get a two-page private mapping and fill it with 1s. + uintptr_t addr; + ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + SyscallSucceeds()); + memset(addr_, 1, 2 * kPageSize); + MaybeSave(); + + // Fork to make the mapping copy-on-write. + pid_t const pid = fork(); + if (pid == 0) { + // The child process waits for the parent to SIGKILL it. 
+ while (true) { + pause(); + } + } + ASSERT_THAT(pid, SyscallSucceeds()); + auto cleanup_child = Cleanup([&] { + EXPECT_THAT(kill(pid, SIGKILL), SyscallSucceeds()); + int status; + EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + }); + + // Induce a read-only Access of the first page of the mapping, which will not + // cause a copy. The usermem.Segment should be cached. + ASSERT_THAT(PwriteFd(fd.get(), addr_, kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + + // Induce a writable Access of both pages of the mapping. This should + // invalidate the cached Segment. + ASSERT_THAT(PreadFd(zero_fd.get(), addr_, 2 * kPageSize, 0), + SyscallSucceedsWithValue(2 * kPageSize)); + + // Induce a read-only Access of the first page of the mapping again. It should + // read the 0s that were stored in the mapping by the read from /dev/zero. If + // the read failed to invalidate the cached Segment, it will instead read the + // 1s in the stale page. + ASSERT_THAT(PwriteFd(fd.get(), addr_, kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + std::vector buf(kPageSize); + ASSERT_THAT(PreadFd(fd.get(), buf.data(), kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + for (size_t i = 0; i < kPageSize; i++) { + ASSERT_EQ(0, buf[i]) << "at offset " << i; + } +} + +TEST_F(MMapTest, NoReserve) { + const size_t kSize = 10 * 1 << 20; // 10M + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0), + SyscallSucceeds()); + EXPECT_GT(addr, 0); + + // Check that every page can be read/written. Technically, writing to memory + // could SIGSEGV in case there is no more memory available. In gVisor it + // would never happen though because NORESERVE is ignored. In Linux, it's + // possible to fail, but allocation is small enough that it's highly likely + // to succeed. + for (size_t j = 0; j < kSize; j += kPageSize) { + EXPECT_EQ(0, reinterpret_cast(addr)[j]); + reinterpret_cast(addr)[j] = j; + } +} + +// Map more than the gVisor page-cache map unit (64k) and ensure that +// it is consistent with reading from the file. +TEST_F(MMapFileTest, Bug38498194) { + // Choose a sufficiently large map unit. + constexpr int kSize = 4 * 1024 * 1024; + EXPECT_THAT(ftruncate(fd_.get(), kSize), SyscallSucceeds()); + + // Map a large enough region so that multiple internal segments + // are created to back the mapping. + uintptr_t addr; + ASSERT_THAT( + addr = Map(0, kSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + + std::vector expect(kSize, 'a'); + std::copy(expect.data(), expect.data() + expect.size(), + reinterpret_cast(addr)); + + // Trigger writeback for gVisor. In Linux pages stay cached until + // it can't hold onto them anymore. + ASSERT_THAT(Unmap(), SyscallSucceeds()); + + std::vector buf(kSize); + ASSERT_THAT(Read(buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + EXPECT_EQ(buf, expect) << std::string(buf.data(), buf.size()); +} + +// Tests that reading from a file to a memory mapping of the same file does not +// deadlock. +TEST_F(MMapFileTest, SelfRead) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_.get(), 0), + SyscallSucceeds()); + EXPECT_THAT(Read(reinterpret_cast(addr), kPageSize / 2), + SyscallSucceedsWithValue(kPageSize / 2)); + // The resulting file contents are poorly-specified and irrelevant. +} + +// Tests that writing to a file from a memory mapping of the same file does not +// deadlock. 
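+//
+// (The concern is lock ordering: write(2) must fault in its source buffer, and
+// that buffer is backed by the very file being written, so an implementation
+// that holds file locks across the copy could deadlock.)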
+TEST_F(MMapFileTest, SelfWrite) { + uintptr_t addr; + ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), + SyscallSucceeds()); + EXPECT_THAT(Write(reinterpret_cast(addr), kPageSize / 2), + SyscallSucceedsWithValue(kPageSize / 2)); + // The resulting file contents are poorly-specified and irrelevant. +} + +TEST(MMapDeathTest, TruncateAfterCOWBreak) { + SetupGvisorDeathTest(); + + // Create and map a single-page file. + auto const temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDWR)); + ASSERT_THAT(ftruncate(fd.get(), kPageSize), SyscallSucceeds()); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd.get(), 0)); + + // Write to this mapping, causing the page to be copied for write. + memset(mapping.ptr(), 'a', mapping.len()); + MaybeSave(); // Trigger a co-operative save cycle. + + // Truncate the file and expect it to invalidate the copied page. + ASSERT_THAT(ftruncate(fd.get(), 0), SyscallSucceeds()); + EXPECT_EXIT(*reinterpret_cast(mapping.ptr()), + ::testing::KilledBySignal(SIGBUS), ""); +} + +// Conditional on MAP_32BIT. +#ifdef __x86_64__ + +TEST(MMapNoFixtureTest, Map32Bit) { + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE | MAP_32BIT)); + EXPECT_LT(mapping.addr(), static_cast(1) << 32); + EXPECT_LE(mapping.endaddr(), static_cast(1) << 32); +} + +#endif // defined(__x86_64__) + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc new file mode 100644 index 000000000..76da8b75a --- /dev/null +++ b/test/syscalls/linux/mount.cc @@ -0,0 +1,302 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "absl/time/time.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/mount_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(MountTest, MountBadFilesystem) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // Linux expects a valid target before it checks the file system name. 
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(mount("", dir.path().c_str(), "foobar", 0, ""), + SyscallFailsWithErrno(ENODEV)); +} + +TEST(MountTest, MountInvalidTarget) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = NewTempAbsPath(); + EXPECT_THAT(mount("", dir.c_str(), "tmpfs", 0, ""), + SyscallFailsWithErrno(ENOENT)); +} + +TEST(MountTest, MountPermDenied) { + // Clear CAP_SYS_ADMIN. + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))) { + EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false)); + } + + // Linux expects a valid target before checking capability. + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(mount("", dir.path().c_str(), "", 0, ""), + SyscallFailsWithErrno(EPERM)); +} + +TEST(MountTest, UmountPermDenied) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = + ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "", 0)); + + // Drop privileges in another thread, so we can still unmount the mounted + // directory. + ScopedThread([&]() { + EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false)); + EXPECT_THAT(umount(dir.path().c_str()), SyscallFailsWithErrno(EPERM)); + }); +} + +TEST(MountTest, MountOverBusy) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777)); + + // Should be able to mount over a busy directory. + ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "", 0)); +} + +TEST(MountTest, OpenFileBusy) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0)); + auto const fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777)); + + // An open file should prevent unmounting. + EXPECT_THAT(umount(dir.path().c_str()), SyscallFailsWithErrno(EBUSY)); +} + +TEST(MountTest, UmountDetach) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // structure: + // + // dir (mount point) + // subdir + // file + // + // We show that we can walk around in the mount after detach-unmount dir. + // + // We show that even though dir is unreachable from outside the mount, we can + // still reach dir's (former) parent! + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + const struct stat before = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path())); + auto mount = + ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "mode=0700", + /* umountflags= */ MNT_DETACH)); + const struct stat after = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path())); + EXPECT_NE(before.st_ino, after.st_ino); + + // Create files in the new mount. + constexpr char kContents[] = "no no no"; + auto const subdir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path())); + auto const file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), kContents, 0777)); + + auto const dir_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(subdir.path(), O_RDONLY | O_DIRECTORY)); + auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + // Unmount the tmpfs. 
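+  // Release()() below hands back the deferred cleanup and runs it now, which
+  // amounts to umount2(dir.path, MNT_DETACH). A detached ("lazy") unmount
+  // removes the mount from the namespace immediately -- dir reverts to its old
+  // inode -- but the tmpfs itself stays alive until the last reference (our
+  // open fds) is dropped, which is what the fd-relative walks below show.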
+ mount.Release()(); + + const struct stat after2 = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path())); + EXPECT_EQ(before.st_ino, after2.st_ino); + + // Can still read file after unmounting. + std::vector buf(sizeof(kContents)); + EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()), SyscallSucceeds()); + + // Walk to dir. + auto const mounted_dir = ASSERT_NO_ERRNO_AND_VALUE( + OpenAt(dir_fd.get(), "..", O_DIRECTORY | O_RDONLY)); + // Walk to dir/file. + auto const fd_again = ASSERT_NO_ERRNO_AND_VALUE( + OpenAt(mounted_dir.get(), std::string(Basename(file.path())), O_RDONLY)); + + std::vector buf2(sizeof(kContents)); + EXPECT_THAT(ReadFd(fd_again.get(), buf2.data(), buf2.size()), + SyscallSucceeds()); + EXPECT_EQ(buf, buf2); + + // Walking outside the unmounted realm should still work, too! + auto const dir_parent = ASSERT_NO_ERRNO_AND_VALUE( + OpenAt(mounted_dir.get(), "..", O_DIRECTORY | O_RDONLY)); +} + +TEST(MountTest, ActiveSubmountBusy) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount1 = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0)); + + auto const dir2 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path())); + auto const mount2 = + ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir2.path(), "tmpfs", 0, "", 0)); + + // Since dir now has an active submount, should not be able to unmount. + EXPECT_THAT(umount(dir.path().c_str()), SyscallFailsWithErrno(EBUSY)); +} + +TEST(MountTest, MountTmpfs) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const struct stat before = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path())); + + { + auto const mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0)); + + const struct stat s = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path())); + EXPECT_EQ(s.st_mode, S_IFDIR | 0700); + EXPECT_NE(s.st_ino, before.st_ino); + + EXPECT_NO_ERRNO(Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777)); + } + + // Now that dir is unmounted again, we should have the old inode back. + const struct stat after = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path())); + EXPECT_EQ(before.st_ino, after.st_ino); +} + +TEST(MountTest, MountTmpfsMagicValIgnored) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + auto const mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", MS_MGC_VAL, "mode=0700", 0)); +} + +// Passing nullptr to data is equivalent to "". 
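+// The data argument is interpreted by the filesystem, not by mount(2) itself;
+// for tmpfs it is a comma-separated option string such as "mode=0700" (only
+// "mode=" is exercised in this file). So the following two calls are expected
+// to behave identically:
+//
+//   mount("", dir, "tmpfs", 0, nullptr);
+//   mount("", dir, "tmpfs", 0, "");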
+TEST(MountTest, NullData) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + EXPECT_THAT(mount("", dir.path().c_str(), "tmpfs", 0, nullptr), + SyscallSucceeds()); + EXPECT_THAT(umount2(dir.path().c_str(), 0), SyscallSucceeds()); +} + +TEST(MountTest, MountReadonly) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", MS_RDONLY, "mode=0777", 0)); + + const struct stat s = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path())); + EXPECT_EQ(s.st_mode, S_IFDIR | 0777); + + std::string const filename = JoinPath(dir.path(), "foo"); + EXPECT_THAT(open(filename.c_str(), O_RDWR | O_CREAT, 0777), + SyscallFailsWithErrno(EROFS)); +} + +PosixErrorOr ATime(absl::string_view file) { + struct stat s = {}; + if (stat(std::string(file).c_str(), &s) == -1) { + return PosixError(errno, "stat failed"); + } + return absl::TimeFromTimespec(s.st_atim); +} + +// FIXME: Disabled until tmpfs stops using Handle, as only the gofer +// and host file system respect the MS_NOATIME flag. +TEST(MountTest, DISABLED_MountNoAtime) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", MS_NOATIME, "mode=0777", 0)); + + std::string const contents = "No no no, don't follow the instructions!"; + auto const file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), contents, 0777)); + + absl::Time const before = ASSERT_NO_ERRNO_AND_VALUE(ATime(file.path())); + + // Reading from the file should change the atime, but the MS_NOATIME flag + // should prevent that. + auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + char buf[100]; + int read_n; + ASSERT_THAT(read_n = read(fd.get(), buf, sizeof(buf)), SyscallSucceeds()); + EXPECT_EQ(std::string(buf, read_n), contents); + + absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(ATime(file.path())); + + // Expect that atime hasn't changed. + EXPECT_EQ(before, after); +} + +TEST(MountTest, RenameRemoveMountPoint) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir_parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir_parent.path())); + auto const new_dir = NewTempAbsPath(); + + auto const mount = + ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "", 0)); + + ASSERT_THAT(rename(dir.path().c_str(), new_dir.c_str()), + SyscallFailsWithErrno(EBUSY)); + + ASSERT_THAT(rmdir(dir.path().c_str()), SyscallFailsWithErrno(EBUSY)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/mremap.cc b/test/syscalls/linux/mremap.cc new file mode 100644 index 000000000..ededab336 --- /dev/null +++ b/test/syscalls/linux/mremap.cc @@ -0,0 +1,514 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include "gmock/gmock.h" +#include "absl/strings/string_view.h" +#include "test/util/file_descriptor.h" +#include "test/util/logging.h" +#include "test/util/memory_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +using ::testing::_; + +namespace gvisor { +namespace testing { + +namespace { + +// Wrapper for mremap that returns a PosixErrorOr<>, since the return type of +// void* isn't directly compatible with SyscallSucceeds. +PosixErrorOr Mremap(void* old_address, size_t old_size, size_t new_size, + int flags, void* new_address) { + void* rv = mremap(old_address, old_size, new_size, flags, new_address); + if (rv == MAP_FAILED) { + return PosixError(errno, "mremap failed"); + } + return rv; +} + +// Returns true if the page containing addr is mapped. +bool IsMapped(uintptr_t addr) { + int const rv = msync(reinterpret_cast(addr & ~(kPageSize - 1)), + kPageSize, MS_ASYNC); + if (rv == 0) { + return true; + } + TEST_PCHECK_MSG(errno == ENOMEM, "msync failed with unexpected errno"); + return false; +} + +// Fixture for mremap tests parameterized by mmap flags. +using MremapParamTest = ::testing::TestWithParam; + +TEST_P(MremapParamTest, Noop) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam())); + + ASSERT_THAT(Mremap(m.ptr(), kPageSize, kPageSize, 0, nullptr), + IsPosixErrorOkAndHolds(m.ptr())); + EXPECT_TRUE(IsMapped(m.addr())); +} + +TEST_P(MremapParamTest, InPlace_ShrinkingWholeVMA) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // N.B. we must be in a single-threaded subprocess to ensure a + // background thread doesn't concurrently map the second page. + void* addr = mremap(m.ptr(), 2 * kPageSize, kPageSize, 0, nullptr); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == m.ptr()); + MaybeSave(); + + TEST_CHECK(IsMapped(m.addr())); + TEST_CHECK(!IsMapped(m.addr() + kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, InPlace_ShrinkingPartialVMA) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + void* addr = mremap(m.ptr(), 2 * kPageSize, kPageSize, 0, nullptr); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == m.ptr()); + MaybeSave(); + + TEST_CHECK(IsMapped(m.addr())); + TEST_CHECK(!IsMapped(m.addr() + kPageSize)); + TEST_CHECK(IsMapped(m.addr() + 2 * kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, InPlace_ShrinkingAcrossVMAs) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_READ, GetParam())); + // Changing permissions on the first page forces it to become a separate vma. 
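+  // The kernel keeps one vma per contiguous range with identical attributes,
+  // so changing protection on just the first page splits the original
+  // three-page region into [page 0] PROT_NONE and [pages 1-2] PROT_READ. In
+  // /proc/self/maps that shows up as two adjacent entries (addresses and
+  // flag columns illustrative):
+  //
+  //   7f0000000000-7f0000001000 ---p ...
+  //   7f0000001000-7f0000003000 r--p ...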
+ ASSERT_THAT(mprotect(m.ptr(), kPageSize, PROT_NONE), SyscallSucceeds()); + + const auto rest = [&] { + // Both old_size and new_size now span two vmas; mremap + // shouldn't care. + void* addr = mremap(m.ptr(), 3 * kPageSize, 2 * kPageSize, 0, nullptr); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == m.ptr()); + MaybeSave(); + + TEST_CHECK(IsMapped(m.addr())); + TEST_CHECK(IsMapped(m.addr() + kPageSize)); + TEST_CHECK(!IsMapped(m.addr() + 2 * kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, InPlace_ExpansionSuccess) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unmap the second page so that the first can be expanded back into it. + // + // N.B. we must be in a single-threaded subprocess to ensure a + // background thread doesn't concurrently map this page. + TEST_PCHECK( + munmap(reinterpret_cast(m.addr() + kPageSize), kPageSize) == 0); + MaybeSave(); + + void* addr = mremap(m.ptr(), kPageSize, 2 * kPageSize, 0, nullptr); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == m.ptr()); + MaybeSave(); + + TEST_CHECK(IsMapped(m.addr())); + TEST_CHECK(IsMapped(m.addr() + kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, InPlace_ExpansionFailure) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unmap the second page, leaving a one-page hole. Trying to expand the + // first page to three pages should fail since the original third page + // is still mapped. + TEST_PCHECK( + munmap(reinterpret_cast(m.addr() + kPageSize), kPageSize) == 0); + MaybeSave(); + + void* addr = mremap(m.ptr(), kPageSize, 3 * kPageSize, 0, nullptr); + TEST_CHECK_MSG(addr == MAP_FAILED, "mremap unexpectedly succeeded"); + TEST_PCHECK_MSG(errno == ENOMEM, "mremap failed with wrong errno"); + MaybeSave(); + + TEST_CHECK(IsMapped(m.addr())); + TEST_CHECK(!IsMapped(m.addr() + kPageSize)); + TEST_CHECK(IsMapped(m.addr() + 2 * kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, MayMove_Expansion) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unmap the second page, leaving a one-page hole. Trying to expand the + // first page to three pages with MREMAP_MAYMOVE should force the + // mapping to be relocated since the original third page is still + // mapped. 
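+    // Contrast with InPlace_ExpansionFailure above: without MREMAP_MAYMOVE the
+    // same layout makes mremap fail with ENOMEM, because growing in place
+    // would collide with the still-mapped third page. With MREMAP_MAYMOVE the
+    // kernel is free to pick a fresh address instead, roughly:
+    //
+    //   mremap(old, kPageSize, 3 * kPageSize, 0, nullptr)               // ENOMEM
+    //   mremap(old, kPageSize, 3 * kPageSize, MREMAP_MAYMOVE, nullptr)  // new addr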
+ TEST_PCHECK( + munmap(reinterpret_cast(m.addr() + kPageSize), kPageSize) == 0); + MaybeSave(); + + void* addr2 = + mremap(m.ptr(), kPageSize, 3 * kPageSize, MREMAP_MAYMOVE, nullptr); + TEST_PCHECK_MSG(addr2 != MAP_FAILED, "mremap failed"); + MaybeSave(); + + const Mapping m2 = Mapping(addr2, 3 * kPageSize); + TEST_CHECK(m.addr() != m2.addr()); + + TEST_CHECK(!IsMapped(m.addr())); + TEST_CHECK(!IsMapped(m.addr() + kPageSize)); + TEST_CHECK(IsMapped(m.addr() + 2 * kPageSize)); + TEST_CHECK(IsMapped(m2.addr())); + TEST_CHECK(IsMapped(m2.addr() + kPageSize)); + TEST_CHECK(IsMapped(m2.addr() + 2 * kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, Fixed_SourceAndDestinationCannotOverlap) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam())); + + ASSERT_THAT(Mremap(m.ptr(), kPageSize, kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, m.ptr()), + PosixErrorIs(EINVAL, _)); + EXPECT_TRUE(IsMapped(m.addr())); +} + +TEST_P(MremapParamTest, Fixed_SameSize) { + Mapping const src = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam())); + Mapping const dst = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unmap dst to create a hole. + TEST_PCHECK(munmap(dst.ptr(), kPageSize) == 0); + MaybeSave(); + + void* addr = mremap(src.ptr(), kPageSize, kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == dst.ptr()); + MaybeSave(); + + TEST_CHECK(!IsMapped(src.addr())); + TEST_CHECK(IsMapped(dst.addr())); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, Fixed_SameSize_Unmapping) { + // Like the Fixed_SameSize case, but expect mremap to unmap the destination + // automatically. + Mapping const src = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam())); + Mapping const dst = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + void* addr = mremap(src.ptr(), kPageSize, kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == dst.ptr()); + MaybeSave(); + + TEST_CHECK(!IsMapped(src.addr())); + TEST_CHECK(IsMapped(dst.addr())); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, Fixed_ShrinkingWholeVMA) { + Mapping const src = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam())); + Mapping const dst = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unmap dst so we can check that mremap does not keep the + // second page. 
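+    // Unmapping dst here is only so the IsMapped() probes below are
+    // meaningful: MREMAP_FIXED itself, like MAP_FIXED, atomically replaces
+    // whatever is currently mapped at new_address (Fixed_SameSize_Unmapping
+    // above relies on exactly that). Shrinking while moving means only
+    // new_size bytes land at dst; the remainder of src is simply unmapped.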
+ TEST_PCHECK(munmap(dst.ptr(), 2 * kPageSize) == 0); + MaybeSave(); + + void* addr = mremap(src.ptr(), 2 * kPageSize, kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == dst.ptr()); + MaybeSave(); + + TEST_CHECK(!IsMapped(src.addr())); + TEST_CHECK(!IsMapped(src.addr() + kPageSize)); + TEST_CHECK(IsMapped(dst.addr())); + TEST_CHECK(!IsMapped(dst.addr() + kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, Fixed_ShrinkingPartialVMA) { + Mapping const src = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam())); + Mapping const dst = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unmap dst so we can check that mremap does not keep the + // second page. + TEST_PCHECK(munmap(dst.ptr(), 2 * kPageSize) == 0); + MaybeSave(); + + void* addr = mremap(src.ptr(), 2 * kPageSize, kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == dst.ptr()); + MaybeSave(); + + TEST_CHECK(!IsMapped(src.addr())); + TEST_CHECK(!IsMapped(src.addr() + kPageSize)); + TEST_CHECK(IsMapped(src.addr() + 2 * kPageSize)); + TEST_CHECK(IsMapped(dst.addr())); + TEST_CHECK(!IsMapped(dst.addr() + kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, Fixed_ShrinkingAcrossVMAs) { + Mapping const src = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_READ, GetParam())); + // Changing permissions on the first page forces it to become a separate vma. + ASSERT_THAT(mprotect(src.ptr(), kPageSize, PROT_NONE), SyscallSucceeds()); + Mapping const dst = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unlike flags=0, MREMAP_FIXED requires that [old_address, + // old_address+new_size) only spans a single vma. + void* addr = mremap(src.ptr(), 3 * kPageSize, 2 * kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()); + TEST_CHECK_MSG(addr == MAP_FAILED, "mremap unexpectedly succeeded"); + TEST_PCHECK_MSG(errno == EFAULT, "mremap failed with wrong errno"); + MaybeSave(); + + TEST_CHECK(IsMapped(src.addr())); + TEST_CHECK(IsMapped(src.addr() + kPageSize)); + // Despite failing, mremap should have unmapped [old_address+new_size, + // old_address+old_size) (i.e. the third page). + TEST_CHECK(!IsMapped(src.addr() + 2 * kPageSize)); + // Despite failing, mremap should have unmapped the destination pages. + TEST_CHECK(!IsMapped(dst.addr())); + TEST_CHECK(!IsMapped(dst.addr() + kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST_P(MremapParamTest, Fixed_Expansion) { + Mapping const src = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam())); + Mapping const dst = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam())); + + const auto rest = [&] { + // Unmap dst so we can check that mremap actually maps all pages + // at the destination. 
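+    // What backs the extra page depends on the mapping type: for MAP_PRIVATE
+    // anonymous memory the new tail is fresh zero-filled pages, while for
+    // MAP_SHARED anonymous memory the tail lies beyond the size of the backing
+    // object fixed at mmap() time, so touching it raises SIGBUS (the
+    // MremapDeathTest.SharedAnon case below demonstrates this). Here the pages
+    // are PROT_NONE and only probed via IsMapped(), so both flavors pass.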
+ TEST_PCHECK(munmap(dst.ptr(), 2 * kPageSize) == 0); + MaybeSave(); + + void* addr = mremap(src.ptr(), kPageSize, 2 * kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()); + TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed"); + TEST_CHECK(addr == dst.ptr()); + MaybeSave(); + + TEST_CHECK(!IsMapped(src.addr())); + TEST_CHECK(IsMapped(dst.addr())); + TEST_CHECK(IsMapped(dst.addr() + kPageSize)); + }; + + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +INSTANTIATE_TEST_CASE_P(PrivateShared, MremapParamTest, + ::testing::Values(MAP_PRIVATE, MAP_SHARED)); + +// mremap with old_size == 0 only works with MAP_SHARED after Linux 4.14 +// (dba58d3b8c50 "mm/mremap: fail map duplication attempts for private +// mappings"). + +TEST(MremapTest, InPlace_Copy) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_SHARED)); + EXPECT_THAT(Mremap(m.ptr(), 0, kPageSize, 0, nullptr), + PosixErrorIs(ENOMEM, _)); +} + +TEST(MremapTest, MayMove_Copy) { + Mapping const m = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_SHARED)); + + // Remainder of this test executes in a subprocess to ensure that if mremap + // incorrectly removes m, it is not remapped by another thread. + const auto rest = [&] { + void* ptr = mremap(m.ptr(), 0, kPageSize, MREMAP_MAYMOVE, nullptr); + MaybeSave(); + TEST_PCHECK_MSG(ptr != MAP_FAILED, "mremap failed"); + TEST_CHECK(ptr != m.ptr()); + TEST_CHECK(IsMapped(m.addr())); + TEST_CHECK(IsMapped(reinterpret_cast(ptr))); + }; + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +TEST(MremapTest, MustMove_Copy) { + Mapping const src = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_SHARED)); + Mapping const dst = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); + + // Remainder of this test executes in a subprocess to ensure that if mremap + // incorrectly removes src, it is not remapped by another thread. + const auto rest = [&] { + void* ptr = mremap(src.ptr(), 0, kPageSize, MREMAP_MAYMOVE | MREMAP_FIXED, + dst.ptr()); + MaybeSave(); + TEST_PCHECK_MSG(ptr != MAP_FAILED, "mremap failed"); + TEST_CHECK(ptr == dst.ptr()); + TEST_CHECK(IsMapped(src.addr())); + TEST_CHECK(IsMapped(dst.addr())); + }; + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +void ExpectAllBytesAre(absl::string_view v, char c) { + for (size_t i = 0; i < v.size(); i++) { + ASSERT_EQ(v[i], c) << "at offset " << i; + } +} + +TEST(MremapTest, ExpansionPreservesCOWPagesAndExposesNewFilePages) { + // Create a file with 3 pages. The first is filled with 'a', the second is + // filled with 'b', and the third is filled with 'c'. + TempPath const file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + ASSERT_THAT(WriteFd(fd.get(), std::string(kPageSize, 'a').c_str(), kPageSize), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(WriteFd(fd.get(), std::string(kPageSize, 'b').c_str(), kPageSize), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(WriteFd(fd.get(), std::string(kPageSize, 'c').c_str(), kPageSize), + SyscallSucceedsWithValue(kPageSize)); + + // Create a private mapping of the first 2 pages, and fill the second page + // with 'd'. 
+ Mapping const src = ASSERT_NO_ERRNO_AND_VALUE(Mmap(nullptr, 2 * kPageSize, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd.get(), 0)); + memset(reinterpret_cast(src.addr() + kPageSize), 'd', kPageSize); + MaybeSave(); + + // Move the mapping while expanding it to 3 pages. The resulting mapping + // should contain the original first page of the file (filled with 'a'), + // followed by the private copy of the second page (filled with 'd'), followed + // by the newly-mapped third page of the file (filled with 'c'). + Mapping const dst = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(3 * kPageSize, PROT_NONE, MAP_PRIVATE)); + ASSERT_THAT(Mremap(src.ptr(), 2 * kPageSize, 3 * kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()), + IsPosixErrorOkAndHolds(dst.ptr())); + auto const v = dst.view(); + ExpectAllBytesAre(v.substr(0, kPageSize), 'a'); + ExpectAllBytesAre(v.substr(kPageSize, kPageSize), 'd'); + ExpectAllBytesAre(v.substr(2 * kPageSize, kPageSize), 'c'); +} + +TEST(MremapDeathTest, SharedAnon) { + SetupGvisorDeathTest(); + + // Reserve 4 pages of address space. + Mapping const reserved = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(4 * kPageSize, PROT_NONE, MAP_PRIVATE)); + + // Create a 2-page shared anonymous mapping at the beginning of the + // reservation. Fill the first page with 'a' and the second with 'b'. + Mapping const m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(reserved.ptr(), 2 * kPageSize, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0)); + memset(m.ptr(), 'a', kPageSize); + memset(reinterpret_cast(m.addr() + kPageSize), 'b', kPageSize); + MaybeSave(); + + // Shrink the mapping to 1 page in-place. + ASSERT_THAT(Mremap(m.ptr(), 2 * kPageSize, kPageSize, 0, m.ptr()), + IsPosixErrorOkAndHolds(m.ptr())); + + // Expand the mapping to 3 pages, moving it forward by 1 page in the process + // since the old and new mappings can't overlap. + void* const new_m = reinterpret_cast(m.addr() + kPageSize); + ASSERT_THAT(Mremap(m.ptr(), kPageSize, 3 * kPageSize, + MREMAP_MAYMOVE | MREMAP_FIXED, new_m), + IsPosixErrorOkAndHolds(new_m)); + + // The first 2 pages of the mapping should still contain the data we wrote + // (i.e. shrinking should not have discarded the second page's data), while + // touching the third page should raise SIGBUS. + auto const v = + absl::string_view(static_cast(new_m), 3 * kPageSize); + ExpectAllBytesAre(v.substr(0, kPageSize), 'a'); + ExpectAllBytesAre(v.substr(kPageSize, kPageSize), 'b'); + EXPECT_EXIT(ExpectAllBytesAre(v.substr(2 * kPageSize, kPageSize), '\0'), + ::testing::KilledBySignal(SIGBUS), ""); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc new file mode 100644 index 000000000..0ddc621aa --- /dev/null +++ b/test/syscalls/linux/msync.cc @@ -0,0 +1,145 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
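+// msync(2) flushes modifications made through a MAP_SHARED file mapping back
+// to the underlying file. The pattern being exercised below is, roughly
+// (names illustrative):
+//
+//   void* p = mmap(nullptr, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+//   memcpy(p, data, len);
+//   msync(p, len, MS_SYNC);   // block until the dirty pages are written back
+//
+// MS_ASYNC merely schedules the writeback, and a flags value of 0 is also
+// accepted; the tests in this file sweep those combinations over several
+// kinds of mappings.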
+ +#include +#include + +#include +#include +#include +#include + +#include "test/util/file_descriptor.h" +#include "test/util/memory_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Parameters for msync tests. Use a std::tuple so we can use +// ::testing::Combine. +using MsyncTestParam = + std::tuple()> // returns mapping to + // msync + >; + +class MsyncParameterizedTest : public ::testing::TestWithParam { + protected: + int msync_flags() const { return std::get<0>(GetParam()); } + + PosixErrorOr GetMapping() const { + auto rv = std::get<1>(GetParam())(); + return rv; + } +}; + +// All valid msync(2) flag combinations (not including MS_INVALIDATE, which +// gVisor doesn't implement). +constexpr std::initializer_list kMsyncFlags = {MS_SYNC, MS_ASYNC, 0}; + +// Returns functions that return mappings that should be successfully +// msync()able. +std::vector()>> SyncableMappings() { + std::vector()>> funcs; + for (bool const writable : {false, true}) { + for (int const mflags : {MAP_PRIVATE, MAP_SHARED}) { + int const prot = PROT_READ | (writable ? PROT_WRITE : 0); + int const oflags = O_CREAT | (writable ? O_RDWR : O_RDONLY); + funcs.push_back([=] { + return MmapAnon(kPageSize, prot, mflags); + }); + funcs.push_back([=]() -> PosixErrorOr { + std::string const path = NewTempAbsPath(); + ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, oflags, 0644)); + // Don't unlink the file since that breaks save/restore. Just let the + // test infrastructure clean up all of our temporary files when we're + // done. + return Mmap(nullptr, kPageSize, prot, mflags, fd.get(), 0); + }); + } + } + return funcs; +} + +PosixErrorOr NoMappings() { + return PosixError(EINVAL, "unexpected attempt to create a mapping"); +} + +// "Fixture" for msync tests that hold for all valid flags, but do not create +// mappings. +using MsyncNoMappingTest = MsyncParameterizedTest; + +TEST_P(MsyncNoMappingTest, UnmappedAddressWithZeroLengthSucceeds) { + EXPECT_THAT(msync(nullptr, 0, msync_flags()), SyscallSucceeds()); +} + +TEST_P(MsyncNoMappingTest, UnmappedAddressWithNonzeroLengthFails) { + EXPECT_THAT(msync(nullptr, kPageSize, msync_flags()), + SyscallFailsWithErrno(ENOMEM)); +} + +INSTANTIATE_TEST_CASE_P(All, MsyncNoMappingTest, + ::testing::Combine(::testing::ValuesIn(kMsyncFlags), + ::testing::Values(NoMappings))); + +// "Fixture" for msync tests that are not parameterized by msync flags, but do +// create mappings. +using MsyncNoFlagsTest = MsyncParameterizedTest; + +TEST_P(MsyncNoFlagsTest, BothSyncAndAsyncFails) { + auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); + EXPECT_THAT(msync(m.ptr(), m.len(), MS_SYNC | MS_ASYNC), + SyscallFailsWithErrno(EINVAL)); +} + +INSTANTIATE_TEST_CASE_P( + All, MsyncNoFlagsTest, + ::testing::Combine(::testing::Values(0), // ignored + ::testing::ValuesIn(SyncableMappings()))); + +// "Fixture" for msync tests parameterized by both msync flags and sources of +// mappings. 
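+// Beyond the happy path, these parameterized cases pin down msync's argument
+// rules: the address must be page-aligned, while the length may be unaligned
+// (it is in effect rounded up to a page), so for a page-aligned p:
+//
+//   msync(p, len - 1, flags)                                 // succeeds
+//   msync(reinterpret_cast<void*>(p + 1), len - 1, flags)    // EINVAL
+//
+// as the Unaligned* tests below check for every flag/mapping combination.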
+using MsyncFullParamTest = MsyncParameterizedTest; + +TEST_P(MsyncFullParamTest, NormallySucceeds) { + auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); + EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags()), SyscallSucceeds()); +} + +TEST_P(MsyncFullParamTest, UnalignedLengthSucceeds) { + auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); + EXPECT_THAT(msync(m.ptr(), m.len() - 1, msync_flags()), SyscallSucceeds()); +} + +TEST_P(MsyncFullParamTest, UnalignedAddressFails) { + auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); + EXPECT_THAT( + msync(reinterpret_cast(m.addr() + 1), m.len() - 1, msync_flags()), + SyscallFailsWithErrno(EINVAL)); +} + +INSTANTIATE_TEST_CASE_P( + All, MsyncFullParamTest, + ::testing::Combine(::testing::ValuesIn(kMsyncFlags), + ::testing::ValuesIn(SyncableMappings()))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/munmap.cc b/test/syscalls/linux/munmap.cc new file mode 100644 index 000000000..e20039950 --- /dev/null +++ b/test/syscalls/linux/munmap.cc @@ -0,0 +1,53 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class MunmapTest : public ::testing::Test { + protected: + void SetUp() override { + m_ = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, m_); + } + + void* m_ = nullptr; +}; + +TEST_F(MunmapTest, HappyCase) { + EXPECT_THAT(munmap(m_, kPageSize), SyscallSucceeds()); +} + +TEST_F(MunmapTest, ZeroLength) { + EXPECT_THAT(munmap(m_, 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(MunmapTest, LastPageRoundUp) { + // Attempt to unmap up to and including the last page. + EXPECT_THAT(munmap(m_, static_cast(-kPageSize + 1)), + SyscallFailsWithErrno(EINVAL)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc new file mode 100644 index 000000000..5770680cd --- /dev/null +++ b/test/syscalls/linux/open.cc @@ -0,0 +1,340 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// This test is currently very rudimentary. +// +// There are plenty of extra cases to cover once the sentry supports them. +// +// Different types of opens: +// * O_CREAT +// * O_DIRECTORY +// * O_NOFOLLOW +// * O_PATH <- Will we ever support this? +// +// Special operations on open: +// * O_EXCL +// +// Special files: +// * Blocking behavior for a named pipe. +// +// Different errors: +// * EACCES +// * EEXIST +// * ENAMETOOLONG +// * ELOOP +// * ENOTDIR +// * EPERM +class OpenTest : public FileTest { + void SetUp() override { + FileTest::SetUp(); + + ASSERT_THAT( + write(test_file_fd_.get(), test_data_.c_str(), test_data_.length()), + SyscallSucceedsWithValue(test_data_.length())); + EXPECT_THAT(lseek(test_file_fd_.get(), 0, SEEK_SET), SyscallSucceeds()); + } + + public: + const std::string test_data_ = "hello world\n"; +}; + +TEST_F(OpenTest, ReadOnly) { + char buf; + const FileDescriptor ro_file = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); + + EXPECT_THAT(read(ro_file.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_THAT(lseek(ro_file.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(write(ro_file.get(), &buf, 1), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(OpenTest, WriteOnly) { + char buf; + const FileDescriptor wo_file = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_WRONLY)); + + EXPECT_THAT(read(wo_file.get(), &buf, 1), SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(lseek(wo_file.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(write(wo_file.get(), &buf, 1), SyscallSucceedsWithValue(1)); +} + +TEST_F(OpenTest, ReadWrite) { + char buf; + const FileDescriptor rw_file = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + EXPECT_THAT(read(rw_file.get(), &buf, 1), SyscallSucceedsWithValue(1)); + EXPECT_THAT(lseek(rw_file.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(write(rw_file.get(), &buf, 1), SyscallSucceedsWithValue(1)); +} + +TEST_F(OpenTest, RelPath) { + auto name = std::string(Basename(test_file_name_)); + + ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name, O_RDONLY)); +} + +TEST_F(OpenTest, AbsPath) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); +} + +TEST_F(OpenTest, AtRelPath) { + auto name = std::string(Basename(test_file_name_)); + const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE( + Open(GetAbsoluteTestTmpdir(), O_RDONLY | O_DIRECTORY)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(OpenAt(dirfd.get(), name, O_RDONLY)); +} + +TEST_F(OpenTest, AtAbsPath) { + const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE( + Open(GetAbsoluteTestTmpdir(), O_RDONLY | O_DIRECTORY)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(OpenAt(dirfd.get(), test_file_name_, O_RDONLY)); +} + +TEST_F(OpenTest, OpenNoFollowSymlink) { + const std::string link_path = JoinPath(GetAbsoluteTestTmpdir(), "link"); + ASSERT_THAT(symlink(test_file_name_.c_str(), link_path.c_str()), + SyscallSucceeds()); + auto cleanup = Cleanup([link_path]() { + 
EXPECT_THAT(unlink(link_path.c_str()), SyscallSucceeds()); + }); + + // Open will succeed without O_NOFOLLOW and fails with O_NOFOLLOW. + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open(link_path, O_RDONLY)); + ASSERT_THAT(open(link_path.c_str(), O_RDONLY | O_NOFOLLOW), + SyscallFailsWithErrno(ELOOP)); +} + +TEST_F(OpenTest, OpenNoFollowStillFollowsLinksInPath) { + // We will create the following structure: + // tmp_folder/real_folder/file + // tmp_folder/sym_folder -> tmp_folder/real_folder + // + // We will then open tmp_folder/sym_folder/file with O_NOFOLLOW and it + // should succeed as O_NOFOLLOW only applies to the final path component. + auto tmp_path = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(GetAbsoluteTestTmpdir())); + auto sym_path = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), tmp_path.path())); + auto file_path = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(tmp_path.path())); + + auto path_via_symlink = JoinPath(sym_path.path(), Basename(file_path.path())); + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open(path_via_symlink, O_RDONLY | O_NOFOLLOW)); +} + +TEST_F(OpenTest, Fault) { + char* totally_not_null = nullptr; + ASSERT_THAT(open(totally_not_null, O_RDONLY), SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(OpenTest, AppendOnly) { + // First write some data to the fresh file. + const int64_t kBufSize = 1024; + std::vector buf(kBufSize, 'a'); + + FileDescriptor fd0 = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR)); + + std::fill(buf.begin(), buf.end(), 'a'); + EXPECT_THAT(WriteFd(fd0.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + fd0.reset(); // Close the file early. + + // Next get two handles to the same file. We open two files because we want + // to make sure that appending is respected between them. + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_APPEND)); + EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_APPEND)); + EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + + // Then try to write to the first file and make sure the bytes are appended. + EXPECT_THAT(WriteFd(fd1.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Check that the size of the file is correct and that the offset has been + // incremented to that size. + struct stat s0; + EXPECT_THAT(fstat(fd1.get(), &s0), SyscallSucceeds()); + EXPECT_EQ(s0.st_size, kBufSize * 2); + EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), + SyscallSucceedsWithValue(kBufSize * 2)); + + // Then try to write to the second file and make sure the bytes are appended. + EXPECT_THAT(WriteFd(fd2.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Check that the size of the file is correct and that the offset has been + // incremented to that size. + struct stat s1; + EXPECT_THAT(fstat(fd2.get(), &s1), SyscallSucceeds()); + EXPECT_EQ(s1.st_size, kBufSize * 3); + EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR), + SyscallSucceedsWithValue(kBufSize * 3)); +} + +TEST_F(OpenTest, Truncate) { + { + // First write some data to the new file and close it. 
+ FileDescriptor fd0 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_WRONLY)); + std::vector orig(10, 'a'); + EXPECT_THAT(WriteFd(fd0.get(), orig.data(), orig.size()), + SyscallSucceedsWithValue(orig.size())); + } + + // Then open with truncate and verify that offset is set to 0. + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_TRUNC)); + EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + + // Then write less data to the file and ensure the old content is gone. + std::vector want(5, 'b'); + EXPECT_THAT(WriteFd(fd1.get(), want.data(), want.size()), + SyscallSucceedsWithValue(want.size())); + + struct stat stat; + EXPECT_THAT(fstat(fd1.get(), &stat), SyscallSucceeds()); + EXPECT_EQ(stat.st_size, want.size()); + EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), + SyscallSucceedsWithValue(want.size())); + + // Read the data and ensure only the latest write is in the file. + std::vector got(want.size() + 1, 'c'); + ASSERT_THAT(pread(fd1.get(), got.data(), got.size(), 0), + SyscallSucceedsWithValue(want.size())); + EXPECT_EQ(memcmp(want.data(), got.data(), want.size()), 0) + << "rbuf=" << got.data(); + EXPECT_EQ(got.back(), 'c'); // Last byte should not have been modified. +} + +TEST_F(OpenTest, NameTooLong) { + char buf[4097] = {}; + memset(buf, 'a', 4097); + EXPECT_THAT(open(buf, O_RDONLY), SyscallFailsWithErrno(ENAMETOOLONG)); +} + +TEST_F(OpenTest, DotsFromRoot) { + const FileDescriptor rootfd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/", O_RDONLY | O_DIRECTORY)); + const FileDescriptor other_rootfd = + ASSERT_NO_ERRNO_AND_VALUE(OpenAt(rootfd.get(), "..", O_RDONLY)); +} + +TEST_F(OpenTest, DirectoryWritableFails) { + ASSERT_THAT(open(GetAbsoluteTestTmpdir().c_str(), O_RDWR), + SyscallFailsWithErrno(EISDIR)); +} + +TEST_F(OpenTest, FileNotDirectory) { + // Create a file and try to open it with O_DIRECTORY. + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(open(file.path().c_str(), O_RDONLY | O_DIRECTORY), + SyscallFailsWithErrno(ENOTDIR)); +} + +TEST_F(OpenTest, Null) { + char c = '\0'; + ASSERT_THAT(open(&c, O_RDONLY), SyscallFailsWithErrno(ENOENT)); +} + +// NOTE: While the man pages specify that this behavior should be +// undefined, Linux truncates the file on opening read only if we have write +// permission, so we will too. +TEST_F(OpenTest, CanTruncateReadOnly) { + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY | O_TRUNC)); + + struct stat stat; + EXPECT_THAT(fstat(fd1.get(), &stat), SyscallSucceeds()); + EXPECT_EQ(stat.st_size, 0); +} + +// If we don't have read permission on the file, opening with +// O_TRUNC should fail. +TEST_F(OpenTest, CanTruncateReadOnlyNoWritePermission) { + // Drop capabilities that allow us to override file permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + + const DisableSave ds; // Permissions are dropped. + ASSERT_THAT(chmod(test_file_name_.c_str(), S_IRUSR | S_IRGRP), + SyscallSucceeds()); + + ASSERT_THAT(open(test_file_name_.c_str(), O_RDONLY | O_TRUNC), + SyscallFailsWithErrno(EACCES)); + + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); + + struct stat stat; + EXPECT_THAT(fstat(fd1.get(), &stat), SyscallSucceeds()); + EXPECT_EQ(stat.st_size, test_data_.size()); +} + +// If we don't have read permission but have write permission, opening O_WRONLY +// and O_TRUNC should succeed. 
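+// In other words, the permission that matters for O_TRUNC is write permission
+// on the file itself, independent of the access mode requested for the
+// descriptor (modes shown are illustrative; the test toggles them via fchmod):
+//
+//   open(path_with_mode_0220, O_WRONLY | O_TRUNC)  // succeeds, file emptied
+//   open(path_with_mode_0440, O_RDONLY | O_TRUNC)  // EACCES (previous test)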
+TEST_F(OpenTest, CanTruncateWriteOnlyNoReadPermission) { + const DisableSave ds; // Permissions are dropped. + + EXPECT_THAT(fchmod(test_file_fd_.get(), S_IWUSR | S_IWGRP), + SyscallSucceeds()); + + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_WRONLY | O_TRUNC)); + + EXPECT_THAT(fchmod(test_file_fd_.get(), S_IRUSR | S_IRGRP), + SyscallSucceeds()); + + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); + + struct stat stat; + EXPECT_THAT(fstat(fd2.get(), &stat), SyscallSucceeds()); + EXPECT_EQ(stat.st_size, 0); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc new file mode 100644 index 000000000..b2cbd63d1 --- /dev/null +++ b/test/syscalls/linux/open_create.cc @@ -0,0 +1,130 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/temp_umask.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { +TEST(CreateTest, TmpFile) { + int fd; + EXPECT_THAT(fd = open(JoinPath(GetAbsoluteTestTmpdir(), "a").c_str(), + O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(CreateTest, ExistingFile) { + int fd; + EXPECT_THAT( + fd = open(JoinPath(GetAbsoluteTestTmpdir(), "ExistingFile").c_str(), + O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + EXPECT_THAT( + fd = open(JoinPath(GetAbsoluteTestTmpdir(), "ExistingFile").c_str(), + O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(CreateTest, CreateAtFile) { + int dirfd; + EXPECT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_DIRECTORY, 0666), + SyscallSucceeds()); + EXPECT_THAT(openat(dirfd, "CreateAtFile", O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(dirfd), SyscallSucceeds()); +} + +TEST(CreateTest, HonorsUmask_NoRandomSave) { + const DisableSave ds; // file cannot be re-opened as writable. 
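+  // The mode given to open() is filtered through the process umask:
+  // effective mode = requested mode & ~umask. With a umask of 0222 the
+  // requested 0666 becomes 0666 & ~0222 = 0444, which is what the fstat()
+  // check below expects (and why the file can no longer be reopened writable,
+  // hence the DisableSave above).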
+ TempUmask mask(0222); + int fd; + ASSERT_THAT( + fd = open(JoinPath(GetAbsoluteTestTmpdir(), "UmaskedFile").c_str(), + O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + struct stat statbuf; + ASSERT_THAT(fstat(fd, &statbuf), SyscallSucceeds()); + EXPECT_EQ(0444, statbuf.st_mode & 0777); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(CreateTest, CreateExclusively) { + std::string filename = NewTempAbsPath(); + + int fd; + ASSERT_THAT(fd = open(filename.c_str(), O_CREAT | O_RDWR, 0644), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + EXPECT_THAT(open(filename.c_str(), O_CREAT | O_EXCL | O_RDWR, 0644), + SyscallFailsWithErrno(EEXIST)); +} + +TEST(CreateTest, CreateFailsOnUnpermittedDir) { + // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to + // always override directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_THAT(open("/foo", O_CREAT | O_RDWR, 0644), + SyscallFailsWithErrno(EACCES)); +} + +TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) { + // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to + // always override directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + auto parent = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555)); + auto file = JoinPath(parent.path(), "foo"); + ASSERT_THAT(open(file.c_str(), O_CREAT | O_RDWR, 0644), + SyscallFailsWithErrno(EACCES)); +} + +// A file originally created RW, but opened RO can later be opened RW. +TEST(CreateTest, OpenCreateROThenRW) { + TempPath file(NewTempAbsPath()); + + // Create a RW file, but only open it RO. + FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE( + Open(file.path(), O_CREAT | O_EXCL | O_RDONLY, 0644)); + + // Now get a RW FD. + FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + // fd1 is not writable, but fd2 is. + char c = 'a'; + EXPECT_THAT(WriteFd(fd1.get(), &c, 1), SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(WriteFd(fd2.get(), &c, 1), SyscallSucceedsWithValue(1)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc new file mode 100644 index 000000000..073a6b8c1 --- /dev/null +++ b/test/syscalls/linux/partial_bad_buffer.cc @@ -0,0 +1,305 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +using ::testing::Gt; + +namespace gvisor { +namespace testing { + +namespace { + +constexpr char kMessage[] = "hello world"; + +// PartialBadBufferTest checks the result of various IO syscalls when passed a +// buffer that does not have the space specified in the syscall (most of it is +// PROT_NONE). Linux is annoyingly inconsistent among different syscalls, so we +// test all of them. 
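+//
+// The fixture arranges two anonymous pages and then revokes access to the
+// second, leaving exactly one addressable byte at the end of the first:
+//
+//   [ page 0: PROT_READ|PROT_WRITE ][ page 1: PROT_NONE ]
+//                                  ^
+//                            bad_buffer_ (last byte of page 0)
+//
+// so a syscall told it may use kPageSize (or even 10) bytes starting at
+// bad_buffer_ can only legally touch a single byte before faulting.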
+class PartialBadBufferTest : public ::testing::Test { + protected: + void SetUp() override { + // Create and open a directory for getdents cases. + directory_ = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_THAT( + directory_fd_ = open(directory_.path().c_str(), O_RDONLY | O_DIRECTORY), + SyscallSucceeds()); + + // Create and open a normal file, placing it in the directory + // so the getdents cases have some dirents. + name_ = JoinPath(directory_.path(), "a"); + ASSERT_THAT(fd_ = open(name_.c_str(), O_RDWR | O_CREAT, 0644), + SyscallSucceeds()); + + // Write some initial data. + size_t size = sizeof(kMessage) - 1; + EXPECT_THAT(WriteFd(fd_, &kMessage, size), SyscallSucceedsWithValue(size)); + + ASSERT_THAT(lseek(fd_, 0, SEEK_SET), SyscallSucceeds()); + + addr_ = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(addr_, MAP_FAILED); + char* buf = reinterpret_cast(addr_); + + // Guard page for our read to run into. + ASSERT_THAT(mprotect(reinterpret_cast(buf + kPageSize), kPageSize, + PROT_NONE), + SyscallSucceeds()); + + // Leave only one free byte in the buffer. + bad_buffer_ = buf + kPageSize - 1; + } + + void TearDown() override { + EXPECT_THAT(munmap(addr_, 2 * kPageSize), SyscallSucceeds()) << addr_; + EXPECT_THAT(close(fd_), SyscallSucceeds()); + EXPECT_THAT(unlink(name_.c_str()), SyscallSucceeds()); + EXPECT_THAT(close(directory_fd_), SyscallSucceeds()); + } + + // Return buffer with n bytes of free space. + // N.B. this is the same buffer used to back bad_buffer_. + char* FreeBytes(size_t n) { + TEST_CHECK(n <= static_cast(4096)); + return reinterpret_cast(addr_) + kPageSize - n; + } + + std::string name_; + int fd_; + TempPath directory_; + int directory_fd_; + void* addr_; + char* bad_buffer_; +}; + +// We do both "big" and "small" tests to try to hit the "zero copy" and +// non-"zero copy" paths, which have different code paths for handling faults. 
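+//
+// For reads the expected behavior is a short transfer: if at least one byte
+// can be copied out before the fault, the syscall reports that many bytes
+// rather than failing, e.g. read(fd_, bad_buffer_, kPageSize) == 1 with
+// bad_buffer_[0] == 'h'; EFAULT only surfaces when nothing can be copied at
+// all. Writes are stricter in these tests and are expected to fail with
+// EFAULT outright (see the FIXME-guarded cases below for where gVisor
+// currently diverges).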
+ +TEST_F(PartialBadBufferTest, ReadBig) { + EXPECT_THAT(RetryEINTR(read)(fd_, bad_buffer_, kPageSize), + SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, ReadSmall) { + EXPECT_THAT(RetryEINTR(read)(fd_, bad_buffer_, 10), + SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, PreadBig) { + EXPECT_THAT(RetryEINTR(pread)(fd_, bad_buffer_, kPageSize, 0), + SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, PreadSmall) { + EXPECT_THAT(RetryEINTR(pread)(fd_, bad_buffer_, 10, 0), + SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, ReadvBig) { + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = kPageSize; + + EXPECT_THAT(RetryEINTR(readv)(fd_, &vec, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, ReadvSmall) { + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = 10; + + EXPECT_THAT(RetryEINTR(readv)(fd_, &vec, 1), SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, PreadvBig) { + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = kPageSize; + + EXPECT_THAT(RetryEINTR(preadv)(fd_, &vec, 1, 0), SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, PreadvSmall) { + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = 10; + + EXPECT_THAT(RetryEINTR(preadv)(fd_, &vec, 1, 0), SyscallSucceedsWithValue(1)); + EXPECT_EQ('h', bad_buffer_[0]); +} + +TEST_F(PartialBadBufferTest, WriteBig) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, kPageSize), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(PartialBadBufferTest, WriteSmall) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(PartialBadBufferTest, PwriteBig) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, 0), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(PartialBadBufferTest, PwriteSmall) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, 10, 0), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(PartialBadBufferTest, WritevBig) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. 
+ SKIP_IF(IsRunningOnGvisor()); + + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = kPageSize; + + EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(PartialBadBufferTest, WritevSmall) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = 10; + + EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(PartialBadBufferTest, PwritevBig) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = kPageSize; + + EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(PartialBadBufferTest, PwritevSmall) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + struct iovec vec; + vec.iov_base = bad_buffer_; + vec.iov_len = 10; + + EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0), + SyscallFailsWithErrno(EFAULT)); +} + +// getdents returns EFAULT when the you claim the buffer is large enough, but +// it actually isn't. +TEST_F(PartialBadBufferTest, GetdentsBig) { + EXPECT_THAT(RetryEINTR(syscall)(SYS_getdents64, directory_fd_, bad_buffer_, + kPageSize), + SyscallFailsWithErrno(EFAULT)); +} + +// getdents returns EINVAL when the you claim the buffer is too small. +TEST_F(PartialBadBufferTest, GetdentsSmall) { + EXPECT_THAT( + RetryEINTR(syscall)(SYS_getdents64, directory_fd_, bad_buffer_, 10), + SyscallFailsWithErrno(EINVAL)); +} + +// getdents will write entries into a buffer if there is space before it faults. +TEST_F(PartialBadBufferTest, GetdentsOneEntry) { + // 30 bytes is enough for one (small) entry. + char* buf = FreeBytes(30); + + EXPECT_THAT( + RetryEINTR(syscall)(SYS_getdents64, directory_fd_, buf, kPageSize), + SyscallSucceedsWithValue(Gt(0))); +} + +// Verify that when write returns EFAULT the kernel hasn't silently written +// the initial valid bytes. +TEST_F(PartialBadBufferTest, WriteEfaultIsntPartial) { + // FIXME: The sentry write syscalls will return immediately + // if Access returns an error, but Access may not return an error + // and the sentry will instead perform a partial write. + SKIP_IF(IsRunningOnGvisor()); + + bad_buffer_[0] = 'A'; + EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10), + SyscallFailsWithErrno(EFAULT)); + + size_t size = 255; + char buf[255]; + memset(buf, 0, size); + + EXPECT_THAT(RetryEINTR(pread)(fd_, buf, size, 0), + SyscallSucceedsWithValue(sizeof(kMessage) - 1)); + + // 'A' has not been written. + EXPECT_STREQ(buf, kMessage); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/pause.cc b/test/syscalls/linux/pause.cc new file mode 100644 index 000000000..4e1148c24 --- /dev/null +++ b/test/syscalls/linux/pause.cc @@ -0,0 +1,88 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +void NoopSignalHandler(int sig, siginfo_t* info, void* context) {} + +} // namespace + +TEST(PauseTest, OnlyReturnsWhenSignalHandled) { + struct sigaction sa; + sigfillset(&sa.sa_mask); + + // Ensure that SIGUSR1 is ignored. + sa.sa_handler = SIG_IGN; + ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds()); + + // Register a handler for SIGUSR2. + sa.sa_sigaction = NoopSignalHandler; + sa.sa_flags = SA_SIGINFO; + ASSERT_THAT(sigaction(SIGUSR2, &sa, nullptr), SyscallSucceeds()); + + // The child sets their own tid. + absl::Mutex mu; + pid_t child_tid = 0; + bool child_tid_available = false; + std::atomic sent_signal{0}; + std::atomic waking_signal{0}; + ScopedThread t([&] { + mu.Lock(); + child_tid = gettid(); + child_tid_available = true; + mu.Unlock(); + EXPECT_THAT(pause(), SyscallFailsWithErrno(EINTR)); + waking_signal.store(sent_signal.load()); + }); + mu.Lock(); + mu.Await(absl::Condition(&child_tid_available)); + mu.Unlock(); + + // Wait a bit to let the child enter pause(). + absl::SleepFor(absl::Seconds(3)); + + // The child should not be woken by SIGUSR1. + sent_signal.store(SIGUSR1); + ASSERT_THAT(tgkill(getpid(), child_tid, SIGUSR1), SyscallSucceeds()); + absl::SleepFor(absl::Seconds(3)); + + // The child should be woken by SIGUSR2. + sent_signal.store(SIGUSR2); + ASSERT_THAT(tgkill(getpid(), child_tid, SIGUSR2), SyscallSucceeds()); + absl::SleepFor(absl::Seconds(3)); + + EXPECT_EQ(SIGUSR2, waking_signal.load()); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc new file mode 100644 index 000000000..4731157e8 --- /dev/null +++ b/test/syscalls/linux/pipe.cc @@ -0,0 +1,480 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
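The pause.cc test above pins down the core contract of pause(2): it only returns, with -1 and errno EINTR, once a signal that has a handler is actually caught, while ignored signals leave it sleeping. A stripped-down single-process sketch of the same contract, using alarm(2) in place of a second thread and tgkill(2):

#include <signal.h>
#include <unistd.h>
#include <cassert>
#include <cerrno>

namespace {
void Handler(int) {}  // Having a handler run is what ends pause().
}  // namespace

int main() {
  // Installing SIG_IGN here instead of a handler would leave pause()
  // blocked forever, which is exactly what the SIGUSR1 half of the test
  // above checks.
  signal(SIGALRM, Handler);
  alarm(1);           // Deliver SIGALRM in one second.
  int ret = pause();  // Sleeps until the handler has returned.
  assert(ret == -1 && errno == EINTR);
  return 0;
}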
+ +#include /* Obtain O_* constant definitions */ +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Buffer size of a pipe. +// +// TODO: Get this from F_GETPIPE_SZ. +constexpr int kPipeSize = 65536; + +class PipeTest : public ::testing::Test { + public: + static void SetUpTestCase() { + // Tests intentionally generate SIGPIPE. + TEST_PCHECK(signal(SIGPIPE, SIG_IGN) != SIG_ERR); + } + + static void TearDownTestCase() { + TEST_PCHECK(signal(SIGPIPE, SIG_DFL) != SIG_ERR); + } +}; + +TEST_F(PipeTest, Basic) { + // fds[0] is read end, fds[1] is write end. + int fds[2]; + int i = 0x12345678; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + // Ensure that the inode number is the same for each end. + struct stat rst; + ASSERT_THAT(fstat(fds[0], &rst), SyscallSucceeds()); + struct stat wst; + ASSERT_THAT(fstat(fds[1], &wst), SyscallSucceeds()); + EXPECT_EQ(rst.st_ino, wst.st_ino); + + ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + int j; + ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); + EXPECT_EQ(i, j); + + ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(fds[1], F_GETFL), SyscallSucceedsWithValue(O_WRONLY)); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, BasicCloExec) { + // fds[0] is read end, fds[1] is write end. + int fds[2]; + int i = 0x12345678; + ASSERT_THAT(pipe2(fds, O_CLOEXEC), SyscallSucceeds()); + + ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + int j; + ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); + EXPECT_EQ(i, j); + + ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(fds[1], F_GETFL), SyscallSucceeds()); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, BasicNoBlock) { + // fds[0] is read end, fds[1] is write end. 
+ int fds[2]; + int i = 0x12345678; + ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds()); + + ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + + ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + int j; + ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); + EXPECT_EQ(i, j); + ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); + + ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceedsWithValue(O_NONBLOCK)); + ASSERT_THAT(fcntl(fds[1], F_GETFL), + SyscallSucceedsWithValue(O_NONBLOCK | O_WRONLY)); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, BasicBothOptions) { + // fds[0] is read end, fds[1] is write end. + int fds[2]; + int i = 0x12345678; + ASSERT_THAT(pipe2(fds, O_NONBLOCK | O_CLOEXEC), SyscallSucceeds()); + + ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + + ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + int j; + ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); + EXPECT_EQ(i, j); + ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); + + ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceedsWithValue(O_NONBLOCK)); + ASSERT_THAT(fcntl(fds[1], F_GETFL), + SyscallSucceedsWithValue(O_NONBLOCK | O_WRONLY)); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, BasicBadOptions) { + int fds[2]; + ASSERT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(PipeTest, Seek) { + // fds[0] is read end, fds[1] is write end. 
+ int fds[2]; + int i = 0x12345678; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + ASSERT_THAT(lseek(fds[0], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[0], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[0], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + + ASSERT_THAT(lseek(fds[0], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[0], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + int j; + + ASSERT_THAT(lseek(fds[0], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[0], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + + ASSERT_THAT(lseek(fds[0], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[0], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + ASSERT_THAT(lseek(fds[1], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + + ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); + EXPECT_EQ(i, j); + + ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(fds[1], F_GETFL), SyscallSucceedsWithValue(O_WRONLY)); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, AbsoluteOffsetSyscallsFail) { + // Syscalls for IO at absolute offsets fail because pipes are not seekable. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + std::vector buf(4096); + struct iovec iov; + + EXPECT_THAT(pread(fds[1], buf.data(), buf.size(), 0), + SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(pwrite(fds[0], buf.data(), buf.size(), 0), + SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(preadv(fds[1], &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(pwritev(fds[0], &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); + + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, WriterSideCloses) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + int rfd = fds[0]; + int i = 123; + ScopedThread t([rfd]() { + int j; + ASSERT_THAT(read(rfd, &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); + // This will return when the close() completes. + ASSERT_THAT(read(rfd, &j, sizeof(j)), SyscallSucceeds()); + // This will return straight away. + ASSERT_THAT(read(rfd, &j, sizeof(j)), SyscallSucceeds()); + }); + // Sleep a bit so the thread can block. + absl::SleepFor(absl::Seconds(1.0)); + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + // Sleep a bit so the thread can block again. 
+ absl::SleepFor(absl::Seconds(3.0)); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); + t.Join(); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); +} + +TEST_F(PipeTest, WriterSideClosesReadDataFirst) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + int i = 123; + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); + int j; + ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); + ASSERT_EQ(j, i); + ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceeds()); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); +} + +TEST_F(PipeTest, ReaderSideCloses) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + int i = 123; + ASSERT_THAT(write(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EPIPE)); + + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, CloseTwice) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); + ASSERT_THAT(close(fds[0]), SyscallFailsWithErrno(EBADF)); + ASSERT_THAT(close(fds[1]), SyscallFailsWithErrno(EBADF)); + + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(fds[0]), SyscallFailsWithErrno(EBADF)); + ASSERT_THAT(close(fds[1]), SyscallFailsWithErrno(EBADF)); +} + +// Blocking write returns EPIPE when read end is closed if nothing has been +// written. +TEST_F(PipeTest, BlockWriteClosed) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + int wfd = fds[1]; + + absl::Notification notify; + ScopedThread t([wfd, ¬ify]() { + std::vector buf(kPipeSize); + // Exactly fill the pipe buffer. + ASSERT_THAT(WriteFd(wfd, buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + notify.Notify(); + + // Attempt to write one more byte. Blocks. + // N.B. Don't use WriteFd, we don't want a retry. + ASSERT_THAT(write(wfd, buf.data(), 1), SyscallFailsWithErrno(EPIPE)); + }); + + notify.WaitForNotification(); + absl::SleepFor(absl::Seconds(1.0)); + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + + t.Join(); + + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +// Blocking write returns EPIPE when read end is closed even if something has +// been written. +// +// FIXME: Pipe writes blocking early allows S/R to interrupt the +// write(2) call before the buffer is full. Then the next call will will return +// non-zero instead of EPIPE. +TEST_F(PipeTest, BlockPartialWriteClosed_NoRandomSave) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + int wfd = fds[1]; + + ScopedThread t([wfd]() { + std::vector buf(2 * kPipeSize); + // Write more than fits in the buffer. Blocks then returns partial write + // when the other end is closed. The next call returns EPIPE. + if (IsRunningOnGvisor()) { + // FIXME: Pipe writes block early on gVisor, resulting in a + // shorter than expected partial write. + ASSERT_THAT(write(wfd, buf.data(), buf.size()), + SyscallSucceedsWithValue(::testing::Gt(0))); + } else { + ASSERT_THAT(write(wfd, buf.data(), buf.size()), + SyscallSucceedsWithValue(kPipeSize)); + } + ASSERT_THAT(write(wfd, buf.data(), buf.size()), + SyscallFailsWithErrno(EPIPE)); + }); + + // Leave time for write to become blocked. 
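The kPipeSize constant these write tests rely on is hard-coded to 64 KiB; the TODO next to its definition already points at F_GETPIPE_SZ as the better source. On Linux 2.6.35 and later the capacity of a particular pipe can be queried, and resized, through fcntl(2), roughly as follows; this is a sketch, not something the tests currently do:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // For F_GETPIPE_SZ / F_SETPIPE_SZ.
#endif
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main() {
  int fds[2];
  if (pipe(fds) != 0) return 1;
  // The default capacity is 16 pages, i.e. 64 KiB on x86.
  std::printf("capacity: %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
  // The capacity can be raised, subject to /proc/sys/fs/pipe-max-size.
  if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) != -1) {
    std::printf("resized to: %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
  }
  close(fds[0]);
  close(fds[1]);
  return 0;
}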
+ absl::SleepFor(absl::Seconds(1.0)); + + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + + t.Join(); + + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, ReadFromClosedFd_NoRandomSave) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + int rfd = fds[0]; + absl::Notification notify; + ScopedThread t([rfd, ¬ify]() { + int f; + notify.Notify(); + ASSERT_THAT(read(rfd, &f, sizeof(f)), SyscallSucceedsWithValue(sizeof(f))); + ASSERT_EQ(123, f); + }); + notify.WaitForNotification(); + // Make sure that the thread gets to read(). + absl::SleepFor(absl::Seconds(5.0)); + { + // We cannot save/restore here as the read end of pipe is closed but there + // is ongoing read() above. We will not be able to restart the read() + // successfully in restore run since the read fd is closed. + const DisableSave ds; + ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + int i = 123; + ASSERT_THAT(write(fds[1], &i, sizeof(i)), + SyscallSucceedsWithValue(sizeof(i))); + t.Join(); + } + ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(PipeTest, FionRead) { + // fds[0] is read end, fds[1] is write end. + int fds[2]; + int data[2] = {0x12345678, 0x9101112}; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + int n = -1; + EXPECT_THAT(ioctl(fds[0], FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + n = -1; + EXPECT_THAT(ioctl(fds[1], FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + EXPECT_THAT(write(fds[1], data, sizeof(data)), + SyscallSucceedsWithValue(sizeof(data))); + + n = -1; + EXPECT_THAT(ioctl(fds[0], FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, sizeof(data)); + n = -1; + EXPECT_THAT(ioctl(fds[1], FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, sizeof(data)); +} + +// Test that opening an empty anonymous pipe RDONLY via /proc/self/fd/N does not +// block waiting for a writer. +TEST_F(PipeTest, OpenViaProcSelfFD) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + FileDescriptor rfd(fds[0]); + FileDescriptor wfd(fds[1]); + + // Close the write end of the pipe. + wfd.release(); + + // Open other side via /proc/self/fd. It should not block. + FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(absl::StrCat("/proc/self/fd/", fds[0]), O_RDONLY)); +} + +// Test that opening and reading from an anonymous pipe (with existing writes) +// RDONLY via /proc/self/fd/N returns the existing data. +TEST_F(PipeTest, OpenViaProcSelfFDWithWrites) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + FileDescriptor rfd(fds[0]); + FileDescriptor wfd(fds[1]); + + // Write to the pipe and then close the write fd. + char data = 'x'; + ASSERT_THAT(write(fds[1], &data, 1), SyscallSucceedsWithValue(1)); + wfd.release(); + + // Open read side via /proc/self/fd, and read from it. + FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(absl::StrCat("/proc/self/fd/", fds[0]), O_RDONLY)); + char got; + ASSERT_THAT(read(proc_self_fd.get(), &got, 1), SyscallSucceedsWithValue(1)); + + // We should get what we sent. + EXPECT_EQ(got, data); +} + +TEST_F(PipeTest, LargeFile) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + FileDescriptor rfd(fds[0]); + FileDescriptor wfd(fds[1]); + + int rflags; + EXPECT_THAT(rflags = fcntl(rfd.get(), F_GETFL), SyscallSucceeds()); + + // The kernel did *not* set O_LARGEFILE. + EXPECT_EQ(rflags, 0); +} + +// Test that accessing /proc//fd/ correctly decrements the refcount of +// that file descriptor. 
+TEST_F(PipeTest, ProcFDReleasesFile) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + FileDescriptor rfd(fds[0]); + FileDescriptor wfd(fds[1]); + + // Stat the pipe FD, which shouldn't alter the refcount of the write end of + // the pipe. + struct stat wst; + ASSERT_THAT(lstat(absl::StrCat("/proc/self/fd/", wfd.get()).c_str(), &wst), + SyscallSucceeds()); + + // Close the write end of the pipe and ensure that read indicates EOF. + wfd.reset(); + char buf; + ASSERT_THAT(read(rfd.get(), &buf, 1), SyscallSucceedsWithValue(0)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc new file mode 100644 index 000000000..897fd0bec --- /dev/null +++ b/test/syscalls/linux/poll.cc @@ -0,0 +1,279 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/base_poll_test.h" +#include "test/util/file_descriptor.h" +#include "test/util/logging.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { +namespace { + +class PollTest : public BasePollTest { + protected: + void SetUp() override { BasePollTest::SetUp(); } + void TearDown() override { BasePollTest::TearDown(); } +}; + +TEST_F(PollTest, InvalidFds) { + // fds is invalid because it's null, but we tell ppoll the length is non-zero. + EXPECT_THAT(poll(nullptr, 1, 1), SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT(poll(nullptr, -1, 1), SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(PollTest, NullFds) { + EXPECT_THAT(poll(nullptr, 0, 10), SyscallSucceeds()); +} + +TEST_F(PollTest, ZeroTimeout) { + EXPECT_THAT(poll(nullptr, 0, 0), SyscallSucceeds()); +} + +// If random S/R interrupts the poll, SIGALRM may be delivered before poll +// restarts, causing the poll to hang forever. +TEST_F(PollTest, NegativeTimeout_NoRandomSave) { + // Negative timeout mean wait forever so set a timer. + SetTimer(absl::Milliseconds(100)); + EXPECT_THAT(poll(nullptr, 0, -1), SyscallFailsWithErrno(EINTR)); + EXPECT_TRUE(TimerFired()); +} + +TEST_F(PollTest, NonBlockingEventPOLLIN) { + // Create a pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + FileDescriptor fd0(fds[0]); + FileDescriptor fd1(fds[1]); + + // Write some data to the pipe. + char s[] = "foo\n"; + ASSERT_THAT(WriteFd(fd1.get(), s, strlen(s) + 1), SyscallSucceeds()); + + // Poll on the reader fd with POLLIN event. + struct pollfd poll_fd = {fd0.get(), POLLIN, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1)); + + // Should trigger POLLIN event. + EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN); +} + +TEST_F(PollTest, BlockingEventPOLLIN) { + // Create a pipe. 
+ int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + FileDescriptor fd0(fds[0]); + FileDescriptor fd1(fds[1]); + + // Start a blocking poll on the read fd. + absl::Notification notify; + ScopedThread t([&fd0, ¬ify]() { + notify.Notify(); + + // Poll on the reader fd with POLLIN event. + struct pollfd poll_fd = {fd0.get(), POLLIN, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, -1), SyscallSucceedsWithValue(1)); + + // Should trigger POLLIN event. + EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN); + }); + + notify.WaitForNotification(); + absl::SleepFor(absl::Seconds(1.0)); + + // Write some data to the pipe. + char s[] = "foo\n"; + ASSERT_THAT(WriteFd(fd1.get(), s, strlen(s) + 1), SyscallSucceeds()); +} + +TEST_F(PollTest, NonBlockingEventPOLLHUP) { + // Create a pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + FileDescriptor fd0(fds[0]); + FileDescriptor fd1(fds[1]); + + // Close the writer fd. + fd1.reset(); + + // Poll on the reader fd with POLLIN event. + struct pollfd poll_fd = {fd0.get(), POLLIN, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1)); + + // Should trigger POLLHUP event. + EXPECT_EQ(poll_fd.revents & POLLHUP, POLLHUP); + + // Should not trigger POLLIN event. + EXPECT_EQ(poll_fd.revents & POLLIN, 0); +} + +TEST_F(PollTest, BlockingEventPOLLHUP) { + // Create a pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + FileDescriptor fd0(fds[0]); + FileDescriptor fd1(fds[1]); + + // Start a blocking poll on the read fd. + absl::Notification notify; + ScopedThread t([&fd0, ¬ify]() { + notify.Notify(); + + // Poll on the reader fd with POLLIN event. + struct pollfd poll_fd = {fd0.get(), POLLIN, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, -1), SyscallSucceedsWithValue(1)); + + // Should trigger POLLHUP event. + EXPECT_EQ(poll_fd.revents & POLLHUP, POLLHUP); + + // Should not trigger POLLIN event. + EXPECT_EQ(poll_fd.revents & POLLIN, 0); + }); + + notify.WaitForNotification(); + absl::SleepFor(absl::Seconds(1.0)); + + // Write some data and close the writer fd. + fd1.reset(); +} + +TEST_F(PollTest, NonBlockingEventPOLLERR) { + // Create a pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + FileDescriptor fd0(fds[0]); + FileDescriptor fd1(fds[1]); + + // Close the reader fd. + fd0.reset(); + + // Poll on the writer fd with POLLOUT event. + struct pollfd poll_fd = {fd1.get(), POLLOUT, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1)); + + // Should trigger POLLERR event. + EXPECT_EQ(poll_fd.revents & POLLERR, POLLERR); + + // Should also trigger POLLOUT event. + EXPECT_EQ(poll_fd.revents & POLLOUT, POLLOUT); +} + +// This test will validate that if an FD is already ready on some event, whether +// it's POLLIN or POLLOUT it will not immediately return unless that's actually +// what the caller was interested in. +TEST_F(PollTest, ImmediatelyReturnOnlyOnPollEvents) { + // Create a pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + FileDescriptor fd0(fds[0]); + FileDescriptor fd1(fds[1]); + + // Wait for read related event on the write side of the pipe, since a write + // is possible on fds[1] it would mean that POLLOUT would return immediately. + // We should make sure that we're not woken up with that state that we didn't + // specificially request. 
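As the NonBlockingEventPOLLHUP and NonBlockingEventPOLLERR cases above show, and as the comment above spells out for the opposite direction, poll(2) reports readiness only for the events the caller requested, while POLLERR, POLLHUP and POLLNVAL are output-only bits that appear in revents whether or not they were asked for. A small sketch of that asymmetry on a pipe whose read end has been closed:

#include <poll.h>
#include <unistd.h>
#include <cassert>

int main() {
  int fds[2];
  if (pipe(fds) != 0) return 1;
  close(fds[0]);  // Broken pipe: the write end is now in an error state.

  // Request no events at all. POLLERR is reported anyway because the error
  // conditions cannot be masked out via the events field.
  struct pollfd pfd = {fds[1], 0, 0};
  assert(poll(&pfd, 1, 0) == 1);
  assert((pfd.revents & POLLERR) != 0);

  // POLLOUT, by contrast, only appears once it has been requested.
  assert((pfd.revents & POLLOUT) == 0);
  close(fds[1]);
  return 0;
}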
+ constexpr int kTimeoutMs = 100; + struct pollfd poll_fd = {fd1.get(), POLLIN | POLLPRI | POLLRDHUP, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, kTimeoutMs), + SyscallSucceedsWithValue(0)); // We should timeout. + EXPECT_EQ(poll_fd.revents, 0); // Nothing should be in returned events. + + // Now let's poll on POLLOUT and we should get back 1 fd as being ready and + // it should contain POLLOUT in the revents. + poll_fd.events = POLLOUT; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, kTimeoutMs), + SyscallSucceedsWithValue(1)); // 1 fd should have an event. + EXPECT_EQ(poll_fd.revents, POLLOUT); // POLLOUT should be in revents. +} + +// This test validates that poll(2) while data is available immediately returns. +TEST_F(PollTest, PollLevelTriggered) { + int fds[2] = {}; + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, /*protocol=*/0, fds), + SyscallSucceeds()); + + FileDescriptor fd0(fds[0]); + FileDescriptor fd1(fds[1]); + + // Write two bytes to the socket. + const char* kBuf = "aa"; + ASSERT_THAT(RetryEINTR(send)(fd0.get(), kBuf, /*len=*/2, /*flags=*/0), + SyscallSucceedsWithValue(2)); // 2 bytes should be written. + + // Poll(2) should immediately return as there is data available to read. + constexpr int kInfiniteTimeout = -1; + struct pollfd poll_fd = {fd1.get(), POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, /*nfds=*/1, kInfiniteTimeout), + SyscallSucceedsWithValue(1)); // 1 fd should be ready to read. + EXPECT_NE(poll_fd.revents & POLLIN, 0); + + // Read a single byte. + char read_byte = 0; + ASSERT_THAT(RetryEINTR(recv)(fd1.get(), &read_byte, /*len=*/1, /*flags=*/0), + SyscallSucceedsWithValue(1)); // 1 byte should be read. + ASSERT_EQ(read_byte, 'a'); // We should have read a single 'a'. + + // Create a separate pollfd for our second poll. + struct pollfd poll_fd_after = {fd1.get(), POLLIN, 0}; + + // Poll(2) should again immediately return since we only read one byte. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd_after, /*nfds=*/1, kInfiniteTimeout), + SyscallSucceedsWithValue(1)); // 1 fd should be ready to read. + EXPECT_NE(poll_fd_after.revents & POLLIN, 0); +} + +TEST_F(PollTest, Nfds) { + // Stash value of RLIMIT_NOFILES. + struct rlimit rlim; + TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0); + rlim_t max_fds = rlim.rlim_cur; + + // Create the biggest possible pollfd array such that each element is valid. + // + // Each entry in the 'fds' array refers to stdout (fd=1) and polls for + // "writable" events (events=POLLOUT). This essentially guarantees that the + // poll() is a no-op and allows negative testing of the 'nfds' parameter. + std::vector fds(max_fds, {.fd = 1, .events = POLLOUT}); + + // Verify that 'nfds' up to RLIMIT_NOFILE are allowed. + EXPECT_THAT(RetryEINTR(poll)(fds.data(), 1, 1), SyscallSucceedsWithValue(1)); + EXPECT_THAT(RetryEINTR(poll)(fds.data(), max_fds / 2, 1), + SyscallSucceedsWithValue(max_fds / 2)); + EXPECT_THAT(RetryEINTR(poll)(fds.data(), max_fds, 1), + SyscallSucceedsWithValue(max_fds)); + + // If 'nfds' exceeds RLIMIT_NOFILE then it must fail with EINVAL. 
+ EXPECT_THAT(poll(fds.data(), max_fds + 1, 1), SyscallFailsWithErrno(EINVAL)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/ppoll.cc b/test/syscalls/linux/ppoll.cc new file mode 100644 index 000000000..f8c388c00 --- /dev/null +++ b/test/syscalls/linux/ppoll.cc @@ -0,0 +1,155 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/base_poll_test.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +// Linux and glibc have a different idea of the sizeof sigset_t. When calling +// the syscall directly, use what the kernel expects. +unsigned kSigsetSize = SIGRTMAX / 8; + +// Linux ppoll(2) differs from the glibc wrapper function in that Linux updates +// the timeout with the amount of time remaining. In order to test this behavior +// we need to use the syscall directly. +int syscallPpoll(struct pollfd* fds, nfds_t nfds, struct timespec* timeout_ts, + const sigset_t* sigmask, unsigned mask_size) { + return syscall(SYS_ppoll, fds, nfds, timeout_ts, sigmask, mask_size); +} + +class PpollTest : public BasePollTest { + protected: + void SetUp() override { BasePollTest::SetUp(); } + void TearDown() override { BasePollTest::TearDown(); } +}; + +TEST_F(PpollTest, InvalidFds) { + // fds is invalid because it's null, but we tell ppoll the length is non-zero. + struct timespec timeout = {}; + sigset_t sigmask; + TEST_PCHECK(sigemptyset(&sigmask) == 0); + EXPECT_THAT(syscallPpoll(nullptr, 1, &timeout, &sigmask, kSigsetSize), + SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT(syscallPpoll(nullptr, -1, &timeout, &sigmask, kSigsetSize), + SyscallFailsWithErrno(EINVAL)); +} + +// See that when fds is null, ppoll behaves like sleep. +TEST_F(PpollTest, NullFds) { + struct timespec timeout = absl::ToTimespec(absl::Milliseconds(10)); + ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0), + SyscallSucceeds()); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_nsec, 0); +} + +TEST_F(PpollTest, ZeroTimeout) { + struct timespec timeout = {}; + ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0), + SyscallSucceeds()); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_nsec, 0); +} + +// If random S/R interrupts the ppoll, SIGALRM may be delivered before ppoll +// restarts, causing the ppoll to hang forever. +TEST_F(PpollTest, NoTimeout_NoRandomSave) { + // When there's no timeout, ppoll may never return so set a timer. + SetTimer(absl::Milliseconds(100)); + // See that we get interrupted by the timer. 
+ ASSERT_THAT(syscallPpoll(nullptr, 0, nullptr, nullptr, 0), + SyscallFailsWithErrno(EINTR)); + EXPECT_TRUE(TimerFired()); +} + +TEST_F(PpollTest, InvalidTimeoutNegative) { + struct timespec timeout = absl::ToTimespec(absl::Nanoseconds(-1)); + EXPECT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(PpollTest, InvalidTimeoutNotNormalized) { + struct timespec timeout = {0, 1000000001}; + EXPECT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(PpollTest, InvalidMaskSize) { + struct timespec timeout = {}; + sigset_t sigmask; + TEST_PCHECK(sigemptyset(&sigmask) == 0); + EXPECT_THAT(syscallPpoll(nullptr, 0, &timeout, &sigmask, 128), + SyscallFailsWithErrno(EINVAL)); +} + +// Verify that signals blocked by the ppoll mask (that would otherwise be +// allowed) do not interrupt ppoll. +TEST_F(PpollTest, SignalMaskBlocksSignal) { + absl::Duration duration(absl::Seconds(30)); + struct timespec timeout = absl::ToTimespec(duration); + absl::Duration timer_duration(absl::Seconds(10)); + + // Call with a mask that blocks SIGALRM. See that ppoll is not interrupted + // (i.e. returns 0) and that upon completion, the timer has fired. + sigset_t mask; + ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds()); + TEST_PCHECK(sigaddset(&mask, SIGALRM) == 0); + SetTimer(timer_duration); + MaybeSave(); + ASSERT_FALSE(TimerFired()); + ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, &mask, kSigsetSize), + SyscallSucceeds()); + EXPECT_TRUE(TimerFired()); + EXPECT_EQ(absl::DurationFromTimespec(timeout), absl::Duration()); +} + +// Verify that signals allowed by the ppoll mask (that would otherwise be +// blocked) interrupt ppoll. +TEST_F(PpollTest, SignalMaskAllowsSignal) { + absl::Duration duration(absl::Seconds(30)); + struct timespec timeout = absl::ToTimespec(duration); + absl::Duration timer_duration(absl::Seconds(10)); + + sigset_t mask; + ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds()); + + // Block SIGALRM. + auto cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGALRM)); + + // Call with a mask that unblocks SIGALRM. See that ppoll is interrupted. + SetTimer(timer_duration); + MaybeSave(); + ASSERT_FALSE(TimerFired()); + ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, &mask, kSigsetSize), + SyscallFailsWithErrno(EINTR)); + EXPECT_TRUE(TimerFired()); + EXPECT_GT(absl::DurationFromTimespec(timeout), absl::Duration()); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc new file mode 100644 index 000000000..44f3df6a3 --- /dev/null +++ b/test/syscalls/linux/prctl.cc @@ -0,0 +1,171 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
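The ppoll.cc tests above go through the raw syscall for the two reasons their comments call out: the kernel writes the time remaining back into the timespec, which the glibc wrapper hides by passing a private copy, and the kernel expects a signal mask of SIGRTMAX / 8 bytes rather than glibc's 128-byte sigset_t (hence kSigsetSize and the InvalidMaskSize case). A sketch of both differences, assuming Linux with glibc:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // For the ppoll() wrapper declaration.
#endif
#include <poll.h>
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cstdio>
#include <ctime>

int main() {
  // Raw syscall: with no fds this just sleeps, and the kernel updates the
  // timespec with the time remaining, here zero.
  struct timespec ts = {0, 50 * 1000 * 1000};
  syscall(SYS_ppoll, nullptr, 0, &ts, nullptr, SIGRTMAX / 8);
  std::printf("raw syscall leftover: %ld ns\n", ts.tv_nsec);

  // glibc wrapper: the caller's timespec is left untouched.
  ts = {0, 50 * 1000 * 1000};
  ppoll(nullptr, 0, &ts, nullptr);
  std::printf("glibc wrapper leftover: %ld ns\n", ts.tv_nsec);

  // Mask sizes: glibc reserves room for 1024 signals, the kernel for 64,
  // so ppoll rejects a non-null mask whose size is not SIGRTMAX / 8 == 8
  // bytes with EINVAL, which is what the InvalidMaskSize test relies on.
  std::printf("sizeof(sigset_t) = %zu, kernel mask = %d bytes\n",
              sizeof(sigset_t), SIGRTMAX / 8);
  return 0;
}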
+ +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_bool(prctl_no_new_privs_test_child, false, + "If true, exit with the return value of prctl(PR_GET_NO_NEW_PRIVS) " + "plus an offset (see test source)."); + +namespace gvisor { +namespace testing { + +namespace { + +TEST(PrctlTest, NameInitialized) { + const size_t name_length = 20; + char name[name_length] = {}; + ASSERT_THAT(prctl(PR_GET_NAME, name), SyscallSucceeds()); + ASSERT_NE(std::string(name), ""); +} + +TEST(PrctlTest, SetNameLongName) { + const size_t name_length = 20; + const std::string long_name(name_length, 'A'); + ASSERT_THAT(prctl(PR_SET_NAME, long_name.c_str()), SyscallSucceeds()); + char truncated_name[name_length] = {}; + ASSERT_THAT(prctl(PR_GET_NAME, truncated_name), SyscallSucceeds()); + const size_t truncated_length = 15; + ASSERT_EQ(long_name.substr(0, truncated_length), std::string(truncated_name)); +} + +// Offset added to exit code from test child to distinguish from other abnormal +// exits. +constexpr int kPrctlNoNewPrivsTestChildExitBase = 100; + +TEST(PrctlTest, NoNewPrivsPreservedAcrossCloneForkAndExecve) { + // Check if no_new_privs is already set. If it is, we can still test that it's + // preserved across clone/fork/execve, but we also expect it to still be set + // at the end of the test. Otherwise, call prctl(PR_SET_NO_NEW_PRIVS) so as + // not to contaminate the original thread. + int no_new_privs; + ASSERT_THAT(no_new_privs = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), + SyscallSucceeds()); + ScopedThread([] { + ASSERT_THAT(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), SyscallSucceeds()); + EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), + SyscallSucceedsWithValue(1)); + ScopedThread([] { + EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), + SyscallSucceedsWithValue(1)); + // Note that these ASSERT_*s failing will only return from this thread, + // but this is the intended behavior. + pid_t child_pid = -1; + int execve_errno = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec("/proc/self/exe", + {"/proc/self/exe", "--prctl_no_new_privs_test_child"}, {}, + nullptr, &child_pid, &execve_errno)); + + ASSERT_GT(child_pid, 0); + ASSERT_EQ(execve_errno, 0); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), + SyscallSucceeds()); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), kPrctlNoNewPrivsTestChildExitBase + 1); + + EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), + SyscallSucceedsWithValue(1)); + }); + EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), + SyscallSucceedsWithValue(1)); + }); + EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), + SyscallSucceedsWithValue(no_new_privs)); +} + +TEST(PrctlTest, PDeathSig) { + pid_t child_pid; + + // Make the new process' parent a separate thread since the parent death + // signal fires when the parent *thread* exits. + ScopedThread([&] { + child_pid = fork(); + TEST_CHECK(child_pid >= 0); + if (child_pid == 0) { + // In child process. + TEST_CHECK(prctl(PR_SET_PDEATHSIG, SIGKILL) >= 0); + int signo; + TEST_CHECK(prctl(PR_GET_PDEATHSIG, &signo) >= 0); + TEST_CHECK(signo == SIGKILL); + // Enable tracing, then raise SIGSTOP and expect our parent to suppress + // it. 
+ TEST_CHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) >= 0); + raise(SIGSTOP); + // Sleep until killed by our parent death signal. sleep(3) is + // async-signal-safe, absl::SleepFor isn't. + while (true) { + sleep(10); + } + } + // In parent process. + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << "status = " << status; + + // Suppress the SIGSTOP and detach from the child. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + }); + + // The child should have been killed by its parent death SIGKILL. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) + << "status = " << status; +} + +// This test is to validate that calling prctl with PR_SET_MM without the +// CAP_SYS_RESOURCE returns EPERM. +TEST(PrctlTest, InvalidPrSetMM) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE))) { + ASSERT_NO_ERRNO(SetCapability(CAP_SYS_RESOURCE, + false)); // Drop capability to test below. + } + ASSERT_THAT(prctl(PR_SET_MM, 0, 0, 0, 0), SyscallFailsWithErrno(EPERM)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); + + if (FLAGS_prctl_no_new_privs_test_child) { + exit(gvisor::testing::kPrctlNoNewPrivsTestChildExitBase + + prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0)); + } + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc new file mode 100644 index 000000000..c1b561464 --- /dev/null +++ b/test/syscalls/linux/prctl_setuid.cc @@ -0,0 +1,262 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_int32(scratch_uid, 65534, "scratch UID"); +// This flag is used to verify that after an exec PR_GET_KEEPCAPS +// returns 0, the return code will be offset by kPrGetKeepCapsExitBase. +DEFINE_bool(prctl_pr_get_keepcaps, false, + "If true the test will verify that prctl with pr_get_keepcaps" + "returns 0. The test will exit with the result of that check."); + +// These tests exist seperately from prctl because we need to start +// them as root. Setuid() has the behavior that permissions are fully +// removed if one of the UIDs were 0 before a setuid() call. This +// behavior can be changed by using PR_SET_KEEPCAPS and that is what +// is tested here. 
+// +// Reference setuid(2): +// The setuid() function checks the effective user ID of +// the caller and if it is the superuser, all process-related user ID's +// are set to uid. After this has occurred, it is impossible for the +// program to regain root privileges. +// +// Thus, a set-user-ID-root program wishing to temporarily drop root +// privileges, assume the identity of an unprivileged user, and then +// regain root privileges afterward cannot use setuid(). You can +// accomplish this with seteuid(2). +namespace gvisor { +namespace testing { + +// Offset added to exit code from test child to distinguish from other abnormal +// exits. +constexpr int kPrGetKeepCapsExitBase = 100; + +namespace { + +class PrctlKeepCapsSetuidTest : public ::testing::Test { + protected: + void SetUp() override { + // PR_GET_KEEPCAPS will only return 0 or 1 (on success). + ASSERT_THAT(original_keepcaps_ = prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0), + SyscallSucceeds()); + ASSERT_TRUE(original_keepcaps_ == 0 || original_keepcaps_ == 1); + } + + void TearDown() override { + // Restore PR_SET_KEEPCAPS. + ASSERT_THAT(prctl(PR_SET_KEEPCAPS, original_keepcaps_, 0, 0, 0), + SyscallSucceeds()); + + // Verify that it was restored. + ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0), + SyscallSucceedsWithValue(original_keepcaps_)); + } + + // The original keep caps value exposed so tests can use it if they need. + int original_keepcaps_ = 0; +}; + +// This test will verify that a bad value, eg. not 0 or 1 for +// PR_SET_KEEPCAPS will return EINVAL as required by prctl(2). +TEST_F(PrctlKeepCapsSetuidTest, PrctlBadArgsToKeepCaps) { + ASSERT_THAT(prctl(PR_SET_KEEPCAPS, 2, 0, 0, 0), + SyscallFailsWithErrno(EINVAL)); +} + +// This test will verify that a setuid(2) without PR_SET_KEEPCAPS will cause +// all capabilities to be dropped. +TEST_F(PrctlKeepCapsSetuidTest, SetUidNoKeepCaps) { + // getuid(2) never fails. + if (getuid() != 0) { + SKIP_IF(!IsRunningOnGvisor()); + FAIL() << "User is not root on gvisor platform."; + } + + // Do setuid in a separate thread so that after finishing this test, the + // process can still open files the test harness created before starting + // this test. Otherwise, the files are created by root (UID before the + // test), but cannot be opened by the `uid` set below after the test. After + // calling setuid(non-zero-UID), there is no way to get root privileges + // back. + ScopedThread([] { + // Start by verifying we have a capability. + TEST_CHECK(HaveCapability(CAP_SYS_ADMIN).ValueOrDie()); + + // Verify that PR_GET_KEEPCAPS is disabled. + ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0), + SyscallSucceedsWithValue(0)); + + // Use syscall instead of glibc setuid wrapper because we want this setuid + // call to only apply to this task. POSIX threads, however, require that + // all threads have the same UIDs, so using the setuid wrapper sets all + // threads' real UID. + EXPECT_THAT(syscall(SYS_setuid, FLAGS_scratch_uid), SyscallSucceeds()); + + // Verify that we changed uid. + EXPECT_THAT(getuid(), SyscallSucceedsWithValue(FLAGS_scratch_uid)); + + // Verify we lost the capability in the effective set, this always happens. 
+ TEST_CHECK(!HaveCapability(CAP_SYS_ADMIN).ValueOrDie()); + + // We should have also lost it in the permitted set by the setuid() so + // SetCapability should fail when we try to add it back to the effective set + ASSERT_FALSE(SetCapability(CAP_SYS_ADMIN, true).ok()); + }); +} + +// This test will verify that a setuid with PR_SET_KEEPCAPS will cause +// capabilities to be retained after we switch away from the root user. +TEST_F(PrctlKeepCapsSetuidTest, SetUidKeepCaps) { + // getuid(2) never fails. + if (getuid() != 0) { + SKIP_IF(!IsRunningOnGvisor()); + FAIL() << "User is not root on gvisor platform."; + } + + // Do setuid in a separate thread so that after finishing this test, the + // process can still open files the test harness created before starting + // this test. Otherwise, the files are created by root (UID before the + // test), but cannot be opened by the `uid` set below after the test. After + // calling setuid(non-zero-UID), there is no way to get root privileges + // back. + ScopedThread([] { + // Start by verifying we have a capability. + TEST_CHECK(HaveCapability(CAP_SYS_ADMIN).ValueOrDie()); + + // Set PR_SET_KEEPCAPS. + ASSERT_THAT(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), SyscallSucceeds()); + + // Verify PR_SET_KEEPCAPS was set before we proceed. + ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0), + SyscallSucceedsWithValue(1)); + + // Use syscall instead of glibc setuid wrapper because we want this setuid + // call to only apply to this task. POSIX threads, however, require that + // all threads have the same UIDs, so using the setuid wrapper sets all + // threads' real UID. + EXPECT_THAT(syscall(SYS_setuid, FLAGS_scratch_uid), SyscallSucceeds()); + + // Verify that we changed uid. + EXPECT_THAT(getuid(), SyscallSucceedsWithValue(FLAGS_scratch_uid)); + + // Verify we lost the capability in the effective set, this always happens. + TEST_CHECK(!HaveCapability(CAP_SYS_ADMIN).ValueOrDie()); + + // We lost the capability in the effective set, but it will still + // exist in the permitted set so we can elevate the capability. + ASSERT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, true)); + + // Verify we got back the capability in the effective set. + TEST_CHECK(HaveCapability(CAP_SYS_ADMIN).ValueOrDie()); + }); +} + +// This test will verify that PR_SET_KEEPCAPS is not retained +// across an execve. According to prctl(2): +// "The "keep capabilities" value will be reset to 0 on subsequent +// calls to execve(2)." +TEST_F(PrctlKeepCapsSetuidTest, NoKeepCapsAfterExec) { + ASSERT_THAT(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), SyscallSucceeds()); + + // Verify PR_SET_KEEPCAPS was set before we proceed. + ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0), SyscallSucceedsWithValue(1)); + + pid_t child_pid = -1; + int execve_errno = 0; + // Do an exec and then verify that PR_GET_KEEPCAPS returns 0 + // see the body of main below. + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec( + "/proc/self/exe", {"/proc/self/exe", "--prctl_pr_get_keepcaps"}, {}, + nullptr, &child_pid, &execve_errno)); + + ASSERT_GT(child_pid, 0); + ASSERT_EQ(execve_errno, 0); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + ASSERT_TRUE(WIFEXITED(status)); + // PR_SET_KEEPCAPS should have been cleared by the exec. 
+ // Success should return gvisor::testing::kPrGetKeepCapsExitBase + 0 + ASSERT_EQ(WEXITSTATUS(status), kPrGetKeepCapsExitBase); +} + +TEST_F(PrctlKeepCapsSetuidTest, NoKeepCapsAfterNewUserNamespace) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + + // Fork to avoid changing the user namespace of the original test process. + pid_t const child_pid = fork(); + + if (child_pid == 0) { + // Verify that the keepcaps flag is set to 0 when we change user namespaces. + TEST_PCHECK(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) == 0); + MaybeSave(); + + TEST_PCHECK(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0) == 1); + MaybeSave(); + + TEST_PCHECK(unshare(CLONE_NEWUSER) == 0); + MaybeSave(); + + TEST_PCHECK(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0) == 0); + MaybeSave(); + + _exit(0); + } + + int status; + ASSERT_THAT(child_pid, SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status = " << status; +} + +// This test will verify that PR_SET_KEEPCAPS and PR_GET_KEEPCAPS work correctly +TEST_F(PrctlKeepCapsSetuidTest, PrGetKeepCaps) { + // Set PR_SET_KEEPCAPS to the negation of the original. + ASSERT_THAT(prctl(PR_SET_KEEPCAPS, !original_keepcaps_, 0, 0, 0), + SyscallSucceeds()); + + // Verify it was set. + ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0), + SyscallSucceedsWithValue(!original_keepcaps_)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); + + if (FLAGS_prctl_pr_get_keepcaps) { + return gvisor::testing::kPrGetKeepCapsExitBase + + prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0); + } + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc new file mode 100644 index 000000000..4e5bcfcde --- /dev/null +++ b/test/syscalls/linux/pread64.cc @@ -0,0 +1,152 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
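Outside the test harness, the behavior pinned down by the prctl_setuid.cc tests above is the classic pattern for a root process that wants to switch to an unprivileged UID while keeping one capability: set PR_SET_KEEPCAPS, change UID, then re-raise the capability that setuid() left in the permitted set but cleared from the effective set. A sketch of that pattern; it assumes libcap, which is an assumption of this sketch rather than anything the tests require:

#include <sys/capability.h>  // libcap; link with -lcap.
#include <sys/prctl.h>
#include <unistd.h>

// Drop from root to `uid` while keeping CAP_SYS_ADMIN usable afterwards.
bool DropToUidKeepingAdmin(uid_t uid) {
  // Without KEEPCAPS, setuid() away from UID 0 clears the permitted set as
  // well, and the capability is gone for good.
  if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0) return false;
  if (setuid(uid) != 0) return false;
  // setuid() always clears the effective set, so re-raise the capability
  // that KEEPCAPS preserved in the permitted set.
  cap_t caps = cap_get_proc();
  if (caps == nullptr) return false;
  cap_value_t admin = CAP_SYS_ADMIN;
  cap_set_flag(caps, CAP_EFFECTIVE, 1, &admin, CAP_SET);
  const bool ok = cap_set_proc(caps) == 0;
  cap_free(caps);
  return ok;
}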
+ +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class Pread64Test : public ::testing::Test { + void SetUp() override { + name_ = NewTempAbsPath(); + ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_CREAT, 0644)); + } + + void TearDown() override { unlink(name_.c_str()); } + + public: + std::string name_; +}; + +TEST(Pread64TestNoTempFile, BadFileDescriptor) { + char buf[1024]; + EXPECT_THAT(pread64(-1, buf, 1024, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(Pread64Test, ZeroBuffer) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDWR)); + + char msg[] = "hello world"; + EXPECT_THAT(pwrite64(fd.get(), msg, strlen(msg), 0), + SyscallSucceedsWithValue(strlen(msg))); + + char buf[10]; + EXPECT_THAT(pread64(fd.get(), buf, 0, 0), SyscallSucceedsWithValue(0)); +} + +TEST_F(Pread64Test, BadBuffer) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDWR)); + + char msg[] = "hello world"; + EXPECT_THAT(pwrite64(fd.get(), msg, strlen(msg), 0), + SyscallSucceedsWithValue(strlen(msg))); + + char* bad_buffer = nullptr; + EXPECT_THAT(pread64(fd.get(), bad_buffer, 1024, 0), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(Pread64Test, WriteOnlyNotReadable) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_WRONLY)); + + char buf[1024]; + EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(Pread64Test, DirNotReadable) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), O_RDONLY)); + + char buf[1024]; + EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallFailsWithErrno(EISDIR)); +} + +TEST_F(Pread64Test, BadOffset) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDONLY)); + + char buf[1024]; + EXPECT_THAT(pread64(fd.get(), buf, 1024, -1), SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(Pread64Test, OffsetNotIncremented) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDWR)); + + char msg[] = "hello world"; + EXPECT_THAT(write(fd.get(), msg, strlen(msg)), + SyscallSucceedsWithValue(strlen(msg))); + int offset; + EXPECT_THAT(offset = lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds()); + + char buf1[1024]; + EXPECT_THAT(pread64(fd.get(), buf1, 1024, 0), + SyscallSucceedsWithValue(strlen(msg))); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(offset)); + + char buf2[1024]; + EXPECT_THAT(pread64(fd.get(), buf2, 1024, 3), + SyscallSucceedsWithValue(strlen(msg) - 3)); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(offset)); +} + +TEST_F(Pread64Test, EndOfFile) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDONLY)); + + char buf[1024]; + EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallSucceedsWithValue(0)); +} + +TEST(Pread64TestNoTempFile, CantReadSocketPair_NoRandomSave) { + int sock_fds[2]; + EXPECT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds), SyscallSucceeds()); + + char buf[1024]; + EXPECT_THAT(pread64(sock_fds[0], buf, 1024, 0), + SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(pread64(sock_fds[1], buf, 1024, 0), + SyscallFailsWithErrno(ESPIPE)); + + EXPECT_THAT(close(sock_fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(sock_fds[1]), SyscallSucceeds()); +} + +TEST(Pread64TestNoTempFile, CantReadPipe) { + char buf[1024]; + + int pipe_fds[2]; 
+ EXPECT_THAT(pipe(pipe_fds), SyscallSucceeds()); + + EXPECT_THAT(pread64(pipe_fds[0], buf, 1024, 0), + SyscallFailsWithErrno(ESPIPE)); + + EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc new file mode 100644 index 000000000..8d3aed43c --- /dev/null +++ b/test/syscalls/linux/preadv.cc @@ -0,0 +1,94 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/logging.h" +#include "test/util/memory_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +namespace { +TEST(PreadvTest, MMConcurrencyStress) { + // Fill a one-page file with zeroes (the contents don't really matter). + const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + /* parent = */ GetAbsoluteTestTmpdir(), + /* content = */ std::string(kPageSize, 0), TempPath::kDefaultFileMode)); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + // Get a one-page private mapping to read to. + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + + // Repeatedly fork in a separate thread to force the mapping to become + // copy-on-write. + std::atomic done(false); + const ScopedThread t([&] { + while (!done.load()) { + const pid_t pid = fork(); + TEST_CHECK(pid >= 0); + if (pid == 0) { + // In child. The parent was obviously multithreaded, so it's neither + // safe nor necessary to do much more than exit. + syscall(SYS_exit_group, 0); + } + int status; + ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), + SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status = " << status; + } + }); + + // Repeatedly read to the mapping. + struct iovec iov[2]; + iov[0].iov_base = m.ptr(); + iov[0].iov_len = kPageSize / 2; + iov[1].iov_base = reinterpret_cast(m.addr() + kPageSize / 2); + iov[1].iov_len = kPageSize / 2; + constexpr absl::Duration kTestDuration = absl::Seconds(5); + const absl::Time end = absl::Now() + kTestDuration; + while (absl::Now() < end) { + // Among other causes, save/restore cycles may cause interruptions resulting + // in partial reads, so we don't expect any particular return value. + EXPECT_THAT(RetryEINTR(preadv)(fd.get(), iov, 2, 0), SyscallSucceeds()); + } + + // Stop the other thread. + done.store(true); + + // The test passes if it neither deadlocks nor crashes the OS. 
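The stress test above only checks that preadv(2) keeps working while the destination mapping is concurrently forced copy-on-write; it never inspects the data. For reference, the plain scatter-read behavior being exercised looks roughly like this; the fd, buffers, and helper name are illustrative, not part of the test:

#include <sys/types.h>
#include <sys/uio.h>

// Fill two separate buffers from `fd` starting at `offset` in one call.
// Returns the byte count, which, like any read, may be short.
ssize_t ScatterRead(int fd, off_t offset, char* a, size_t a_len, char* b,
                    size_t b_len) {
  struct iovec iov[2];
  iov[0].iov_base = a;
  iov[0].iov_len = a_len;
  iov[1].iov_base = b;
  iov[1].iov_len = b_len;
  // Like pread(2), preadv() does not move the file offset; the data read
  // is split across the buffers in order.
  return preadv(fd, iov, 2, offset);
}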
+} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc new file mode 100644 index 000000000..642eed624 --- /dev/null +++ b/test/syscalls/linux/preadv2.cc @@ -0,0 +1,217 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/syscalls/linux/readv_common.h" +#include "test/util/file_descriptor.h" +#include "test/util/memory_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +#ifndef SYS_preadv2 +#if defined(__x86_64__) +#define SYS_preadv2 327 +#else +#error "Unknown architecture" +#endif +#endif // SYS_preadv2 + +#ifndef RWF_HIPRI +#define RWF_HIPRI 0x1 +#endif // RWF_HIPRI + +constexpr int kBufSize = 1024; + +std::string SetContent() { + std::string content; + for (int i = 0; i < kBufSize; i++) { + content += static_cast((i % 10) + '0'); + } + return content; +} + +// This test is the base case where we call preadv (no offset, no flags). +TEST(Preadv2Test, TestBaseCall) { + if (!IsRunningOnGvisor()) { + SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); + } + std::string content = SetContent(); + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), content, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + std::vector buf(kBufSize); + struct iovec iov; + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt*/ 1, + /*offset=*/0, /*flags=*/0), + SyscallSucceedsWithValue(kBufSize)); + + EXPECT_EQ(content, std::string(buf.data(), buf.size())); +} + +// This test is where we call preadv with an offset and no flags. +TEST(Preadv2Test, TestValidPositiveOffset) { + if (!IsRunningOnGvisor()) { + SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); + } + std::string content = SetContent(); + const std::string prefix = "0"; + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), prefix + content, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + std::vector buf(kBufSize, '0'); + struct iovec iov; + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, + /*offset=*/prefix.size(), /*flags=*/0), + SyscallSucceedsWithValue(kBufSize)); + + EXPECT_EQ(content, std::string(buf.data(), buf.size())); +} + +// This test is the base case where we call readv by using -1 as the offset. The +// read should use the file offset, so the test increments it by one prior to +// calling preadv2. 
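The comment above and the TestNegativeOneOffset case that follows exercise preadv2()'s special offset of -1, which falls back to the current file offset and advances it, unlike an explicit offset, which leaves the file offset untouched. A minimal standalone sketch of that difference, assuming glibc 2.26+ for the preadv2() wrapper and a 4.6+ kernel; this is an illustration added for clarity, not part of the patch:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // preadv2() is a GNU extension.
#endif
#include <fcntl.h>
#include <sys/uio.h>
#include <unistd.h>
#include <cstdio>

int main() {
  int fd = open("/proc/self/status", O_RDONLY);
  if (fd < 0) return 1;
  char a[16] = {}, b[16] = {};
  struct iovec iov_a = {a, sizeof(a)};
  struct iovec iov_b = {b, sizeof(b)};
  // Explicit offset: the file offset is neither consulted nor updated.
  preadv2(fd, &iov_a, 1, /*offset=*/0, /*flags=*/0);
  printf("offset after explicit-offset read: %ld\n", (long)lseek(fd, 0, SEEK_CUR));  // 0
  // offset == -1: behaves like readv(), using and advancing the file offset.
  preadv2(fd, &iov_b, 1, /*offset=*/-1, /*flags=*/0);
  printf("offset after offset=-1 read: %ld\n", (long)lseek(fd, 0, SEEK_CUR));  // 16
  close(fd);
  return 0;
}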
+TEST(Preadv2Test, TestNegativeOneOffset) { + if (!IsRunningOnGvisor()) { + SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); + } + std::string content = SetContent(); + const std::string prefix = "231"; + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), prefix + content, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + ASSERT_THAT(lseek(fd.get(), prefix.size(), SEEK_SET), + SyscallSucceedsWithValue(prefix.size())); + + std::vector buf(kBufSize, '0'); + struct iovec iov; + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, + /*offset=*/static_cast(-1), /*flags=*/0), + SyscallSucceedsWithValue(kBufSize)); + + EXPECT_EQ(content, std::string(buf.data(), buf.size())); +} + +// This test calls preadv2 with an invalid flag. +TEST(Preadv2Test, TestInvalidFlag) { + if (!IsRunningOnGvisor()) { + SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); + } + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY | O_DIRECT)); + + struct iovec iov; + + EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, + /*offset=*/0, /*flags=*/RWF_HIPRI << 1), + SyscallFailsWithErrno(EINVAL)); +} + +// This test calls preadv2 with an invalid offset. +TEST(Preadv2Test, TestInvalidOffset) { + if (!IsRunningOnGvisor()) { + SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); + } + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY | O_DIRECT)); + struct iovec iov; + + EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, + /*offset=*/static_cast(-8), /*flags=*/RWF_HIPRI), + SyscallFailsWithErrno(EINVAL)); +} + +// This test calls preadv with a file set O_WRONLY. +TEST(Preadv2Test, TestUnreadableFile) { + if (!IsRunningOnGvisor()) { + SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); + } + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY)); + struct iovec iov; + + EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, + /*offset=*/0, /*flags=*/0), + SyscallFailsWithErrno(EBADF)); +} + +// Calling preadv2 with a non-negative offset calls preadv. Calling preadv with +// an unseekable file is not allowed. A pipe is used for an unseekable file. 
+TEST(Preadv2Test, TestUnseekableFile) { + if (!IsRunningOnGvisor()) { + SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); + } + + int pipe_fds[2]; + + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + + struct iovec iov; + + EXPECT_THAT(syscall(SYS_preadv2, pipe_fds[0], &iov, /*iov_cnt=*/1, + /*offset=*/2, /*flags=*/0), + SyscallFailsWithErrno(ESPIPE)); + + EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/priority.cc b/test/syscalls/linux/priority.cc new file mode 100644 index 000000000..69a58a422 --- /dev/null +++ b/test/syscalls/linux/priority.cc @@ -0,0 +1,215 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// These tests are for both the getpriority(2) and setpriority(2) syscalls +// These tests are very rudimentary because getpriority and setpriority +// have not yet been fully implemented. 
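The priority tests that follow lean on the getpriority(2) convention quoted in the first test comment: the call can legitimately return -1, so errno must be cleared before the call and checked afterwards. A small sketch of that idiom, added for illustration only and not part of the patch:

#include <sys/resource.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

int main() {
  errno = 0;  // Required: -1 is a valid return value for getpriority().
  int nice_val = getpriority(PRIO_PROCESS, /*who=*/0);  // who == 0: calling process.
  if (errno != 0) {
    perror("getpriority");
    return 1;
  }
  printf("current nice value: %d\n", nice_val);
  return 0;
}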
+ +// Getpriority does something +TEST(GetpriorityTest, Implemented) { + // "getpriority() can legitimately return the value -1, it is necessary to + // clear the external variable errno prior to the call" + errno = 0; + EXPECT_THAT(getpriority(PRIO_PROCESS, /*who=*/0), SyscallSucceeds()); +} + +// Invalid which +TEST(GetpriorityTest, InvalidWhich) { + errno = 0; + EXPECT_THAT(getpriority(/*which=*/3, /*who=*/0), + SyscallFailsWithErrno(EINVAL)); +} + +// Process is found when which=PRIO_PROCESS +TEST(GetpriorityTest, ValidWho) { + errno = 0; + EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), SyscallSucceeds()); +} + +// Process is not found when which=PRIO_PROCESS +TEST(GetpriorityTest, InvalidWho) { + errno = 0; + // Flaky, but it's tough to avoid a race condition when finding an unused pid + EXPECT_THAT(getpriority(PRIO_PROCESS, /*who=*/INT_MAX - 1), + SyscallFailsWithErrno(ESRCH)); +} + +// Setpriority does something +TEST(SetpriorityTest, Implemented) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + // No need to clear errno for setpriority(): + // "The setpriority() call returns 0 if there is no error, or -1 if there is" + EXPECT_THAT(setpriority(PRIO_PROCESS, /*who=*/0, /*nice=*/16), + SyscallSucceeds()); +} + +// Invalid which +TEST(Setpriority, InvalidWhich) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + EXPECT_THAT(setpriority(/*which=*/3, /*who=*/0, /*nice=*/16), + SyscallFailsWithErrno(EINVAL)); +} + +// Process is found when which=PRIO_PROCESS +TEST(SetpriorityTest, ValidWho) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/16), + SyscallSucceeds()); +} + +// niceval is within the range [-20, 19] +TEST(SetpriorityTest, InsideRange) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + // Set 0 < niceval < 19 + int nice = 12; + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), nice), SyscallSucceeds()); + + errno = 0; + EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), + SyscallSucceedsWithValue(nice)); + + // Set -20 < niceval < 0 + nice = -12; + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), nice), SyscallSucceeds()); + + errno = 0; + EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), + SyscallSucceedsWithValue(nice)); +} + +// Verify that priority/niceness are exposed via /proc/PID/stat. +TEST(SetpriorityTest, NicenessExposedViaProcfs) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + constexpr int kNiceVal = 12; + ASSERT_THAT(setpriority(PRIO_PROCESS, getpid(), kNiceVal), SyscallSucceeds()); + + errno = 0; + ASSERT_THAT(getpriority(PRIO_PROCESS, getpid()), + SyscallSucceedsWithValue(kNiceVal)); + + // Now verify we can read that same value via /proc/self/stat. + std::string proc_stat; + ASSERT_NO_ERRNO(GetContents("/proc/self/stat", &proc_stat)); + std::vector pieces = absl::StrSplit(proc_stat, ' '); + ASSERT_GT(pieces.size(), 20); + + int niceness_procfs = 0; + ASSERT_TRUE(absl::SimpleAtoi(pieces[18], &niceness_procfs)); + EXPECT_EQ(niceness_procfs, kNiceVal); +} + +// In the kernel's implementation, values outside the range of [-20, 19] are +// truncated to these minimum and maximum values. 
See +// https://elixir.bootlin.com/linux/v4.4/source/kernel/sys.c#L190 +TEST(SetpriorityTest, OutsideRange) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + // Set niceval > 19 + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/100), + SyscallSucceeds()); + + errno = 0; + // Test niceval truncated to 19 + EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), + SyscallSucceedsWithValue(/*maxnice=*/19)); + + // Set niceval < -20 + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/-100), + SyscallSucceeds()); + + errno = 0; + // Test niceval truncated to -20 + EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), + SyscallSucceedsWithValue(/*minnice=*/-20)); +} + +// Process is not found when which=PRIO_PROCESS +TEST(SetpriorityTest, InvalidWho) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + // Flaky, but it's tough to avoid a race condition when finding an unused pid + EXPECT_THAT(setpriority(PRIO_PROCESS, + /*who=*/INT_MAX - 1, + /*nice=*/16), + SyscallFailsWithErrno(ESRCH)); +} + +// Nice succeeds, correctly modifies (or in this case does not +// modify priority of process +TEST(SetpriorityTest, NiceSucceeds) { + errno = 0; + const int priority_before = getpriority(PRIO_PROCESS, /*who=*/0); + ASSERT_THAT(nice(/*inc=*/0), SyscallSucceeds()); + + // nice(0) should not change priority + EXPECT_EQ(priority_before, getpriority(PRIO_PROCESS, /*who=*/0)); +} + +// Threads resulting from clone() maintain parent's priority +// Changes to child priority do not affect parent's priority +TEST(GetpriorityTest, CloneMaintainsPriority) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); + + constexpr int kParentPriority = 16; + constexpr int kChildPriority = 14; + ASSERT_THAT(setpriority(PRIO_PROCESS, getpid(), kParentPriority), + SyscallSucceeds()); + + ScopedThread([kParentPriority, kChildPriority]() { + // Check that priority equals that of parent thread + pid_t my_tid; + EXPECT_THAT(my_tid = syscall(__NR_gettid), SyscallSucceeds()); + EXPECT_THAT(getpriority(PRIO_PROCESS, my_tid), + SyscallSucceedsWithValue(kParentPriority)); + + // Change the child thread's priority + EXPECT_THAT(setpriority(PRIO_PROCESS, my_tid, kChildPriority), + SyscallSucceeds()); + }); + + // Check that parent's priority reemained the same even though + // the child's priority was altered + EXPECT_EQ(kParentPriority, getpriority(PRIO_PROCESS, syscall(__NR_gettid))); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/priority_execve.cc b/test/syscalls/linux/priority_execve.cc new file mode 100644 index 000000000..5604bd3d0 --- /dev/null +++ b/test/syscalls/linux/priority_execve.cc @@ -0,0 +1,42 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
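The OutsideRange test above depends on the kernel clamping nice values to [-20, 19] rather than rejecting them. A standalone sketch of that behavior, using only the unprivileged direction (lowering priority); this is an editorial illustration, not part of the patch, and the small priority_execve helper binary continues below:

#include <sys/resource.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

int main() {
  // Lowering priority needs no privilege; raising it back would require
  // CAP_SYS_NICE, which is why the tests above SKIP_IF it is missing.
  if (setpriority(PRIO_PROCESS, getpid(), /*nice=*/100) != 0) {
    perror("setpriority");
    return 1;
  }
  errno = 0;
  int nice_val = getpriority(PRIO_PROCESS, getpid());
  if (errno != 0) {
    perror("getpriority");
    return 1;
  }
  printf("requested 100, kernel stored %d\n", nice_val);  // Expect 19.
  return 0;
}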
+ +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char** argv, char** envp) { + errno = 0; + int prio = getpriority(PRIO_PROCESS, getpid()); + + // NOTE: getpriority() can legitimately return negative values + // in the range [-20, 0). If errno is set, exit with a value that + // could not be reached by a valid priority. Valid exit values + // for the test are in the range [1, 40], so we'll use 0. + if (errno != 0) { + printf("getpriority() failed with errno = %d\n", errno); + exit(0); + } + + // Used by test to verify priority is being maintained through + // calls to execve(). Since prio should always be in the range + // [-20, 19], we offset by 20 so as not to have negative exit codes. + exit(20 - prio); + + return 0; +} diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc new file mode 100644 index 000000000..e64df97b0 --- /dev/null +++ b/test/syscalls/linux/proc.cc @@ -0,0 +1,1830 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/ascii.h" +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/memory_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" +#include "test/util/timer_util.h" + +// NOTE: No, this isn't really a syscall but this is a really simple +// way to get it tested on both gVisor, PTrace and Linux. + +using ::testing::AllOf; +using ::testing::ContainerEq; +using ::testing::Contains; +using ::testing::ContainsRegex; +using ::testing::Gt; +using ::testing::HasSubstr; +using ::testing::IsSupersetOf; +using ::testing::Pair; +using ::testing::UnorderedElementsAre; +using ::testing::UnorderedElementsAreArray; + +// Exported by glibc. +extern char** environ; + +namespace gvisor { +namespace testing { +namespace { + +// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0 +// because "it isn't needed", even though Linux can return it via F_GETFL. +constexpr int kOLargeFile = 00100000; + +// Takes the subprocess command line and pid. +// If it returns !OK, WithSubprocess returns immediately. +using SubprocessCallback = std::function; + +std::vector saved_argv; // NOLINT + +// Helper function to dump /proc/{pid}/status and check the +// state data. 
State should = "Z" for zombied or "RSD" for +// running, interruptible sleeping (S), or uninterruptible sleep +// (D). +void CompareProcessState(absl::string_view state, int pid) { + auto status_file = ASSERT_NO_ERRNO_AND_VALUE( + GetContents(absl::StrCat("/proc/", pid, "/status"))); + EXPECT_THAT(status_file, ContainsRegex(absl::StrCat("State:.[", state, + "]\\s+\\(\\w+\\)"))); +} + +// Run callbacks while a subprocess is running, zombied, and/or exited. +PosixError WithSubprocess(SubprocessCallback const& running, + SubprocessCallback const& zombied, + SubprocessCallback const& exited) { + int pipe_fds[2] = {}; + if (pipe(pipe_fds) < 0) { + return PosixError(errno, "pipe"); + } + + int child_pid = fork(); + if (child_pid < 0) { + return PosixError(errno, "fork"); + } + + if (child_pid == 0) { + close(pipe_fds[0]); // Close the read end. + const DisableSave ds; // Timing issues. + + // Write to the pipe to tell it we're ready. + char buf = 'a'; + int res = 0; + res = WriteFd(pipe_fds[1], &buf, sizeof(buf)); + TEST_CHECK_MSG(res == sizeof(buf), "Write failure in subprocess"); + + while (true) { + SleepSafe(absl::Milliseconds(100)); + } + __builtin_unreachable(); + } + + close(pipe_fds[1]); // Close the write end. + + int status = 0; + auto wait_cleanup = Cleanup([child_pid, &status] { + EXPECT_THAT(waitpid(child_pid, &status, 0), SyscallSucceeds()); + }); + auto kill_cleanup = Cleanup([child_pid] { + EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); + }); + + // Wait for the child. + char buf = 0; + int res = ReadFd(pipe_fds[0], &buf, sizeof(buf)); + if (res < 0) { + return PosixError(errno, "Read from pipe"); + } else if (res == 0) { + return PosixError(EPIPE, "Unable to read from pipe: EOF"); + } + + if (running) { + // The first arg, RSD, refers to a "running process", or a process with a + // state of Running (R), Interruptable Sleep (S) or Uninterruptable + // Sleep (D). + CompareProcessState("RSD", child_pid); + RETURN_IF_ERRNO(running(child_pid)); + } + + // Kill the process. + kill_cleanup.Release()(); + siginfo_t info; + // Wait until the child process has exited (WEXITED flag) but don't + // reap the child (WNOWAIT flag). + waitid(P_PID, child_pid, &info, WNOWAIT | WEXITED); + + if (zombied) { + // Arg of "Z" refers to a Zombied Process. + CompareProcessState("Z", child_pid); + RETURN_IF_ERRNO(zombied(child_pid)); + } + + // Wait on the process. + wait_cleanup.Release()(); + // If the process is reaped, then then this should return + // with ECHILD. + EXPECT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallFailsWithErrno(ECHILD)); + + if (exited) { + RETURN_IF_ERRNO(exited(child_pid)); + } + + return NoError(); +} + +// Access the file returned by name when a subprocess is running. +PosixError AccessWhileRunning(std::function name, int flags, + std::function access) { + FileDescriptor fd; + return WithSubprocess( + [&](int pid) -> PosixError { + // Running. + ASSIGN_OR_RETURN_ERRNO(fd, Open(name(pid), flags)); + + access(fd.get()); + return NoError(); + }, + nullptr, nullptr); +} + +// Access the file returned by name when the a subprocess is zombied. +PosixError AccessWhileZombied(std::function name, int flags, + std::function access) { + FileDescriptor fd; + return WithSubprocess( + [&](int pid) -> PosixError { + // Running. + ASSIGN_OR_RETURN_ERRNO(fd, Open(name(pid), flags)); + return NoError(); + }, + [&](int pid) -> PosixError { + // Zombied. 
+ access(fd.get()); + return NoError(); + }, + nullptr); +} + +// Access the file returned by name when the a subprocess is exited. +PosixError AccessWhileExited(std::function name, int flags, + std::function access) { + FileDescriptor fd; + return WithSubprocess( + [&](int pid) -> PosixError { + // Running. + ASSIGN_OR_RETURN_ERRNO(fd, Open(name(pid), flags)); + return NoError(); + }, + nullptr, + [&](int pid) -> PosixError { + // Exited. + access(fd.get()); + return NoError(); + }); +} + +// ReadFd(fd=/proc/PID/basename) while PID is running. +int ReadWhileRunning(std::string const& basename, void* buf, size_t count) { + int ret = 0; + int err = 0; + EXPECT_NO_ERRNO(AccessWhileRunning( + [&](int pid) -> std::string { + return absl::StrCat("/proc/", pid, "/", basename); + }, + O_RDONLY, + [&](int fd) { + ret = ReadFd(fd, buf, count); + err = errno; + })); + errno = err; + return ret; +} + +// ReadFd(fd=/proc/PID/basename) while PID is zombied. +int ReadWhileZombied(std::string const& basename, void* buf, size_t count) { + int ret = 0; + int err = 0; + EXPECT_NO_ERRNO(AccessWhileZombied( + [&](int pid) -> std::string { + return absl::StrCat("/proc/", pid, "/", basename); + }, + O_RDONLY, + [&](int fd) { + ret = ReadFd(fd, buf, count); + err = errno; + })); + errno = err; + return ret; +} + +// ReadFd(fd=/proc/PID/basename) while PID is exited. +int ReadWhileExited(std::string const& basename, void* buf, size_t count) { + int ret = 0; + int err = 0; + EXPECT_NO_ERRNO(AccessWhileExited( + [&](int pid) -> std::string { + return absl::StrCat("/proc/", pid, "/", basename); + }, + O_RDONLY, + [&](int fd) { + ret = ReadFd(fd, buf, count); + err = errno; + })); + errno = err; + return ret; +} + +// readlinkat(fd=/proc/PID/, basename) while PID is running. +int ReadlinkWhileRunning(std::string const& basename, char* buf, size_t count) { + int ret = 0; + int err = 0; + EXPECT_NO_ERRNO(AccessWhileRunning( + [&](int pid) -> std::string { return absl::StrCat("/proc/", pid, "/"); }, + O_DIRECTORY, + [&](int fd) { + ret = readlinkat(fd, basename.c_str(), buf, count); + err = errno; + })); + errno = err; + return ret; +} + +// readlinkat(fd=/proc/PID/, basename) while PID is zombied. +int ReadlinkWhileZombied(std::string const& basename, char* buf, size_t count) { + int ret = 0; + int err = 0; + EXPECT_NO_ERRNO(AccessWhileZombied( + [&](int pid) -> std::string { return absl::StrCat("/proc/", pid, "/"); }, + O_DIRECTORY, + [&](int fd) { + ret = readlinkat(fd, basename.c_str(), buf, count); + err = errno; + })); + errno = err; + return ret; +} + +// readlinkat(fd=/proc/PID/, basename) while PID is exited. 
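Before the exited-process readlink helper that follows: CompareProcessState() above matches the State line of /proc/<pid>/status with a regex. The same field can be pulled out directly with plain stream I/O, as in this small sketch (illustration only, assuming a Linux /proc mount):

#include <fstream>
#include <iostream>
#include <string>

int main() {
  std::ifstream status("/proc/self/status");
  std::string line;
  while (std::getline(status, line)) {
    if (line.rfind("State:", 0) == 0) {  // Line starts with "State:".
      std::cout << line << "\n";         // e.g. "State:  R (running)"
      return 0;
    }
  }
  std::cerr << "State line not found\n";
  return 1;
}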
+int ReadlinkWhileExited(std::string const& basename, char* buf, size_t count) { + int ret = 0; + int err = 0; + EXPECT_NO_ERRNO(AccessWhileExited( + [&](int pid) -> std::string { return absl::StrCat("/proc/", pid, "/"); }, + O_DIRECTORY, + [&](int fd) { + ret = readlinkat(fd, basename.c_str(), buf, count); + err = errno; + })); + errno = err; + return ret; +} + +TEST(ProcSelfTest, IsThreadGroupLeader) { + ScopedThread([] { + const pid_t tgid = getpid(); + const pid_t tid = syscall(SYS_gettid); + EXPECT_NE(tgid, tid); + auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self")); + EXPECT_EQ(link, absl::StrCat(tgid)); + }); +} + +TEST(ProcThreadSelfTest, Basic) { + const pid_t tgid = getpid(); + const pid_t tid = syscall(SYS_gettid); + EXPECT_EQ(tgid, tid); + auto link_threadself = + ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self")); + EXPECT_EQ(link_threadself, absl::StrCat(tgid, "/task/", tid)); + // Just read one file inside thread-self to ensure that the link is valid. + auto link_threadself_exe = + ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self/exe")); + auto link_procself_exe = + ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/exe")); + EXPECT_EQ(link_threadself_exe, link_procself_exe); +} + +TEST(ProcThreadSelfTest, Thread) { + ScopedThread([] { + const pid_t tgid = getpid(); + const pid_t tid = syscall(SYS_gettid); + EXPECT_NE(tgid, tid); + auto link_threadself = + ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self")); + + EXPECT_EQ(link_threadself, absl::StrCat(tgid, "/task/", tid)); + // Just read one file inside thread-self to ensure that the link is valid. + auto link_threadself_exe = + ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self/exe")); + auto link_procself_exe = + ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/exe")); + EXPECT_EQ(link_threadself_exe, link_procself_exe); + // A thread should not have "/proc//task". + struct stat s; + EXPECT_THAT(stat("/proc/thread-self/task", &s), + SyscallFailsWithErrno(ENOENT)); + }); +} + +// Returns the /proc/PID/maps entry for the MAP_PRIVATE | MAP_ANONYMOUS mapping +// m with start address addr and length len. +std::string AnonymousMapsEntry(uintptr_t addr, size_t len, int prot) { + return absl::StrCat(absl::Hex(addr, absl::PadSpec::kZeroPad8), "-", + absl::Hex(addr + len, absl::PadSpec::kZeroPad8), " ", + prot & PROT_READ ? "r" : "-", + prot & PROT_WRITE ? "w" : "-", + prot & PROT_EXEC ? 
"x" : "-", "p 00000000 00:00 0 "); +} + +std::string AnonymousMapsEntryForMapping(const Mapping& m, int prot) { + return AnonymousMapsEntry(m.addr(), m.len(), prot); +} + +PosixErrorOr> ReadProcSelfAuxv() { + std::string auxv_file; + RETURN_IF_ERRNO(GetContents("/proc/self/auxv", &auxv_file)); + const Elf64_auxv_t* auxv_data = + reinterpret_cast(auxv_file.data()); + std::map auxv_entries; + for (int i = 0; auxv_data[i].a_type != AT_NULL; i++) { + auto a_type = auxv_data[i].a_type; + EXPECT_EQ(0, auxv_entries.count(a_type)) << "a_type: " << a_type; + auxv_entries.emplace(a_type, auxv_data[i].a_un.a_val); + } + return auxv_entries; +} + +TEST(ProcSelfAuxv, EntryPresence) { + auto auxv_entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfAuxv()); + + EXPECT_EQ(auxv_entries.count(AT_ENTRY), 1); + EXPECT_EQ(auxv_entries.count(AT_PHDR), 1); + EXPECT_EQ(auxv_entries.count(AT_PHENT), 1); + EXPECT_EQ(auxv_entries.count(AT_PHNUM), 1); + EXPECT_EQ(auxv_entries.count(AT_BASE), 1); + EXPECT_EQ(auxv_entries.count(AT_CLKTCK), 1); + EXPECT_EQ(auxv_entries.count(AT_RANDOM), 1); + EXPECT_EQ(auxv_entries.count(AT_EXECFN), 1); + EXPECT_EQ(auxv_entries.count(AT_PAGESZ), 1); + EXPECT_EQ(auxv_entries.count(AT_SYSINFO_EHDR), 1); +} + +TEST(ProcSelfAuxv, EntryValues) { + auto proc_auxv = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfAuxv()); + + // We need to find the ELF auxiliary vector. The section of memory pointed to + // by envp contains some pointers to non-null pointers, followed by a single + // pointer to a null pointer, followed by the auxiliary vector. + char** envpi = environ; + while (*envpi) { + ++envpi; + } + + const Elf64_auxv_t* envp_auxv = + reinterpret_cast(envpi + 1); + int i; + for (i = 0; envp_auxv[i].a_type != AT_NULL; i++) { + auto a_type = envp_auxv[i].a_type; + EXPECT_EQ(proc_auxv.count(a_type), 1); + EXPECT_EQ(proc_auxv[a_type], envp_auxv[i].a_un.a_val) + << "a_type: " << a_type; + } + EXPECT_EQ(i, proc_auxv.size()); +} + +// Just open and read /proc/self/maps, check that we can find [stack] +TEST(ProcSelfMaps, Basic) { + auto proc_self_maps = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + + std::vector strings = absl::StrSplit(proc_self_maps, '\n'); + std::vector stacks; + // Make sure there's a stack in there. + for (const auto& str : strings) { + if (str.find("[stack]") != std::string::npos) { + stacks.push_back(str); + } + } + ASSERT_EQ(1, stacks.size()) << "[stack] not found in: " << proc_self_maps; + // Linux pads to 73 characters then we add 7. + EXPECT_EQ(80, stacks[0].length()); +} + +TEST(ProcSelfMaps, Map1) { + Mapping mapping = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_READ, MAP_PRIVATE)); + auto proc_self_maps = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + std::vector strings = absl::StrSplit(proc_self_maps, '\n'); + std::vector addrs; + // Make sure if is listed. + for (const auto& str : strings) { + if (str == AnonymousMapsEntryForMapping(mapping, PROT_READ)) { + addrs.push_back(str); + } + } + ASSERT_EQ(1, addrs.size()); +} + +TEST(ProcSelfMaps, Map2) { + // NOTE: The permissions must be different or the pages will get merged. + Mapping map1 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_EXEC, MAP_PRIVATE)); + Mapping map2 = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_WRITE, MAP_PRIVATE)); + + auto proc_self_maps = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + std::vector strings = absl::StrSplit(proc_self_maps, '\n'); + std::vector addrs; + // Make sure if is listed. 
+ for (const auto& str : strings) { + if (str == AnonymousMapsEntryForMapping(map1, PROT_READ | PROT_EXEC)) { + addrs.push_back(str); + } + } + ASSERT_EQ(1, addrs.size()); + addrs.clear(); + for (const auto& str : strings) { + if (str == AnonymousMapsEntryForMapping(map2, PROT_WRITE)) { + addrs.push_back(str); + } + } + ASSERT_EQ(1, addrs.size()); +} + +TEST(ProcSelfMaps, MapUnmap) { + Mapping map1 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_EXEC, MAP_PRIVATE)); + Mapping map2 = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_WRITE, MAP_PRIVATE)); + + auto proc_self_maps = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + std::vector strings = absl::StrSplit(proc_self_maps, '\n'); + std::vector addrs; + // Make sure if is listed. + for (const auto& str : strings) { + if (str == AnonymousMapsEntryForMapping(map1, PROT_READ | PROT_EXEC)) { + addrs.push_back(str); + } + } + ASSERT_EQ(1, addrs.size()) << proc_self_maps; + addrs.clear(); + for (const auto& str : strings) { + if (str == AnonymousMapsEntryForMapping(map2, PROT_WRITE)) { + addrs.push_back(str); + } + } + ASSERT_EQ(1, addrs.size()); + + map2.reset(); + + // Read it again. + proc_self_maps = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + strings = absl::StrSplit(proc_self_maps, '\n'); + // First entry should be there. + addrs.clear(); + for (const auto& str : strings) { + if (str == AnonymousMapsEntryForMapping(map1, PROT_READ | PROT_EXEC)) { + addrs.push_back(str); + } + } + ASSERT_EQ(1, addrs.size()); + addrs.clear(); + // But not the second. + for (const auto& str : strings) { + if (str == AnonymousMapsEntryForMapping(map2, PROT_WRITE)) { + addrs.push_back(str); + } + } + ASSERT_EQ(0, addrs.size()); +} + +TEST(ProcSelfMaps, Mprotect) { + if (!IsRunningOnGvisor()) { + // FIXME: Linux's mprotect() sometimes fails to merge VMAs in this + // case. + LOG(WARNING) << "Skipping test on Linux"; + return; + } + + // Reserve 5 pages of address space. + Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(5 * kPageSize, PROT_NONE, MAP_PRIVATE)); + + // Change the permissions on the middle 3 pages. (The first and last pages may + // be merged with other vmas on either side, so they aren't tested directly; + // they just ensure that the middle 3 pages are bracketed by VMAs with + // incompatible permissions.) + ASSERT_THAT(mprotect(reinterpret_cast(m.addr() + kPageSize), + 3 * kPageSize, PROT_READ), + SyscallSucceeds()); + + // Check that the middle 3 pages make up a single VMA. + auto proc_self_maps = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + std::vector strings = absl::StrSplit(proc_self_maps, '\n'); + EXPECT_THAT(strings, Contains(AnonymousMapsEntry(m.addr() + kPageSize, + 3 * kPageSize, PROT_READ))); + + // Change the permissions on the middle page only. + ASSERT_THAT(mprotect(reinterpret_cast(m.addr() + 2 * kPageSize), + kPageSize, PROT_READ | PROT_WRITE), + SyscallSucceeds()); + + // Check that the single VMA has been split into 3 VMAs. + proc_self_maps = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + strings = absl::StrSplit(proc_self_maps, '\n'); + EXPECT_THAT( + strings, + IsSupersetOf( + {AnonymousMapsEntry(m.addr() + kPageSize, kPageSize, PROT_READ), + AnonymousMapsEntry(m.addr() + 2 * kPageSize, kPageSize, + PROT_READ | PROT_WRITE), + AnonymousMapsEntry(m.addr() + 3 * kPageSize, kPageSize, + PROT_READ)})); + + // Change the permissions on the middle page back. 
+ ASSERT_THAT(mprotect(reinterpret_cast(m.addr() + 2 * kPageSize), + kPageSize, PROT_READ), + SyscallSucceeds()); + + // Check that the 3 VMAs have been merged back into a single VMA. + proc_self_maps = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + strings = absl::StrSplit(proc_self_maps, '\n'); + EXPECT_THAT(strings, Contains(AnonymousMapsEntry(m.addr() + kPageSize, + 3 * kPageSize, PROT_READ))); +} + +TEST(ProcSelfFd, OpenFd) { + int pipe_fds[2]; + ASSERT_THAT(pipe2(pipe_fds, O_CLOEXEC), SyscallSucceeds()); + + // Reopen the write end. + const std::string path = absl::StrCat("/proc/self/fd/", pipe_fds[1]); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_WRONLY)); + + // Ensure that a read/write works. + const std::string data = "hello"; + std::unique_ptr buffer(new char[data.size()]); + EXPECT_THAT(write(fd.get(), data.c_str(), data.size()), + SyscallSucceedsWithValue(5)); + EXPECT_THAT(read(pipe_fds[0], buffer.get(), data.size()), + SyscallSucceedsWithValue(5)); + EXPECT_EQ(strncmp(buffer.get(), data.c_str(), data.size()), 0); + + // Cleanup. + ASSERT_THAT(close(pipe_fds[0]), SyscallSucceeds()); + ASSERT_THAT(close(pipe_fds[1]), SyscallSucceeds()); +} + +TEST(ProcSelfFdInfo, CorrectFds) { + // Make sure there is at least one open file. + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + // Get files in /proc/self/fd. + auto fd_files = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/fd", false)); + + // Get files in /proc/self/fdinfo. + auto fdinfo_files = + ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/fdinfo", false)); + + // They should contain the same fds. + EXPECT_THAT(fd_files, UnorderedElementsAreArray(fdinfo_files)); + + // Both should contain fd. + auto fd_s = absl::StrCat(fd.get()); + EXPECT_THAT(fd_files, Contains(fd_s)); +} + +TEST(ProcSelfFdInfo, Flags) { + std::string path = NewTempAbsPath(); + + // Create file here with O_CREAT to test that O_CREAT does not appear in + // fdinfo flags. + int flags = O_CREAT | O_RDWR | O_APPEND | O_CLOEXEC; + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, flags, 0644)); + + // Automatically delete path. + TempPath temp_path(path); + + // O_CREAT does not appear in fdinfo flags. + flags &= ~O_CREAT; + + // O_LARGEFILE always appears (on x86_64). + flags |= kOLargeFile; + + auto fd_info = ASSERT_NO_ERRNO_AND_VALUE( + GetContents(absl::StrCat("/proc/self/fdinfo/", fd.get()))); + EXPECT_THAT(fd_info, HasSubstr(absl::StrFormat("flags:\t%#o", flags))); +} + +TEST(ProcSelfExe, Absolute) { + auto exe = ASSERT_NO_ERRNO_AND_VALUE( + ReadLink(absl::StrCat("/proc/", getpid(), "/exe"))); + EXPECT_EQ(exe[0], '/'); +} + +// Sanity check for /proc/cpuinfo fields that must be present. +TEST(ProcCpuinfo, RequiredFieldsArePresent) { + std::string proc_cpuinfo = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/cpuinfo")); + ASSERT_FALSE(proc_cpuinfo.empty()); + std::vector cpuinfo_fields = absl::StrSplit(proc_cpuinfo, '\n'); + + // This list of "required" fields is taken from reading the file + // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally + // printed by the kernel. 
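The fdinfo checks above and the cpuinfo check below both parse small /proc text files. For reference, a minimal sketch that dumps /proc/self/fdinfo/<fd> for a freshly opened descriptor (pos, flags and, on newer kernels, mnt_id); added as an illustration, not part of the patch:

#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>

int main() {
  int fd = open("/dev/null", O_WRONLY | O_APPEND | O_CLOEXEC);
  if (fd < 0) {
    perror("open");
    return 1;
  }
  // The flags field is printed in octal and includes O_LARGEFILE on x86_64.
  std::ifstream info("/proc/self/fdinfo/" + std::to_string(fd));
  std::string line;
  while (std::getline(info, line)) std::cout << line << "\n";
  close(fd);
  return 0;
}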
+ static const char* required_fields[] = { + "processor", + "vendor_id", + "cpu family", + "model\t\t:", + "model name", + "stepping", + "cpu MHz", + "fpu\t\t:", + "fpu_exception", + "cpuid level", + "wp", + "bogomips", + "clflush size", + "cache_alignment", + "address sizes", + "power management", + }; + + // Check that the usual fields are there. We don't really care about the + // contents. + for (const std::string& field : required_fields) { + EXPECT_THAT(proc_cpuinfo, HasSubstr(field)); + } +} + +// Sanity checks that uptime is present. +TEST(ProcUptime, IsPresent) { + std::string proc_uptime = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/uptime")); + ASSERT_FALSE(proc_uptime.empty()); + std::vector uptime_parts = absl::StrSplit(proc_uptime, ' '); + + // Parse once. + double uptime0, uptime1, idletime0, idletime1; + ASSERT_TRUE(absl::SimpleAtod(uptime_parts[0], &uptime0)); + ASSERT_TRUE(absl::SimpleAtod(uptime_parts[1], &idletime0)); + + // Sleep for one second. + absl::SleepFor(absl::Seconds(1)); + + // Parse again. + proc_uptime = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/uptime")); + ASSERT_FALSE(proc_uptime.empty()); + uptime_parts = absl::StrSplit(proc_uptime, ' '); + ASSERT_TRUE(absl::SimpleAtod(uptime_parts[0], &uptime1)); + ASSERT_TRUE(absl::SimpleAtod(uptime_parts[1], &idletime1)); + + // Sanity check. + // + // We assert that between 0.99 and 59.99 seconds have passed. If more than a + // minute has passed, then we must be executing really, really slowly. + EXPECT_GE(uptime0, 0.0); + EXPECT_GE(idletime0, 0.0); + EXPECT_GT(uptime1, uptime0); + EXPECT_GE(uptime1, uptime0 + 0.99); + EXPECT_LE(uptime1, uptime0 + 59.99); + EXPECT_GE(idletime1, idletime0); +} + +TEST(ProcMeminfo, ContainsBasicFields) { + std::string proc_meminfo = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/meminfo")); + EXPECT_THAT(proc_meminfo, AllOf(ContainsRegex(R"(MemTotal:\s+[0-9]+ kB)"), + ContainsRegex(R"(MemFree:\s+[0-9]+ kB)"))); +} + +TEST(ProcStat, ContainsBasicFields) { + std::string proc_stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat")); + + std::vector names; + for (auto const& line : absl::StrSplit(proc_stat, '\n')) { + std::vector fields = + absl::StrSplit(line, ' ', absl::SkipWhitespace()); + if (fields.empty()) { + continue; + } + names.push_back(fields[0]); + } + + EXPECT_THAT(names, + IsSupersetOf({"cpu", "intr", "ctxt", "btime", "processes", + "procs_running", "procs_blocked", "softirq"})); +} + +TEST(ProcStat, EndsWithNewline) { + std::string proc_stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat")); + EXPECT_EQ(proc_stat.back(), '\n'); +} + +TEST(ProcStat, Fields) { + std::string proc_stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat")); + + std::vector names; + for (auto const& line : absl::StrSplit(proc_stat, '\n')) { + std::vector fields = + absl::StrSplit(line, ' ', absl::SkipWhitespace()); + if (fields.empty()) { + continue; + } + + if (absl::StartsWith(fields[0], "cpu")) { + // As of Linux 3.11, each CPU entry has 10 fields, plus the name. + EXPECT_GE(fields.size(), 11) << proc_stat; + } else if (fields[0] == "ctxt") { + // Single field. + EXPECT_EQ(fields.size(), 2) << proc_stat; + } else if (fields[0] == "btime") { + // Single field. + EXPECT_EQ(fields.size(), 2) << proc_stat; + } else if (fields[0] == "itime") { + // Single field. + ASSERT_EQ(fields.size(), 2) << proc_stat; + // This is the only floating point field. 
+ double val; + EXPECT_TRUE(absl::SimpleAtod(fields[1], &val)) << proc_stat; + continue; + } else if (fields[0] == "processes") { + // Single field. + EXPECT_EQ(fields.size(), 2) << proc_stat; + } else if (fields[0] == "procs_running") { + // Single field. + EXPECT_EQ(fields.size(), 2) << proc_stat; + } else if (fields[0] == "procs_blocked") { + // Single field. + EXPECT_EQ(fields.size(), 2) << proc_stat; + } else if (fields[0] == "softirq") { + // As of Linux 3.11, there are 10 softirqs. 12 fields for name + total. + EXPECT_GE(fields.size(), 12) << proc_stat; + } + + // All fields besides itime are valid base 10 numbers. + for (size_t i = 1; i < fields.size(); i++) { + uint64_t val; + EXPECT_TRUE(absl::SimpleAtoi(fields[i], &val)) << proc_stat; + } + } +} + +TEST(ProcLoadavg, EndsWithNewline) { + std::string proc_loadvg = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg")); + EXPECT_EQ(proc_loadvg.back(), '\n'); +} + +TEST(ProcLoadavg, Fields) { + std::string proc_loadvg = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg")); + std::vector lines = absl::StrSplit(proc_loadvg, '\n'); + + // Single line. + EXPECT_EQ(lines.size(), 2) << proc_loadvg; + + std::vector fields = + absl::StrSplit(lines[0], absl::ByAnyChar(" /"), absl::SkipWhitespace()); + + // Six fields. + EXPECT_EQ(fields.size(), 6) << proc_loadvg; + + double val; + uint64_t val2; + // First three fields are floating point numbers. + EXPECT_TRUE(absl::SimpleAtod(fields[0], &val)) << proc_loadvg; + EXPECT_TRUE(absl::SimpleAtod(fields[1], &val)) << proc_loadvg; + EXPECT_TRUE(absl::SimpleAtod(fields[2], &val)) << proc_loadvg; + // Rest of the fields are valid base 10 numbers. + EXPECT_TRUE(absl::SimpleAtoi(fields[3], &val2)) << proc_loadvg; + EXPECT_TRUE(absl::SimpleAtoi(fields[4], &val2)) << proc_loadvg; + EXPECT_TRUE(absl::SimpleAtoi(fields[5], &val2)) << proc_loadvg; +} + +// NOTE: Tests in priority.cc also check certain priority related fields in +// /proc/self/stat. + +class ProcPidStatTest : public ::testing::TestWithParam {}; + +TEST_P(ProcPidStatTest, HasBasicFields) { + std::string proc_pid_stat = ASSERT_NO_ERRNO_AND_VALUE( + GetContents(absl::StrCat("/proc/", GetParam(), "/stat"))); + + ASSERT_FALSE(proc_pid_stat.empty()); + std::vector fields = absl::StrSplit(proc_pid_stat, ' '); + ASSERT_GE(fields.size(), 24); + EXPECT_EQ(absl::StrCat(getpid()), fields[0]); + // fields[1] is the thread name. 
+ EXPECT_EQ("R", fields[2]); // task state + EXPECT_EQ(absl::StrCat(getppid()), fields[3]); + + uint64_t vss; + ASSERT_TRUE(absl::SimpleAtoi(fields[22], &vss)); + EXPECT_GT(vss, 0); + + uint64_t rss; + ASSERT_TRUE(absl::SimpleAtoi(fields[23], &rss)); + EXPECT_GT(rss, 0); +} + +INSTANTIATE_TEST_CASE_P(SelfAndNumericPid, ProcPidStatTest, + ::testing::Values("self", absl::StrCat(getpid()))); + +using ProcPidStatmTest = ::testing::TestWithParam; + +TEST_P(ProcPidStatmTest, HasBasicFields) { + std::string proc_pid_statm = ASSERT_NO_ERRNO_AND_VALUE( + GetContents(absl::StrCat("/proc/", GetParam(), "/statm"))); + ASSERT_FALSE(proc_pid_statm.empty()); + std::vector fields = absl::StrSplit(proc_pid_statm, ' '); + ASSERT_GE(fields.size(), 7); + + uint64_t vss; + ASSERT_TRUE(absl::SimpleAtoi(fields[0], &vss)); + EXPECT_GT(vss, 0); + + uint64_t rss; + ASSERT_TRUE(absl::SimpleAtoi(fields[1], &rss)); + EXPECT_GT(rss, 0); +} + +INSTANTIATE_TEST_CASE_P(SelfAndNumericPid, ProcPidStatmTest, + ::testing::Values("self", absl::StrCat(getpid()))); + +PosixErrorOr CurrentRSS() { + ASSIGN_OR_RETURN_ERRNO(auto proc_self_stat, GetContents("/proc/self/stat")); + if (proc_self_stat.empty()) { + return PosixError(EINVAL, "empty /proc/self/stat"); + } + + std::vector fields = absl::StrSplit(proc_self_stat, ' '); + if (fields.size() < 24) { + return PosixError( + EINVAL, + absl::StrCat("/proc/self/stat has too few fields: ", proc_self_stat)); + } + + uint64_t rss; + if (!absl::SimpleAtoi(fields[23], &rss)) { + return PosixError( + EINVAL, absl::StrCat("/proc/self/stat RSS field is not a number: ", + fields[23])); + } + + // RSS is given in number of pages. + return rss * kPageSize; +} + +// The size of mapping created by MapPopulateRSS. +constexpr uint64_t kMappingSize = 100 << 20; + +// Tolerance on RSS comparisons to account for background thread mappings, +// reclaimed pages, newly faulted pages, etc. +constexpr uint64_t kRSSTolerance = 5 << 20; + +// Capture RSS before and after an anonymous mapping with passed prot. +void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) { + *before = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS()); + + // N.B. The kernel asynchronously accumulates per-task RSS counters into the + // mm RSS, which is exposed by /proc/PID/stat. Task exit is a synchronization + // point (kernel/exit.c:do_exit -> sync_mm_rss), so perform the mapping on + // another thread to ensure it is reflected in RSS after the thread exits. + Mapping mapping; + ScopedThread t([&mapping, prot] { + mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kMappingSize, prot, MAP_PRIVATE | MAP_POPULATE)); + }); + t.Join(); + + *after = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS()); +} + +// TODO: Test for PROT_READ + MAP_POPULATE anonymous mappings. Their +// semantics are more subtle: +// +// Small pages -> Zero page mapped, not counted in RSS +// (mm/memory.c:do_anonymous_page). +// +// Huge pages (THP enabled, use_zero_page=0) -> Pages committed +// (mm/memory.c:__handle_mm_fault -> create_huge_pmd). +// +// Huge pages (THP enabled, use_zero_page=1) -> Zero page mapped, not counted in +// RSS (mm/huge_memory.c:do_huge_pmd_anonymous_page). + +// PROT_WRITE + MAP_POPULATE anonymous mappings are always committed. +TEST(ProcSelfStat, PopulateWriteRSS) { + uint64_t before, after; + MapPopulateRSS(PROT_READ | PROT_WRITE, &before, &after); + + // Mapping is committed. + EXPECT_NEAR(before + kMappingSize, after, kRSSTolerance); +} + +// PROT_NONE + MAP_POPULATE anonymous mappings are never committed. 
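The RSS plumbing above reads the 24th field of /proc/self/stat and multiplies by the page size. An equivalent, slightly easier-to-parse source is /proc/self/statm, whose second field is the resident set in pages; a small sketch for illustration only, before the PROT_NONE case that follows:

#include <unistd.h>
#include <cstdio>
#include <fstream>

int main() {
  // /proc/self/statm: size resident shared text lib data dt (all in pages).
  std::ifstream statm("/proc/self/statm");
  unsigned long total_pages = 0, resident_pages = 0;
  if (!(statm >> total_pages >> resident_pages)) {
    std::fprintf(stderr, "failed to parse /proc/self/statm\n");
    return 1;
  }
  long page_size = sysconf(_SC_PAGESIZE);
  std::printf("RSS: %lu bytes\n", resident_pages * (unsigned long)page_size);
  return 0;
}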
+TEST(ProcSelfStat, PopulateNoneRSS) { + uint64_t before, after; + MapPopulateRSS(PROT_NONE, &before, &after); + + // Mapping not committed. + EXPECT_NEAR(before, after, kRSSTolerance); +} + +// Returns the calling thread's name. +PosixErrorOr ThreadName() { + // "The buffer should allow space for up to 16 bytes; the returned std::string + // will be null-terminated if it is shorter than that." - prctl(2). But we + // always want the thread name to be null-terminated. + char thread_name[17]; + int rc = prctl(PR_GET_NAME, thread_name, 0, 0, 0); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "prctl(PR_GET_NAME)"); + } + thread_name[16] = '\0'; + return std::string(thread_name); +} + +// Parses the contents of a /proc/[pid]/status file into a collection of +// key-value pairs. +PosixErrorOr> ParseProcStatus( + absl::string_view status_str) { + std::map fields; + for (absl::string_view const line : + absl::StrSplit(status_str, '\n', absl::SkipWhitespace())) { + const std::pair kv = + absl::StrSplit(line, absl::MaxSplits(":\t", 1)); + if (kv.first.empty()) { + return PosixError( + EINVAL, absl::StrCat("failed to parse key in line \"", line, "\"")); + } + std::string key(kv.first); + if (fields.count(key)) { + return PosixError(EINVAL, + absl::StrCat("duplicate key \"", kv.first, "\"")); + } + std::string value(kv.second); + absl::StripLeadingAsciiWhitespace(&value); + fields.emplace(std::move(key), std::move(value)); + } + return fields; +} + +TEST(ParseProcStatusTest, ParsesSimpleStatusFileWithMixedWhitespaceCorrectly) { + EXPECT_THAT( + ParseProcStatus( + "Name:\tinit\nState:\tS (sleeping)\nCapEff:\t 0000001fffffffff\n"), + IsPosixErrorOkAndHolds(UnorderedElementsAre( + Pair("Name", "init"), Pair("State", "S (sleeping)"), + Pair("CapEff", "0000001fffffffff")))); +} + +TEST(ParseProcStatusTest, DetectsDuplicateKeys) { + auto proc_status_or = ParseProcStatus("Name:\tfoo\nName:\tfoo\n"); + EXPECT_THAT(proc_status_or, + PosixErrorIs(EINVAL, ::testing::StrEq("duplicate key \"Name\""))); +} + +TEST(ParseProcStatusTest, DetectsMissingTabs) { + EXPECT_THAT(ParseProcStatus("Name:foo\nPid: 1\n"), + IsPosixErrorOkAndHolds(UnorderedElementsAre(Pair("Name:foo", ""), + Pair("Pid: 1", "")))); +} + +TEST(ProcPidStatusTest, HasBasicFields) { + // Do this on a separate thread since we want tgid != tid. + ScopedThread([] { + const pid_t tgid = getpid(); + const pid_t tid = syscall(SYS_gettid); + EXPECT_NE(tgid, tid); + const auto thread_name = ASSERT_NO_ERRNO_AND_VALUE(ThreadName()); + + std::string status_str = ASSERT_NO_ERRNO_AND_VALUE( + GetContents(absl::StrCat("/proc/", tid, "/status"))); + + ASSERT_FALSE(status_str.empty()); + const auto status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(status_str)); + EXPECT_THAT(status, IsSupersetOf({Pair("Name", thread_name), + Pair("Tgid", absl::StrCat(tgid)), + Pair("Pid", absl::StrCat(tid)), + Pair("PPid", absl::StrCat(getppid()))})); + }); +} + +TEST(ProcPidStatusTest, StateRunning) { + // Task must be running when reading the file. + const pid_t tid = syscall(SYS_gettid); + std::string status_str = ASSERT_NO_ERRNO_AND_VALUE( + GetContents(absl::StrCat("/proc/", tid, "/status"))); + + EXPECT_THAT(ParseProcStatus(status_str), + IsPosixErrorOkAndHolds(Contains(Pair("State", "R (running)")))); +} + +TEST(ProcPidStatusTest, StateSleeping_NoRandomSave) { + // Starts a child process that blocks and checks that State is sleeping. 
+ auto res = WithSubprocess( + [&](int pid) -> PosixError { + // Because this test is timing based we will disable cooperative saving + // and the test itself also has random saving disabled. + const DisableSave ds; + // Try multiple times in case the child isn't sleeping when status file + // is read. + MonotonicTimer timer; + timer.Start(); + for (;;) { + ASSIGN_OR_RETURN_ERRNO( + std::string status_str, + GetContents(absl::StrCat("/proc/", pid, "/status"))); + ASSIGN_OR_RETURN_ERRNO(auto map, ParseProcStatus(status_str)); + if (map["State"] == std::string("S (sleeping)")) { + // Test passed! + return NoError(); + } + if (timer.Duration() > absl::Seconds(10)) { + return PosixError(ETIMEDOUT, "Timeout waiting for child to sleep"); + } + absl::SleepFor(absl::Milliseconds(10)); + } + }, + nullptr, nullptr); + ASSERT_NO_ERRNO(res); +} + +TEST(ProcPidStatusTest, ValuesAreTabDelimited) { + std::string status_str = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/status")); + ASSERT_FALSE(status_str.empty()); + for (absl::string_view const line : + absl::StrSplit(status_str, '\n', absl::SkipWhitespace())) { + EXPECT_NE(std::string::npos, line.find(":\t")); + } +} + +// Threads properly counts running threads. +// +// TODO: Test zombied threads while the thread group leader is still +// running with generalized fork and clone children from the wait test. +TEST(ProcPidStatusTest, Threads) { + char buf[4096] = {}; + EXPECT_THAT(ReadWhileRunning("status", buf, sizeof(buf) - 1), + SyscallSucceedsWithValue(Gt(0))); + + auto status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(buf)); + auto it = status.find("Threads"); + ASSERT_NE(it, status.end()); + int threads = -1; + EXPECT_TRUE(absl::SimpleAtoi(it->second, &threads)) + << "Threads value " << it->second << " is not a number"; + // Don't make assumptions about the exact number of threads, as it may not be + // constant. + EXPECT_GE(threads, 1); + + memset(buf, 0, sizeof(buf)); + EXPECT_THAT(ReadWhileZombied("status", buf, sizeof(buf) - 1), + SyscallSucceedsWithValue(Gt(0))); + + status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(buf)); + it = status.find("Threads"); + ASSERT_NE(it, status.end()); + threads = -1; + EXPECT_TRUE(absl::SimpleAtoi(it->second, &threads)) + << "Threads value " << it->second << " is not a number"; + // There must be only the thread group leader remaining, zombied. + EXPECT_EQ(threads, 1); +} + +// Returns true if all characters in s are digits. +bool IsDigits(absl::string_view s) { + return std::all_of(s.begin(), s.end(), absl::ascii_isdigit); +} + +TEST(ProcPidStatTest, VSSRSS) { + std::string status_str = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/status")); + ASSERT_FALSE(status_str.empty()); + auto status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(status_str)); + + const auto vss_it = status.find("VmSize"); + ASSERT_NE(vss_it, status.end()); + + absl::string_view vss_str(vss_it->second); + + // Room for the " kB" suffix plus at least one digit. + ASSERT_GT(vss_str.length(), 3); + EXPECT_TRUE(absl::EndsWith(vss_str, " kB")); + // Everything else is part of a number. + EXPECT_TRUE(IsDigits(vss_str.substr(0, vss_str.length() - 3))) << vss_str; + // ... which is not 0. + EXPECT_NE('0', vss_str[0]); + + const auto rss_it = status.find("VmRSS"); + ASSERT_NE(rss_it, status.end()); + + absl::string_view rss_str(rss_it->second); + + // Room for the " kB" suffix plus at least one digit. + ASSERT_GT(rss_str.length(), 3); + EXPECT_TRUE(absl::EndsWith(rss_str, " kB")); + // Everything else is part of a number. 
+ EXPECT_TRUE(IsDigits(rss_str.substr(0, rss_str.length() - 3))) << rss_str; + // ... which is not 0. + EXPECT_NE('0', rss_str[0]); +} + +// Parse an array of NUL-terminated char* arrays, returning a vector of strings. +std::vector ParseNulTerminatedStrings(std::string contents) { + EXPECT_EQ('\0', contents.back()); + // The split will leave an empty std::string if the NUL-byte remains, so pop it. + contents.pop_back(); + + return absl::StrSplit(contents, '\0'); +} + +TEST(ProcPidCmdline, MatchesArgv) { + std::vector proc_cmdline = ParseNulTerminatedStrings( + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/cmdline"))); + EXPECT_THAT(saved_argv, ContainerEq(proc_cmdline)); +} + +TEST(ProcPidEnviron, MatchesEnviron) { + std::vector proc_environ = ParseNulTerminatedStrings( + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/environ"))); + // Get the environment from the environ variable, which we will compare with + // /proc/self/environ. + std::vector env; + for (char** v = environ; *v; v++) { + env.push_back(*v); + } + EXPECT_THAT(env, ContainerEq(proc_environ)); +} + +TEST(ProcPidCmdline, SubprocessForkSameCmdline) { + std::vector proc_cmdline_parent; + std::vector proc_cmdline; + proc_cmdline_parent = ParseNulTerminatedStrings( + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/cmdline"))); + auto res = WithSubprocess( + [&](int pid) -> PosixError { + ASSIGN_OR_RETURN_ERRNO( + auto raw_cmdline, + GetContents(absl::StrCat("/proc/", pid, "/cmdline"))); + proc_cmdline = ParseNulTerminatedStrings(raw_cmdline); + return NoError(); + }, + nullptr, nullptr); + ASSERT_NO_ERRNO(res); + + for (size_t i = 0; i < proc_cmdline_parent.size(); i++) { + EXPECT_EQ(proc_cmdline_parent[i], proc_cmdline[i]); + } +} + +// Test whether /proc/PID/ symlinks can be read for a running process. +TEST(ProcPidSymlink, SubprocessRunning) { + char buf[1]; + + EXPECT_THAT(ReadlinkWhileRunning("exe", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadlinkWhileRunning("ns/net", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadlinkWhileRunning("ns/pid", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadlinkWhileRunning("ns/user", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); +} +// FIXME: Inconsistent behavior between gVisor and linux +// on proc files. +TEST(ProcPidSymlink, SubprocessZombied) { + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + char buf[1]; + + int want = EACCES; + if (!IsRunningOnGvisor()) { + auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); + if (version.major == 4 && version.minor > 3) { + want = ENOENT; + } + } + + EXPECT_THAT(ReadlinkWhileZombied("exe", buf, sizeof(buf)), + SyscallFailsWithErrno(want)); + + if (!IsRunningOnGvisor()) { + EXPECT_THAT(ReadlinkWhileZombied("ns/net", buf, sizeof(buf)), + SyscallFailsWithErrno(want)); + } + + // FIXME: Inconsistent behavior between gVisor and linux + // on proc files. + // 4.17 & gVisor: Syscall succeeds and returns 1 + // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)), + // SyscallFailsWithErrno(EACCES)); + + // FIXME: Inconsistent behavior between gVisor and linux + // on proc files. + // 4.17 & gVisor: Syscall succeeds and returns 1. + // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)), + // SyscallFailsWithErrno(EACCES)); +} + +// Test whether /proc/PID/ symlinks can be read for an exited process. 
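The cmdline/environ tests above rely on those proc files being NUL-delimited with a trailing NUL, which is why ParseNulTerminatedStrings() pops the last byte before splitting. A compact sketch of the same parsing outside the test harness (illustration only); the exited-process symlink test introduced by the comment above continues below:

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::ifstream f("/proc/self/cmdline", std::ios::binary);
  std::stringstream ss;
  ss << f.rdbuf();
  std::string contents = ss.str();
  // Drop the trailing NUL so the split below does not produce an empty entry.
  if (!contents.empty() && contents.back() == '\0') contents.pop_back();
  std::vector<std::string> args;
  std::string current;
  for (char c : contents) {
    if (c == '\0') {
      args.push_back(current);
      current.clear();
    } else {
      current.push_back(c);
    }
  }
  if (!current.empty()) args.push_back(current);
  for (const auto& arg : args) std::cout << arg << "\n";
  return 0;
}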
+TEST(ProcPidSymlink, SubprocessExited) { + // FIXME: These all succeed on gVisor. + SKIP_IF(IsRunningOnGvisor()); + + char buf[1]; + + EXPECT_THAT(ReadlinkWhileExited("exe", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); + + EXPECT_THAT(ReadlinkWhileExited("ns/net", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); + + EXPECT_THAT(ReadlinkWhileExited("ns/pid", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); + + EXPECT_THAT(ReadlinkWhileExited("ns/user", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); +} + +// /proc/PID/exe points to the correct binary. +TEST(ProcPidExe, Subprocess) { + auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/exe")); + auto expected_absolute_path = + ASSERT_NO_ERRNO_AND_VALUE(MakeAbsolute(link, "")); + + char actual[PATH_MAX + 1] = {}; + ASSERT_THAT(ReadlinkWhileRunning("exe", actual, sizeof(actual)), + SyscallSucceedsWithValue(Gt(0))); + EXPECT_EQ(actual, expected_absolute_path); +} + +// Test whether /proc/PID/ files can be read for a running process. +TEST(ProcPidFile, SubprocessRunning) { + char buf[1]; + + EXPECT_THAT(ReadWhileRunning("auxv", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("cmdline", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("comm", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("gid_map", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("io", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("maps", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("stat", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("status", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); +} + +// Test whether /proc/PID/ files can be read for a zombie process. +TEST(ProcPidFile, SubprocessZombie) { + char buf[1]; + // 4.17: Succeeds and returns 1 + // gVisor: Succeds and returns 0 + EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds()); + + EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)), + SyscallSucceedsWithValue(0)); + + EXPECT_THAT(ReadWhileZombied("comm", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileZombied("gid_map", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileZombied("maps", buf, sizeof(buf)), + SyscallSucceedsWithValue(0)); + + EXPECT_THAT(ReadWhileZombied("stat", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileZombied("status", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // FIXME: Inconsistent behavior between gVisor and linux + // on proc files. + // gVisor & 4.17: Succeeds and returns 1. + // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)), + // SyscallFailsWithErrno(EACCES)); +} + +// Test whether /proc/PID/ files can be read for an exited process. +TEST(ProcPidFile, SubprocessExited) { + char buf[1]; + + // FIXME: Inconsistent behavior between kernels + // gVisor: Fails with ESRCH. + // 4.17: Succeeds and returns 1. 
+  // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)),
+  //             SyscallFailsWithErrno(ESRCH));
+
+  EXPECT_THAT(ReadWhileExited("cmdline", buf, sizeof(buf)),
+              SyscallFailsWithErrno(ESRCH));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("comm", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  EXPECT_THAT(ReadWhileExited("gid_map", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("io", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: Returns EOF on gVisor.
+    EXPECT_THAT(ReadWhileExited("maps", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("stat", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("status", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+PosixError DirContainsImpl(absl::string_view path,
+                           const std::vector<std::string>& targets, bool strict) {
+  ASSIGN_OR_RETURN_ERRNO(auto listing, ListDir(path, false));
+  bool success = true;
+
+  for (auto& expected_entry : targets) {
+    auto cursor = std::find(listing.begin(), listing.end(), expected_entry);
+    if (cursor == listing.end()) {
+      success = false;
+    }
+  }
+
+  if (!success) {
+    return PosixError(
+        ENOENT,
+        absl::StrCat("Failed to find one or more paths in '", path, "'"));
+  }
+
+  if (strict) {
+    if (targets.size() != listing.size()) {
+      return PosixError(
+          EINVAL,
+          absl::StrCat("Expected to find ", targets.size(), " elements in '",
+                       path, "', but found ", listing.size()));
+    }
+  }
+
+  return NoError();
+}
+
+PosixError DirContains(absl::string_view path,
+                       const std::vector<std::string>& targets) {
+  return DirContainsImpl(path, targets, false);
+}
+
+PosixError DirContainsExactly(absl::string_view path,
+                              const std::vector<std::string>& targets) {
+  return DirContainsImpl(path, targets, true);
+}
+
+PosixError EventuallyDirContainsExactly(absl::string_view path,
+                                        const std::vector<std::string>& targets) {
+  constexpr int kRetryCount = 100;
+  const absl::Duration kRetryDelay = absl::Milliseconds(100);
+
+  for (int i = 0; i < kRetryCount; ++i) {
+    auto res = DirContainsExactly(path, targets);
+    if (res.ok()) {
+      return res;
+    } else if (i < kRetryCount - 1) {
+      // Sleep if this isn't the final iteration.
+      absl::SleepFor(kRetryDelay);
+    }
+  }
+  return PosixError(ETIMEDOUT,
+                    "Timed out while waiting for directory to contain files ");
+}
+
+TEST(ProcTask, Basic) {
+  EXPECT_NO_ERRNO(
+      DirContains("/proc/self/task", {".", "..", absl::StrCat(getpid())}));
+}
+
+std::vector<std::string> TaskFiles(const std::vector<std::string>& initial_contents,
+                                   const std::vector<pid_t>& pids) {
+  return VecCat<std::string>(
+      initial_contents,
+      ApplyVec<std::string>([](const pid_t p) { return absl::StrCat(p); }, pids));
+}
+
+std::vector<std::string> TaskFiles(const std::vector<pid_t>& pids) {
+  return TaskFiles({".", "..", absl::StrCat(getpid())}, pids);
+}
+
+// Helper class for creating a new task in the current thread group.
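+//
+// The thread started by the constructor records its TID and then blocks until
+// Join() (or the destructor) runs, so the new task stays visible under
+// /proc/<pid>/task for the lifetime of the object. A minimal usage sketch:
+//
+//   BlockingChild child;
+//   EXPECT_NO_ERRNO(DirContains("/proc/self/task",
+//                               {absl::StrCat(child.Tid())}));
+//   child.Join();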
+class BlockingChild { + public: + BlockingChild() : thread_([=] { Start(); }) {} + ~BlockingChild() { Join(); } + + pid_t Tid() const { + absl::MutexLock ml(&mu_); + mu_.Await(absl::Condition(&tid_ready_)); + return tid_; + } + + void Join() { Stop(); } + + private: + void Start() { + absl::MutexLock ml(&mu_); + tid_ = syscall(__NR_gettid); + tid_ready_ = true; + mu_.Await(absl::Condition(&stop_)); + } + + void Stop() { + absl::MutexLock ml(&mu_); + stop_ = true; + } + + mutable absl::Mutex mu_; + bool stop_ GUARDED_BY(mu_) = false; + pid_t tid_; + bool tid_ready_ GUARDED_BY(mu_) = false; + + // Must be last to ensure that the destructor for the thread is run before + // any other member of the object is destroyed. + ScopedThread thread_; +}; + +TEST(ProcTask, NewThreadAppears) { + auto initial = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/task", false)); + BlockingChild child1; + EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task", + TaskFiles(initial, {child1.Tid()}))); +} + +TEST(ProcTask, KilledThreadsDisappear) { + auto initial = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/task/", false)); + + BlockingChild child1; + EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task", + TaskFiles(initial, {child1.Tid()}))); + + // Stat child1's task file. + struct stat statbuf; + const std::string child1_task_file = + absl::StrCat("/proc/self/task/", child1.Tid()); + EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf), SyscallSucceeds()); + + BlockingChild child2; + EXPECT_NO_ERRNO(DirContainsExactly( + "/proc/self/task", TaskFiles(initial, {child1.Tid(), child2.Tid()}))); + + BlockingChild child3; + BlockingChild child4; + BlockingChild child5; + EXPECT_NO_ERRNO(DirContainsExactly( + "/proc/self/task", + TaskFiles(initial, {child1.Tid(), child2.Tid(), child3.Tid(), + child4.Tid(), child5.Tid()}))); + + child2.Join(); + EXPECT_NO_ERRNO(EventuallyDirContainsExactly( + "/proc/self/task", TaskFiles(initial, {child1.Tid(), child3.Tid(), + child4.Tid(), child5.Tid()}))); + + child1.Join(); + child4.Join(); + EXPECT_NO_ERRNO(EventuallyDirContainsExactly( + "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()}))); + + // Stat child1's task file again. This time it should fail. + EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf), + SyscallFailsWithErrno(ENOENT)); + + child3.Join(); + child5.Join(); + EXPECT_NO_ERRNO(EventuallyDirContainsExactly("/proc/self/task", initial)); +} + +TEST(ProcTask, ChildTaskDir) { + BlockingChild child1; + EXPECT_NO_ERRNO(DirContains("/proc/self/task", TaskFiles({child1.Tid()}))); + EXPECT_NO_ERRNO(DirContains(absl::StrCat("/proc/", child1.Tid(), "/task"), + TaskFiles({child1.Tid()}))); +} + +PosixError VerifyPidDir(std::string path) { + return DirContains(path, {"exe", "fd", "io", "maps", "ns", "stat", "status"}); +} + +TEST(ProcTask, VerifyTaskDir) { + EXPECT_NO_ERRNO(VerifyPidDir("/proc/self")); + + EXPECT_NO_ERRNO(VerifyPidDir(absl::StrCat("/proc/self/task/", getpid()))); + BlockingChild child1; + EXPECT_NO_ERRNO(VerifyPidDir(absl::StrCat("/proc/self/task/", child1.Tid()))); + + // Only the first level of task directories should contain the 'task' + // directory. That is: + // + // /proc/1234/task <- should exist + // /proc/1234/task/1234/task <- should not exist + // /proc/1234/task/1235/task <- should not exist (where 1235 is in the same + // thread group as 1234). 
+ EXPECT_FALSE( + DirContains(absl::StrCat("/proc/self/task/", getpid()), {"task"}).ok()) + << "Found 'task' directory in an inner directory."; +} + +TEST(ProcTask, TaskDirCannotBeDeleted) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + + EXPECT_THAT(rmdir("/proc/self/task"), SyscallFails()); + EXPECT_THAT(rmdir(absl::StrCat("/proc/self/task/", getpid()).c_str()), + SyscallFailsWithErrno(EACCES)); +} + +TEST(ProcTask, TaskDirHasCorrectMetadata) { + struct stat st; + EXPECT_THAT(stat("/proc/self/task", &st), SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + + // Verify file is readable and executable by everyone. + mode_t expected_permissions = + S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + mode_t permissions = st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + EXPECT_EQ(expected_permissions, permissions); +} + +TEST(ProcTask, TaskDirCanSeekToEnd) { + const FileDescriptor dirfd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/task", O_RDONLY)); + EXPECT_THAT(lseek(dirfd.get(), 0, SEEK_END), SyscallSucceeds()); +} + +TEST(ProcTask, VerifyTaskDirNlinks) { + // A task directory will have 3 links if the taskgroup has a single + // thread. For example, the following shows where the links to + // '/proc/12345/task comes' from for a single threaded process with pid 12345: + // + // /proc/12345/task <-- 1 link for the directory itself + // . <-- link from "." + // .. + // 12345 + // . + // .. <-- link from ".." to parent. + // + // + // We can't assert an absolute number of links since we don't control how many + // threads the test framework spawns. Instead, we'll ensure creating a new + // thread increases the number of links as expected. + + // Once we reach the test body, we can count on the thread count being stable + // unless we spawn a new one. + uint64_t initial_links = ASSERT_NO_ERRNO_AND_VALUE(Links("/proc/self/task")); + ASSERT_GE(initial_links, 3); + + // For each new subtask, we should gain a new link. + BlockingChild child1; + EXPECT_THAT(Links("/proc/self/task"), + IsPosixErrorOkAndHolds(initial_links + 1)); + BlockingChild child2; + EXPECT_THAT(Links("/proc/self/task"), + IsPosixErrorOkAndHolds(initial_links + 2)); +} + +TEST(ProcTask, CommContainsThreadNameAndTrailingNewline) { + constexpr char kThreadName[] = "TestThread12345"; + ASSERT_THAT(prctl(PR_SET_NAME, kThreadName), SyscallSucceeds()); + + auto thread_name = ASSERT_NO_ERRNO_AND_VALUE( + GetContents(JoinPath("/proc", absl::StrCat(getpid()), "task", + absl::StrCat(syscall(SYS_gettid)), "comm"))); + EXPECT_EQ(absl::StrCat(kThreadName, "\n"), thread_name); +} + +TEST(ProcTaskNs, NsDirExistsAndHasCorrectMetadata) { + EXPECT_NO_ERRNO(DirContains("/proc/self/ns", {"net", "pid", "user"})); + + // Let's just test the 'pid' entry, all of them are very similar. 
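+  // Namespace entries are magic symlinks whose target encodes the namespace
+  // type and inode, e.g. "pid:[4026531836]" (the inode number here is only
+  // illustrative), which is what the StartsWith check below relies on.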
+ struct stat st; + EXPECT_THAT(lstat("/proc/self/ns/pid", &st), SyscallSucceeds()); + EXPECT_TRUE(S_ISLNK(st.st_mode)); + + auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/ns/pid")); + EXPECT_THAT(link, ::testing::StartsWith("pid:[")); +} + +TEST(ProcTaskNs, AccessOnNsNodeSucceeds) { + EXPECT_THAT(access("/proc/self/ns/pid", F_OK), SyscallSucceeds()); +} + +TEST(ProcSysKernelHostname, Exists) { + EXPECT_THAT(open("/proc/sys/kernel/hostname", O_RDONLY), SyscallSucceeds()); +} + +TEST(ProcSysKernelHostname, MatchesUname) { + struct utsname buf; + EXPECT_THAT(uname(&buf), SyscallSucceeds()); + const std::string hostname = absl::StrCat(buf.nodename, "\n"); + auto procfs_hostname = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/hostname")); + EXPECT_EQ(procfs_hostname, hostname); +} + +TEST(ProcSysVmMmapMinAddr, HasNumericValue) { + const std::string mmap_min_addr_str = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/vm/mmap_min_addr")); + uintptr_t mmap_min_addr; + EXPECT_TRUE(absl::SimpleAtoi(mmap_min_addr_str, &mmap_min_addr)) + << "/proc/sys/vm/mmap_min_addr does not contain a numeric value: " + << mmap_min_addr_str; +} + +TEST(ProcSysVmOvercommitMemory, HasNumericValue) { + const std::string overcommit_memory_str = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/vm/overcommit_memory")); + uintptr_t overcommit_memory; + EXPECT_TRUE(absl::SimpleAtoi(overcommit_memory_str, &overcommit_memory)) + << "/proc/sys/vm/overcommit_memory does not contain a numeric value: " + << overcommit_memory; +} + +// Check that link for proc fd entries point the target node, not the +// symlink itself. +TEST(ProcTaskFd, FstatatFollowsSymlink) { + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + struct stat sproc = {}; + EXPECT_THAT( + fstatat(-1, absl::StrCat("/proc/self/fd/", fd.get()).c_str(), &sproc, 0), + SyscallSucceeds()); + + struct stat sfile = {}; + EXPECT_THAT(fstatat(-1, file.path().c_str(), &sfile, 0), SyscallSucceeds()); + + // If fstatat follows the fd symlink, the device and inode numbers should + // match at a minimum. + EXPECT_EQ(sproc.st_dev, sfile.st_dev); + EXPECT_EQ(sproc.st_ino, sfile.st_ino); + EXPECT_EQ(0, memcmp(&sfile, &sproc, sizeof(sfile))); +} + +TEST(ProcFilesystems, Bug65172365) { + std::string proc_filesystems = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/filesystems")); + ASSERT_FALSE(proc_filesystems.empty()); +} + +TEST(ProcFilesystems, PresenceOfShmMaxMniAll) { + uint64_t shmmax = 0; + uint64_t shmall = 0; + uint64_t shmmni = 0; + std::string proc_file; + proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmmax")); + ASSERT_FALSE(proc_file.empty()); + ASSERT_TRUE(absl::SimpleAtoi(proc_file, &shmmax)); + proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmall")); + ASSERT_FALSE(proc_file.empty()); + ASSERT_TRUE(absl::SimpleAtoi(proc_file, &shmall)); + proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmmni")); + ASSERT_FALSE(proc_file.empty()); + ASSERT_TRUE(absl::SimpleAtoi(proc_file, &shmmni)); + + ASSERT_GT(shmmax, 0); + ASSERT_GT(shmall, 0); + ASSERT_GT(shmmni, 0); + ASSERT_LE(shmall, shmmax); + + // These values should never be higher than this by default, for more + // information see uapi/linux/shm.h + ASSERT_LE(shmmax, ULONG_MAX - (1UL << 24)); + ASSERT_LE(shmall, ULONG_MAX - (1UL << 24)); +} + +// Check that /proc/mounts is a symlink to self/mounts. 
+TEST(ProcMounts, IsSymlink) { + auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/mounts")); + EXPECT_EQ(link, "self/mounts"); +} + +// Check that /proc/self/mounts looks something like a real mounts file. +TEST(ProcSelfMounts, RequiredFieldsArePresent) { + auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts")); + EXPECT_THAT(mounts, + AllOf( + // Root mount. + ContainsRegex(R"(\S+ / \S+ (rw|ro)\S* [0-9]+ [0-9]+\s)"), + // Root mount. + ContainsRegex(R"(\S+ /proc \S+ rw\S* [0-9]+ [0-9]+\s)"))); +} +} // namespace +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + for (int i = 0; i < argc; ++i) { + gvisor::testing::saved_argv.emplace_back(std::string(argv[i])); + } + + gvisor::testing::TestInit(&argc, &argv); + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc new file mode 100644 index 000000000..6060d0644 --- /dev/null +++ b/test/syscalls/linux/proc_net.cc @@ -0,0 +1,59 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +TEST(ProcNetIfInet6, Format) { + auto ifinet6 = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/if_inet6")); + EXPECT_THAT(ifinet6, + ::testing::MatchesRegex( + // Ex: "00000000000000000000000000000001 01 80 10 80 lo\n" + "^([a-f\\d]{32}( [a-f\\d]{2}){4} +[a-z][a-z\\d]*\\n)+$")); +} + +TEST(ProcSysNetIpv4Sack, Exists) { + EXPECT_THAT(open("/proc/sys/net/ipv4/tcp_sack", O_RDONLY), SyscallSucceeds()); +} + +TEST(ProcSysNetIpv4Sack, CanReadAndWrite) { + auto const fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/sys/net/ipv4/tcp_sack", O_RDWR)); + + char buf; + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_TRUE(buf == '0' || buf == '1') << "unexpected tcp_sack: " << buf; + + char to_write = (buf == '1') ? '0' : '1'; + EXPECT_THAT(PwriteFd(fd.get(), &to_write, sizeof(to_write), 0), + SyscallSucceedsWithValue(sizeof(to_write))); + + buf = 0; + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); + EXPECT_EQ(buf, to_write); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/pselect.cc b/test/syscalls/linux/pselect.cc new file mode 100644 index 000000000..3294f6c14 --- /dev/null +++ b/test/syscalls/linux/pselect.cc @@ -0,0 +1,190 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/base_poll_test.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +struct MaskWithSize { + sigset_t* mask; + size_t mask_size; +}; + +// Linux and glibc have a different idea of the sizeof sigset_t. When calling +// the syscall directly, use what the kernel expects. +unsigned kSigsetSize = SIGRTMAX / 8; + +// Linux pselect(2) differs from the glibc wrapper function in that Linux +// updates the timeout with the amount of time remaining. In order to test this +// behavior we need to use the syscall directly. +int syscallPselect6(int nfds, fd_set* readfds, fd_set* writefds, + fd_set* exceptfds, struct timespec* timeout, + const MaskWithSize* mask_with_size) { + return syscall(SYS_pselect6, nfds, readfds, writefds, exceptfds, timeout, + mask_with_size); +} + +class PselectTest : public BasePollTest { + protected: + void SetUp() override { BasePollTest::SetUp(); } + void TearDown() override { BasePollTest::TearDown(); } +}; + +// See that when there are no FD sets, pselect behaves like sleep. +TEST_F(PselectTest, NullFds) { + struct timespec timeout = absl::ToTimespec(absl::Milliseconds(10)); + ASSERT_THAT(syscallPselect6(0, nullptr, nullptr, nullptr, &timeout, nullptr), + SyscallSucceeds()); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_nsec, 0); + + timeout = absl::ToTimespec(absl::Milliseconds(10)); + ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr), + SyscallSucceeds()); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_nsec, 0); +} + +TEST_F(PselectTest, ClosedFds) { + fd_set read_set; + FD_ZERO(&read_set); + int fd; + ASSERT_THAT(fd = dup(1), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + FD_SET(fd, &read_set); + struct timespec timeout = absl::ToTimespec(absl::Milliseconds(10)); + EXPECT_THAT( + syscallPselect6(fd + 1, &read_set, nullptr, nullptr, &timeout, nullptr), + SyscallFailsWithErrno(EBADF)); +} + +TEST_F(PselectTest, ZeroTimeout) { + struct timespec timeout = {}; + ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr), + SyscallSucceeds()); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_nsec, 0); +} + +// If random S/R interrupts the pselect, SIGALRM may be delivered before pselect +// restarts, causing the pselect to hang forever. +TEST_F(PselectTest, NoTimeout_NoRandomSave) { + // When there's no timeout, pselect may never return so set a timer. + SetTimer(absl::Milliseconds(100)); + // See that we get interrupted by the timer. 
+ ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, nullptr, nullptr), + SyscallFailsWithErrno(EINTR)); + EXPECT_TRUE(TimerFired()); +} + +TEST_F(PselectTest, InvalidTimeoutNegative) { + struct timespec timeout = absl::ToTimespec(absl::Seconds(-1)); + ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr), + SyscallFailsWithErrno(EINVAL)); + EXPECT_EQ(timeout.tv_sec, -1); + EXPECT_EQ(timeout.tv_nsec, 0); +} + +TEST_F(PselectTest, InvalidTimeoutNotNormalized) { + struct timespec timeout = {0, 1000000001}; + ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr), + SyscallFailsWithErrno(EINVAL)); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_nsec, 1000000001); +} + +TEST_F(PselectTest, EmptySigMaskInvalidMaskSize) { + struct timespec timeout = {}; + MaskWithSize invalid = {nullptr, 7}; + EXPECT_THAT(syscallPselect6(0, nullptr, nullptr, nullptr, &timeout, &invalid), + SyscallSucceeds()); +} + +TEST_F(PselectTest, EmptySigMaskValidMaskSize) { + struct timespec timeout = {}; + MaskWithSize invalid = {nullptr, 8}; + EXPECT_THAT(syscallPselect6(0, nullptr, nullptr, nullptr, &timeout, &invalid), + SyscallSucceeds()); +} + +TEST_F(PselectTest, InvalidMaskSize) { + struct timespec timeout = {}; + sigset_t sigmask; + ASSERT_THAT(sigemptyset(&sigmask), SyscallSucceeds()); + MaskWithSize invalid = {&sigmask, 7}; + EXPECT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, &invalid), + SyscallFailsWithErrno(EINVAL)); +} + +// Verify that signals blocked by the pselect mask (that would otherwise be +// allowed) do not interrupt pselect. +TEST_F(PselectTest, SignalMaskBlocksSignal) { + absl::Duration duration(absl::Seconds(30)); + struct timespec timeout = absl::ToTimespec(duration); + absl::Duration timer_duration(absl::Seconds(10)); + + // Call with a mask that blocks SIGALRM. See that pselect is not interrupted + // (i.e. returns 0) and that upon completion, the timer has fired. + sigset_t mask; + ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds()); + ASSERT_THAT(sigaddset(&mask, SIGALRM), SyscallSucceeds()); + MaskWithSize mask_with_size = {&mask, kSigsetSize}; + SetTimer(timer_duration); + MaybeSave(); + ASSERT_FALSE(TimerFired()); + ASSERT_THAT( + syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, &mask_with_size), + SyscallSucceeds()); + EXPECT_TRUE(TimerFired()); + EXPECT_EQ(absl::DurationFromTimespec(timeout), absl::Duration()); +} + +// Verify that signals allowed by the pselect mask (that would otherwise be +// blocked) interrupt pselect. +TEST_F(PselectTest, SignalMaskAllowsSignal) { + absl::Duration duration = absl::Seconds(30); + struct timespec timeout = absl::ToTimespec(duration); + absl::Duration timer_duration = absl::Seconds(10); + + sigset_t mask; + ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds()); + + // Block SIGALRM. + auto cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGALRM)); + + // Call with a mask that unblocks SIGALRM. See that pselect is interrupted. 
+ MaskWithSize mask_with_size = {&mask, kSigsetSize}; + SetTimer(timer_duration); + MaybeSave(); + ASSERT_FALSE(TimerFired()); + ASSERT_THAT( + syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, &mask_with_size), + SyscallFailsWithErrno(EINTR)); + EXPECT_TRUE(TimerFired()); + EXPECT_GT(absl::DurationFromTimespec(timeout), absl::Duration()); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc new file mode 100644 index 000000000..d3b3b8b02 --- /dev/null +++ b/test/syscalls/linux/ptrace.cc @@ -0,0 +1,948 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Sends sig to the current process with tgkill(2). +// +// glibc's raise(2) may change the signal mask before sending the signal. These +// extra syscalls make tests of syscall, signal interception, etc. difficult to +// write. +void RaiseSignal(int sig) { + pid_t pid = getpid(); + TEST_PCHECK(pid > 0); + pid_t tid = gettid(); + TEST_PCHECK(tid > 0); + TEST_PCHECK(tgkill(pid, tid, sig) == 0); +} + +// Returns the Yama ptrace scope. +PosixErrorOr YamaPtraceScope() { + constexpr char kYamaPtraceScopePath[] = "/proc/sys/kernel/yama/ptrace_scope"; + + ASSIGN_OR_RETURN_ERRNO(bool exists, Exists(kYamaPtraceScopePath)); + if (!exists) { + // File doesn't exist means no Yama, so the scope is disabled -> 0. + return 0; + } + + std::string contents; + RETURN_IF_ERRNO(GetContents(kYamaPtraceScopePath, &contents)); + + int scope; + if (!absl::SimpleAtoi(contents, &scope)) { + return PosixError(EINVAL, absl::StrCat(contents, ": not a valid number")); + } + + return scope; +} + +TEST(PtraceTest, AttachSelf) { + EXPECT_THAT(ptrace(PTRACE_ATTACH, gettid(), 0, 0), + SyscallFailsWithErrno(EPERM)); +} + +TEST(PtraceTest, AttachSameThreadGroup) { + pid_t const tid = gettid(); + ScopedThread([&] { + EXPECT_THAT(ptrace(PTRACE_ATTACH, tid, 0, 0), SyscallFailsWithErrno(EPERM)); + }); +} + +TEST(PtraceTest, AttachParent_PeekData_PokeData_SignalSuppression) { + // Yama prevents attaching to a parent. Skip the test if the scope is anything + // except disabled. + SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(YamaPtraceScope()) > 0); + + constexpr long kBeforePokeDataValue = 10; + constexpr long kAfterPokeDataValue = 20; + + volatile long word = kBeforePokeDataValue; + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Attach to the parent. 
+ pid_t const parent_pid = getppid(); + TEST_PCHECK(ptrace(PTRACE_ATTACH, parent_pid, 0, 0) == 0); + MaybeSave(); + + // Block until the parent enters signal-delivery-stop as a result of the + // SIGSTOP sent by PTRACE_ATTACH. + int status; + TEST_PCHECK(waitpid(parent_pid, &status, 0) == parent_pid); + MaybeSave(); + TEST_CHECK(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + // Replace the value of word in the parent process with kAfterPokeDataValue. + long const parent_word = ptrace(PTRACE_PEEKDATA, parent_pid, &word, 0); + MaybeSave(); + TEST_CHECK(parent_word == kBeforePokeDataValue); + TEST_PCHECK( + ptrace(PTRACE_POKEDATA, parent_pid, &word, kAfterPokeDataValue) == 0); + MaybeSave(); + + // Detach from the parent and suppress the SIGSTOP. If the SIGSTOP is not + // suppressed, the parent will hang in group-stop, causing the test to time + // out. + TEST_PCHECK(ptrace(PTRACE_DETACH, parent_pid, 0, 0) == 0); + MaybeSave(); + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to complete. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; + + // Check that the child's PTRACE_POKEDATA was effective. + EXPECT_EQ(kAfterPokeDataValue, word); +} + +TEST(PtraceTest, GetSigMask) { + // doesn't define these until Linux 4.4, even though the features + // were added in 3.11. + constexpr auto kPtraceGetSigMask = static_cast(0x420a); + constexpr auto kPtraceSetSigMask = static_cast(0x420b); + // glibc and the Linux kernel define a sigset_t with different sizes. To avoid + // creating a kernel_sigset_t and recreating all the modification functions + // (sigemptyset, etc), we just hardcode the kernel sigset size. + constexpr int kSizeofKernelSigset = 8; + constexpr int kBlockSignal = SIGUSR1; + sigset_t blocked; + sigemptyset(&blocked); + sigaddset(&blocked, kBlockSignal); + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Install a signal handler for kBlockSignal to avoid termination and block + // it. + TEST_PCHECK(signal(kBlockSignal, +[](int signo) {}) != SIG_ERR); + MaybeSave(); + TEST_PCHECK(sigprocmask(SIG_SETMASK, &blocked, nullptr) == 0); + MaybeSave(); + + // Enable tracing. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + + // This should be blocked. + RaiseSignal(kBlockSignal); + + // This should be suppressed by parent, who will change signal mask in the + // meantime, which means kBlockSignal should be delivered once this resumes. + RaiseSignal(SIGSTOP); + + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Get current signal mask. + sigset_t set; + EXPECT_THAT(ptrace(kPtraceGetSigMask, child_pid, kSizeofKernelSigset, &set), + SyscallSucceeds()); + EXPECT_THAT(blocked, EqualsSigset(set)); + + // Try to get current signal mask with bad size argument. + EXPECT_THAT(ptrace(kPtraceGetSigMask, child_pid, 0, nullptr), + SyscallFailsWithErrno(EINVAL)); + + // Try to set bad signal mask. 
+ sigset_t* bad_addr = reinterpret_cast(-1); + EXPECT_THAT( + ptrace(kPtraceSetSigMask, child_pid, kSizeofKernelSigset, bad_addr), + SyscallFailsWithErrno(EFAULT)); + + // Set signal mask to empty set. + sigset_t set1; + sigemptyset(&set1); + EXPECT_THAT(ptrace(kPtraceSetSigMask, child_pid, kSizeofKernelSigset, &set1), + SyscallSucceeds()); + + // Suppress SIGSTOP and resume the child. It should re-enter + // signal-delivery-stop for kBlockSignal. + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == kBlockSignal) + << " status " << status; + + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + // Let's see that process exited normally. + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +TEST(PtraceTest, GetSiginfo_SetSiginfo_SignalInjection) { + constexpr int kOriginalSigno = SIGUSR1; + constexpr int kInjectedSigno = SIGUSR2; + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Override all signal handlers. + struct sigaction sa = {}; + sa.sa_handler = +[](int signo) { _exit(signo); }; + TEST_PCHECK(sigfillset(&sa.sa_mask) == 0); + for (int signo = 1; signo < 32; signo++) { + if (signo == SIGKILL || signo == SIGSTOP) { + continue; + } + TEST_PCHECK(sigaction(signo, &sa, nullptr) == 0); + } + for (int signo = SIGRTMIN; signo <= SIGRTMAX; signo++) { + TEST_PCHECK(sigaction(signo, &sa, nullptr) == 0); + } + + // Unblock all signals. + TEST_PCHECK(sigprocmask(SIG_UNBLOCK, &sa.sa_mask, nullptr) == 0); + MaybeSave(); + + // Send ourselves kOriginalSignal while ptraced and exit with the signal we + // actually receive via the signal handler, if any, or 0 if we don't receive + // a signal. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + RaiseSignal(kOriginalSigno); + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself kOriginalSigno and enter + // signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == kOriginalSigno) + << " status " << status; + + siginfo_t siginfo = {}; + ASSERT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo), + SyscallSucceeds()); + EXPECT_EQ(kOriginalSigno, siginfo.si_signo); + EXPECT_EQ(SI_TKILL, siginfo.si_code); + + // Replace the signal with kInjectedSigno, and check that the child exits + // with kInjectedSigno, indicating that signal injection was successful. + siginfo.si_signo = kInjectedSigno; + ASSERT_THAT(ptrace(PTRACE_SETSIGINFO, child_pid, 0, &siginfo), + SyscallSucceeds()); + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, kInjectedSigno), + SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == kInjectedSigno) + << " status " << status; +} + +TEST(PtraceTest, SIGKILLDoesNotCauseSignalDeliveryStop) { + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + RaiseSignal(SIGKILL); + TEST_CHECK_MSG(false, "Survived SIGKILL?"); + _exit(1); + } + // In parent process. 
+ ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Expect the child to die to SIGKILL without entering signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) + << " status " << status; +} + +TEST(PtraceTest, PtraceKill) { + constexpr int kOriginalSigno = SIGUSR1; + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + + // PTRACE_KILL only works if tracee has entered signal-delivery-stop. + RaiseSignal(kOriginalSigno); + TEST_CHECK_MSG(false, "Failed to kill the process?"); + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself kOriginalSigno and enter + // signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == kOriginalSigno) + << " status " << status; + + ASSERT_THAT(ptrace(PTRACE_KILL, child_pid, 0, 0), SyscallSucceeds()); + + // Expect the child to die with SIGKILL. + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) + << " status " << status; +} + +TEST(PtraceTest, GetRegSet) { + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Enable tracing. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + + // Use kill explicitly because we check the syscall argument register below. + kill(getpid(), SIGSTOP); + + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Get the general registers. + struct user_regs_struct regs; + struct iovec iov; + iov.iov_base = ®s; + iov.iov_len = sizeof(regs); + EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov), + SyscallSucceeds()); + + // Read exactly the full register set. + EXPECT_EQ(iov.iov_len, sizeof(regs)); + +#ifdef __x86_64__ + // Child called kill(2), with SIGSTOP as arg 2. + EXPECT_EQ(regs.rsi, SIGSTOP); +#endif + + // Suppress SIGSTOP and resume the child. + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + // Let's see that process exited normally. + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +TEST(PtraceTest, AttachingConvertsGroupStopToPtraceStop) { + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + while (true) { + pause(); + } + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // SIGSTOP the child and wait for it to stop. + ASSERT_THAT(kill(child_pid, SIGSTOP), SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(child_pid, &status, WUNTRACED), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Attach to the child and expect it to re-enter a traced group-stop despite + // already being stopped. 
+ ASSERT_THAT(ptrace(PTRACE_ATTACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Verify that the child is ptrace-stopped by checking that it can receive + // ptrace commands requiring a ptrace-stop. + EXPECT_THAT(ptrace(PTRACE_SETOPTIONS, child_pid, 0, 0), SyscallSucceeds()); + + // Group-stop is distinguished from signal-delivery-stop by PTRACE_GETSIGINFO + // failing with EINVAL. + siginfo_t siginfo = {}; + EXPECT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo), + SyscallFailsWithErrno(EINVAL)); + + // Detach from the child and expect it to stay stopped without a notification. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, WUNTRACED | WNOHANG), + SyscallSucceedsWithValue(0)); + + // Sending it SIGCONT should cause it to leave its stop. + ASSERT_THAT(kill(child_pid, SIGCONT), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, WCONTINUED), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFCONTINUED(status)) << " status " << status; + + // Clean up the child. + ASSERT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) + << " status " << status; +} + +// Fixture for tests parameterized by whether or not to use PTRACE_O_TRACEEXEC. +class PtraceExecveTest : public ::testing::TestWithParam { + protected: + bool TraceExec() const { return GetParam(); } +}; + +TEST_P(PtraceExecveTest, Execve_GetRegs_PeekUser_SIGKILL_TraceClone_TraceExit) { + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Enable tracing, then raise SIGSTOP and expect our parent to suppress it. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + RaiseSignal(SIGSTOP); + MaybeSave(); + + // Call execve in a non-leader thread. + ExecveArray const owned_child_argv = {"/proc/self/exe"}; + char* const* const child_argv = owned_child_argv.get(); + ScopedThread t([&] { + execve(child_argv[0], child_argv, /* envp = */ nullptr); + TEST_CHECK_MSG(false, "Survived execve? (thread)"); + }); + t.Join(); + TEST_CHECK_MSG(false, "Survived execve? (main)"); + _exit(1); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Enable PTRACE_O_TRACECLONE so we can get the ID of the child's non-leader + // thread, PTRACE_O_TRACEEXIT so we can observe the leader's death, and + // PTRACE_O_TRACEEXEC if required by the test. (The leader doesn't call + // execve, but options should be inherited across clone.) + long opts = PTRACE_O_TRACECLONE | PTRACE_O_TRACEEXIT; + if (TraceExec()) { + opts |= PTRACE_O_TRACEEXEC; + } + ASSERT_THAT(ptrace(PTRACE_SETOPTIONS, child_pid, 0, opts), SyscallSucceeds()); + + // Suppress the SIGSTOP and wait for the child's leader thread to report + // PTRACE_EVENT_CLONE. Get the new thread's ID from the event. 
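+  // A ptrace event stop is reported by waitpid() as a stop whose status
+  // satisfies status >> 8 == (SIGTRAP | (PTRACE_EVENT_CLONE << 8)), which is
+  // the encoding the checks below compare against.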
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_CLONE << 8), status >> 8); + unsigned long eventmsg; + ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, child_pid, 0, &eventmsg), + SyscallSucceeds()); + pid_t const nonleader_tid = eventmsg; + pid_t const leader_tid = child_pid; + + // The new thread should be ptraced and in signal-delivery-stop by SIGSTOP due + // to PTRACE_O_TRACECLONE. + // + // Before bf959931ddb88c4e4366e96dd22e68fa0db9527c "wait/ptrace: assume __WALL + // if the child is traced" (4.7) , waiting on it requires __WCLONE since, as a + // non-leader, its termination signal is 0. After, a standard wait is + // sufficient. + ASSERT_THAT(waitpid(nonleader_tid, &status, __WCLONE), + SyscallSucceedsWithValue(nonleader_tid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Resume both child threads. + for (pid_t const tid : {leader_tid, nonleader_tid}) { + ASSERT_THAT(ptrace(PTRACE_CONT, tid, 0, 0), SyscallSucceeds()); + } + + // The non-leader child thread should call execve, causing the leader thread + // to enter PTRACE_EVENT_EXIT with an apparent exit code of 0. At this point, + // the leader has not yet exited, so the non-leader should be blocked in + // execve. + ASSERT_THAT(waitpid(leader_tid, &status, 0), + SyscallSucceedsWithValue(leader_tid)); + EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_EXIT << 8), status >> 8); + ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, leader_tid, 0, &eventmsg), + SyscallSucceeds()); + EXPECT_TRUE(WIFEXITED(eventmsg) && WEXITSTATUS(eventmsg) == 0) + << " eventmsg " << eventmsg; + EXPECT_THAT(waitpid(nonleader_tid, &status, __WCLONE | WNOHANG), + SyscallSucceedsWithValue(0)); + + // Allow the leader to continue exiting. This should allow the non-leader to + // complete its execve, causing the original leader to be reaped without + // further notice and the non-leader to steal its ID. + ASSERT_THAT(ptrace(PTRACE_CONT, leader_tid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(leader_tid, &status, 0), + SyscallSucceedsWithValue(leader_tid)); + if (TraceExec()) { + // If PTRACE_O_TRACEEXEC was enabled, the execing thread should be in + // PTRACE_EVENT_EXEC-stop, with the event message set to its old thread ID. + EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_EXEC << 8), status >> 8); + ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, leader_tid, 0, &eventmsg), + SyscallSucceeds()); + EXPECT_EQ(nonleader_tid, eventmsg); + } else { + // Otherwise, the execing thread should have received SIGTRAP and should now + // be in signal-delivery-stop. + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + << " status " << status; + } + +#ifdef __x86_64__ + { + // CS should be 0x33, indicating an 64-bit binary. + constexpr uint64_t kAMD64UserCS = 0x33; + EXPECT_THAT(ptrace(PTRACE_PEEKUSER, leader_tid, + offsetof(struct user_regs_struct, cs), 0), + SyscallSucceedsWithValue(kAMD64UserCS)); + struct user_regs_struct regs = {}; + ASSERT_THAT(ptrace(PTRACE_GETREGS, leader_tid, 0, ®s), + SyscallSucceeds()); + EXPECT_EQ(kAMD64UserCS, regs.cs); + } +#endif // defined(__x86_64__) + + // PTRACE_O_TRACEEXIT should have been inherited across execve. Send SIGKILL, + // which should end the PTRACE_EVENT_EXEC-stop or signal-delivery-stop and + // leave the child in PTRACE_EVENT_EXIT-stop. 
+ ASSERT_THAT(kill(leader_tid, SIGKILL), SyscallSucceeds()); + ASSERT_THAT(waitpid(leader_tid, &status, 0), + SyscallSucceedsWithValue(leader_tid)); + EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_EXIT << 8), status >> 8); + ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, leader_tid, 0, &eventmsg), + SyscallSucceeds()); + EXPECT_TRUE(WIFSIGNALED(eventmsg) && WTERMSIG(eventmsg) == SIGKILL) + << " eventmsg " << eventmsg; + + // End the PTRACE_EVENT_EXIT stop, allowing the child to exit. + ASSERT_THAT(ptrace(PTRACE_CONT, leader_tid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(leader_tid, &status, 0), + SyscallSucceedsWithValue(leader_tid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) + << " status " << status; +} + +INSTANTIATE_TEST_CASE_P(TraceExec, PtraceExecveTest, ::testing::Bool()); + +// This test has expectations on when syscall-enter/exit-stops occur that are +// violated if saving occurs, since saving interrupts all syscalls, causing +// premature syscall-exit. +TEST(PtraceTest, + ExitWhenParentIsNotTracer_Syscall_TraceVfork_TraceVforkDone_NoRandomSave) { + constexpr int kExitTraceeExitCode = 99; + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Block SIGCHLD so it doesn't interrupt wait4. + sigset_t mask; + TEST_PCHECK(sigemptyset(&mask) == 0); + TEST_PCHECK(sigaddset(&mask, SIGCHLD) == 0); + TEST_PCHECK(sigprocmask(SIG_SETMASK, &mask, nullptr) == 0); + MaybeSave(); + + // Enable tracing, then raise SIGSTOP and expect our parent to suppress it. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + RaiseSignal(SIGSTOP); + MaybeSave(); + + // Spawn a vfork child that exits immediately, and reap it. Don't save + // after vfork since the parent expects to see wait4 as the next syscall. + pid_t const pid = vfork(); + if (pid == 0) { + _exit(kExitTraceeExitCode); + } + TEST_PCHECK_MSG(pid > 0, "vfork failed"); + + int status; + TEST_PCHECK(wait4(pid, &status, 0, nullptr) > 0); + MaybeSave(); + TEST_CHECK(WIFEXITED(status) && WEXITSTATUS(status) == kExitTraceeExitCode); + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(child_pid, SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Enable PTRACE_O_TRACEVFORK so we can get the ID of the grandchild, + // PTRACE_O_TRACEVFORKDONE so we can observe PTRACE_EVENT_VFORK_DONE, and + // PTRACE_O_TRACESYSGOOD so syscall-enter/exit-stops are unambiguously + // indicated by a stop signal of SIGTRAP|0x80 rather than just SIGTRAP. + ASSERT_THAT(ptrace(PTRACE_SETOPTIONS, child_pid, 0, + PTRACE_O_TRACEVFORK | PTRACE_O_TRACEVFORKDONE | + PTRACE_O_TRACESYSGOOD), + SyscallSucceeds()); + + // Suppress the SIGSTOP and wait for the child to report PTRACE_EVENT_VFORK. + // Get the new process' ID from the event. + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_VFORK << 8), status >> 8); + unsigned long eventmsg; + ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, child_pid, 0, &eventmsg), + SyscallSucceeds()); + pid_t const grandchild_pid = eventmsg; + + // The grandchild should be traced by us and in signal-delivery-stop by + // SIGSTOP due to PTRACE_O_TRACEVFORK. 
This allows us to wait on it even + // though we're not its parent. + ASSERT_THAT(waitpid(grandchild_pid, &status, 0), + SyscallSucceedsWithValue(grandchild_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Resume the child with PTRACE_SYSCALL. Since the grandchild is still in + // signal-delivery-stop, the child should remain in vfork() waiting for the + // grandchild to exec or exit. + ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds()); + absl::SleepFor(absl::Seconds(1)); + ASSERT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Suppress the grandchild's SIGSTOP and wait for the grandchild to exit. Pass + // WNOWAIT to waitid() so that we don't acknowledge the grandchild's exit yet. + ASSERT_THAT(ptrace(PTRACE_CONT, grandchild_pid, 0, 0), SyscallSucceeds()); + siginfo_t siginfo = {}; + ASSERT_THAT(waitid(P_PID, grandchild_pid, &siginfo, WEXITED | WNOWAIT), + SyscallSucceeds()); + EXPECT_EQ(SIGCHLD, siginfo.si_signo); + EXPECT_EQ(CLD_EXITED, siginfo.si_code); + EXPECT_EQ(kExitTraceeExitCode, siginfo.si_status); + EXPECT_EQ(grandchild_pid, siginfo.si_pid); + EXPECT_EQ(getuid(), siginfo.si_uid); + + // The child should now be in PTRACE_EVENT_VFORK_DONE stop. The event + // message should still be the grandchild's PID. + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_VFORK_DONE << 8), status >> 8); + ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, child_pid, 0, &eventmsg), + SyscallSucceeds()); + EXPECT_EQ(grandchild_pid, eventmsg); + + // Resume the child with PTRACE_SYSCALL again and expect it to enter + // syscall-exit-stop for vfork() or clone(), either of which should return the + // grandchild's PID from the syscall. Aside from PTRACE_O_TRACESYSGOOD, + // syscall-stops are distinguished from signal-delivery-stop by + // PTRACE_GETSIGINFO returning a siginfo for which si_code == SIGTRAP or + // SIGTRAP|0x80. + ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) + << " status " << status; + ASSERT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo), + SyscallSucceeds()); + EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80)) + << "si_code = " << siginfo.si_code; +#ifdef __x86_64__ + { + struct user_regs_struct regs = {}; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, ®s), SyscallSucceeds()); + EXPECT_TRUE(regs.orig_rax == SYS_vfork || regs.orig_rax == SYS_clone) + << "orig_rax = " << regs.orig_rax; + EXPECT_EQ(grandchild_pid, regs.rax); + } +#endif // defined(__x86_64__) + + // After this point, the child will be making wait4 syscalls that will be + // interrupted by saving, so saving is not permitted. Note that this is + // explicitly released below once the grandchild exits. + DisableSave ds; + + // Resume the child with PTRACE_SYSCALL again and expect it to enter + // syscall-enter-stop for wait4(). 
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) + << " status " << status; + ASSERT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo), + SyscallSucceeds()); + EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80)) + << "si_code = " << siginfo.si_code; +#ifdef __x86_64__ + { + EXPECT_THAT(ptrace(PTRACE_PEEKUSER, child_pid, + offsetof(struct user_regs_struct, orig_rax), 0), + SyscallSucceedsWithValue(SYS_wait4)); + } +#endif // defined(__x86_64__) + + // Resume the child with PTRACE_SYSCALL again. Since the grandchild is + // waiting for the tracer (us) to acknowledge its exit first, wait4 should + // block. + ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds()); + absl::SleepFor(absl::Seconds(1)); + ASSERT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Acknowledge the grandchild's exit. + ASSERT_THAT(waitpid(grandchild_pid, &status, 0), + SyscallSucceedsWithValue(grandchild_pid)); + ds.reset(); + + // Now the child should enter syscall-exit-stop for wait4, returning with the + // grandchild's PID. + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) + << " status " << status; +#ifdef __x86_64__ + { + struct user_regs_struct regs = {}; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, ®s), SyscallSucceeds()); + EXPECT_EQ(SYS_wait4, regs.orig_rax); + EXPECT_EQ(grandchild_pid, regs.rax); + } +#endif // defined(__x86_64__) + + // Detach from the child and wait for it to exit. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +// These tests requires knowledge of architecture-specific syscall convention. +#ifdef __x86_64__ +TEST(PtraceTest, Sysemu_PokeUser) { + constexpr int kSysemuHelperFirstExitCode = 126; + constexpr uint64_t kSysemuInjectedExitGroupReturn = 42; + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Enable tracing, then raise SIGSTOP and expect our parent to suppress it. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + RaiseSignal(SIGSTOP); + + // Try to exit_group, expecting the tracer to skip the syscall and set its + // own return value. + int const rv = syscall(SYS_exit_group, kSysemuHelperFirstExitCode); + TEST_PCHECK_MSG(rv == kSysemuInjectedExitGroupReturn, + "exit_group returned incorrect value"); + + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Suppress the SIGSTOP and wait for the child to enter syscall-enter-stop + // for its first exit_group syscall. glibc doesn't necessarily define + // PTRACE_SYSEMU. 
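+  // 31 is the value of PTRACE_SYSEMU in the kernel's x86 UAPI headers; it is
+  // defined locally since glibc's <sys/ptrace.h> may not provide it.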
+ constexpr auto kPtraceSysemu = static_cast<__ptrace_request>(31); + ASSERT_THAT(ptrace(kPtraceSysemu, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + << " status " << status; + + struct user_regs_struct regs = {}; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, ®s), SyscallSucceeds()); + EXPECT_EQ(SYS_exit_group, regs.orig_rax); + EXPECT_EQ(-ENOSYS, regs.rax); + EXPECT_EQ(kSysemuHelperFirstExitCode, regs.rdi); + + // Replace the exit_group return value, then resume the child, which should + // automatically skip the syscall. + ASSERT_THAT( + ptrace(PTRACE_POKEUSER, child_pid, offsetof(struct user_regs_struct, rax), + kSysemuInjectedExitGroupReturn), + SyscallSucceeds()); + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + + // The child should validate the injected return value and then exit normally. + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + +// This test also cares about syscall-exit-stop. +TEST(PtraceTest, ERESTART_NoRandomSave) { + constexpr int kSigno = SIGUSR1; + + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + + // Ignore, but unblock, kSigno. + struct sigaction sa = {}; + sa.sa_handler = SIG_IGN; + TEST_PCHECK(sigfillset(&sa.sa_mask) == 0); + TEST_PCHECK(sigaction(kSigno, &sa, nullptr) == 0); + MaybeSave(); + TEST_PCHECK(sigprocmask(SIG_UNBLOCK, &sa.sa_mask, nullptr) == 0); + MaybeSave(); + + // Enable tracing, then raise SIGSTOP and expect our parent to suppress it. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + RaiseSignal(SIGSTOP); + + // Invoke the pause syscall, which normally should not return until we + // receive a signal that "either terminates the process or causes the + // invocation of a signal-catching function". + pause(); + + _exit(0); + } + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // After this point, the child's pause syscall will be interrupted by saving, + // so saving is not permitted. Note that this is explicitly released below + // once the child is stopped. + DisableSave ds; + + // Suppress the SIGSTOP and wait for the child to enter syscall-enter-stop for + // its pause syscall. + ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + << " status " << status; + + struct user_regs_struct regs = {}; + ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, ®s), SyscallSucceeds()); + EXPECT_EQ(SYS_pause, regs.orig_rax); + EXPECT_EQ(-ENOSYS, regs.rax); + + // Resume the child with PTRACE_SYSCALL and expect it to block in the pause + // syscall. + ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds()); + absl::SleepFor(absl::Seconds(1)); + ASSERT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Send the child kSigno, causing it to return ERESTARTNOHAND and enter + // syscall-exit-stop from the pause syscall. 
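+  // ERESTARTNOHAND (514) is internal to the kernel and not exported in the
+  // userspace errno headers, hence the local definition.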
+ constexpr int ERESTARTNOHAND = 514; + ASSERT_THAT(kill(child_pid, kSigno), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + << " status " << status; + ds.reset(); + + ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, ®s), SyscallSucceeds()); + EXPECT_EQ(SYS_pause, regs.orig_rax); + EXPECT_EQ(-ERESTARTNOHAND, regs.rax); + + // Replace the return value from pause with 0, causing pause to not be + // restarted despite kSigno being ignored. + ASSERT_THAT(ptrace(PTRACE_POKEUSER, child_pid, + offsetof(struct user_regs_struct, rax), 0), + SyscallSucceeds()); + + // Detach from the child and wait for it to exit. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} +#endif // defined(__x86_64__) + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc new file mode 100644 index 000000000..253aa26ba --- /dev/null +++ b/test/syscalls/linux/pty.cc @@ -0,0 +1,1230 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/base/macros.h" +#include "absl/strings/str_cat.h" +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +using ::testing::AnyOf; +using ::testing::Contains; +using ::testing::Eq; +using ::testing::Not; + +// Tests Unix98 pseudoterminals. +// +// These tests assume that /dev/ptmx exists and is associated with a devpts +// filesystem mounted at /dev/pts/. While a Linux distribution could +// theoretically place those anywhere, glibc expects those locations, so they +// are effectively fixed. + +// Minor device number for an unopened ptmx file. +constexpr int kPtmxMinor = 2; + +// The timeout when polling for data from a pty. When data is written to one end +// of a pty, Linux asynchronously makes it available to the other end, so we +// have to wait. +constexpr absl::Duration kTimeout = absl::Seconds(20); + +// The maximum line size in bytes returned per read from a pty file. +constexpr int kMaxLineSize = 4096; + +// glibc defines its own, different, version of struct termios. We care about +// what the kernel does, not glibc. 
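// The TCGETS/TCSETS ioctls used below operate on the kernel's termios layout
// (include/uapi/asm-generic/termbits.h), which has 19 c_cc slots, whereas
// glibc's struct termios has NCCS == 32 plus separate input/output speed
// fields, so the two layouts differ in size. A sanity-check sketch, assuming
// glibc's struct termios is in scope and placed after the definition below:
//
//   static_assert(sizeof(kernel_termios) != sizeof(struct termios),
//                 "kernel and glibc termios layouts differ");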
+#define KERNEL_NCCS 19 +struct kernel_termios { + tcflag_t c_iflag; + tcflag_t c_oflag; + tcflag_t c_cflag; + tcflag_t c_lflag; + cc_t c_line; + cc_t c_cc[KERNEL_NCCS]; +}; + +bool operator==(struct kernel_termios const& a, + struct kernel_termios const& b) { + return memcmp(&a, &b, sizeof(a)) == 0; +} + +// Returns the termios-style control character for the passed character. +// +// e.g., for Ctrl-C, i.e., ^C, call ControlCharacter('C'). +// +// Standard control characters are ASCII bytes 0 through 31. +constexpr char ControlCharacter(char c) { + // A is 1, B is 2, etc. + return c - 'A' + 1; +} + +// Returns the printable character the given control character represents. +constexpr char FromControlCharacter(char c) { return c + 'A' - 1; } + +// Returns true if c is a control character. +// +// Standard control characters are ASCII bytes 0 through 31. +constexpr bool IsControlCharacter(char c) { return c <= 31; } + +struct Field { + const char* name; + uint64_t mask; + uint64_t value; +}; + +// ParseFields returns a std::string representation of value, using the names in +// fields. +std::string ParseFields(const Field* fields, size_t len, uint64_t value) { + bool first = true; + std::string s; + for (size_t i = 0; i < len; i++) { + const Field f = fields[i]; + if ((value & f.mask) == f.value) { + if (!first) { + s += "|"; + } + s += f.name; + first = false; + value &= ~f.mask; + } + } + + if (value) { + if (!first) { + s += "|"; + } + absl::StrAppend(&s, value); + } + + return s; +} + +const Field kIflagFields[] = { + {"IGNBRK", IGNBRK, IGNBRK}, {"BRKINT", BRKINT, BRKINT}, + {"IGNPAR", IGNPAR, IGNPAR}, {"PARMRK", PARMRK, PARMRK}, + {"INPCK", INPCK, INPCK}, {"ISTRIP", ISTRIP, ISTRIP}, + {"INLCR", INLCR, INLCR}, {"IGNCR", IGNCR, IGNCR}, + {"ICRNL", ICRNL, ICRNL}, {"IUCLC", IUCLC, IUCLC}, + {"IXON", IXON, IXON}, {"IXANY", IXANY, IXANY}, + {"IXOFF", IXOFF, IXOFF}, {"IMAXBEL", IMAXBEL, IMAXBEL}, + {"IUTF8", IUTF8, IUTF8}, +}; + +const Field kOflagFields[] = { + {"OPOST", OPOST, OPOST}, {"OLCUC", OLCUC, OLCUC}, + {"ONLCR", ONLCR, ONLCR}, {"OCRNL", OCRNL, OCRNL}, + {"ONOCR", ONOCR, ONOCR}, {"ONLRET", ONLRET, ONLRET}, + {"OFILL", OFILL, OFILL}, {"OFDEL", OFDEL, OFDEL}, + {"NL0", NLDLY, NL0}, {"NL1", NLDLY, NL1}, + {"CR0", CRDLY, CR0}, {"CR1", CRDLY, CR1}, + {"CR2", CRDLY, CR2}, {"CR3", CRDLY, CR3}, + {"TAB0", TABDLY, TAB0}, {"TAB1", TABDLY, TAB1}, + {"TAB2", TABDLY, TAB2}, {"TAB3", TABDLY, TAB3}, + {"BS0", BSDLY, BS0}, {"BS1", BSDLY, BS1}, + {"FF0", FFDLY, FF0}, {"FF1", FFDLY, FF1}, + {"VT0", VTDLY, VT0}, {"VT1", VTDLY, VT1}, + {"XTABS", XTABS, XTABS}, +}; + +#ifndef IBSHIFT +// Shift from CBAUD to CIBAUD. 
+#define IBSHIFT 16 +#endif + +const Field kCflagFields[] = { + {"B0", CBAUD, B0}, + {"B50", CBAUD, B50}, + {"B75", CBAUD, B75}, + {"B110", CBAUD, B110}, + {"B134", CBAUD, B134}, + {"B150", CBAUD, B150}, + {"B200", CBAUD, B200}, + {"B300", CBAUD, B300}, + {"B600", CBAUD, B600}, + {"B1200", CBAUD, B1200}, + {"B1800", CBAUD, B1800}, + {"B2400", CBAUD, B2400}, + {"B4800", CBAUD, B4800}, + {"B9600", CBAUD, B9600}, + {"B19200", CBAUD, B19200}, + {"B38400", CBAUD, B38400}, + {"CS5", CSIZE, CS5}, + {"CS6", CSIZE, CS6}, + {"CS7", CSIZE, CS7}, + {"CS8", CSIZE, CS8}, + {"CSTOPB", CSTOPB, CSTOPB}, + {"CREAD", CREAD, CREAD}, + {"PARENB", PARENB, PARENB}, + {"PARODD", PARODD, PARODD}, + {"HUPCL", HUPCL, HUPCL}, + {"CLOCAL", CLOCAL, CLOCAL}, + {"B57600", CBAUD, B57600}, + {"B115200", CBAUD, B115200}, + {"B230400", CBAUD, B230400}, + {"B460800", CBAUD, B460800}, + {"B500000", CBAUD, B500000}, + {"B576000", CBAUD, B576000}, + {"B921600", CBAUD, B921600}, + {"B1000000", CBAUD, B1000000}, + {"B1152000", CBAUD, B1152000}, + {"B1500000", CBAUD, B1500000}, + {"B2000000", CBAUD, B2000000}, + {"B2500000", CBAUD, B2500000}, + {"B3000000", CBAUD, B3000000}, + {"B3500000", CBAUD, B3500000}, + {"B4000000", CBAUD, B4000000}, + {"CMSPAR", CMSPAR, CMSPAR}, + {"CRTSCTS", CRTSCTS, CRTSCTS}, + {"IB0", CIBAUD, B0 << IBSHIFT}, + {"IB50", CIBAUD, B50 << IBSHIFT}, + {"IB75", CIBAUD, B75 << IBSHIFT}, + {"IB110", CIBAUD, B110 << IBSHIFT}, + {"IB134", CIBAUD, B134 << IBSHIFT}, + {"IB150", CIBAUD, B150 << IBSHIFT}, + {"IB200", CIBAUD, B200 << IBSHIFT}, + {"IB300", CIBAUD, B300 << IBSHIFT}, + {"IB600", CIBAUD, B600 << IBSHIFT}, + {"IB1200", CIBAUD, B1200 << IBSHIFT}, + {"IB1800", CIBAUD, B1800 << IBSHIFT}, + {"IB2400", CIBAUD, B2400 << IBSHIFT}, + {"IB4800", CIBAUD, B4800 << IBSHIFT}, + {"IB9600", CIBAUD, B9600 << IBSHIFT}, + {"IB19200", CIBAUD, B19200 << IBSHIFT}, + {"IB38400", CIBAUD, B38400 << IBSHIFT}, + {"IB57600", CIBAUD, B57600 << IBSHIFT}, + {"IB115200", CIBAUD, B115200 << IBSHIFT}, + {"IB230400", CIBAUD, B230400 << IBSHIFT}, + {"IB460800", CIBAUD, B460800 << IBSHIFT}, + {"IB500000", CIBAUD, B500000 << IBSHIFT}, + {"IB576000", CIBAUD, B576000 << IBSHIFT}, + {"IB921600", CIBAUD, B921600 << IBSHIFT}, + {"IB1000000", CIBAUD, B1000000 << IBSHIFT}, + {"IB1152000", CIBAUD, B1152000 << IBSHIFT}, + {"IB1500000", CIBAUD, B1500000 << IBSHIFT}, + {"IB2000000", CIBAUD, B2000000 << IBSHIFT}, + {"IB2500000", CIBAUD, B2500000 << IBSHIFT}, + {"IB3000000", CIBAUD, B3000000 << IBSHIFT}, + {"IB3500000", CIBAUD, B3500000 << IBSHIFT}, + {"IB4000000", CIBAUD, B4000000 << IBSHIFT}, +}; + +const Field kLflagFields[] = { + {"ISIG", ISIG, ISIG}, {"ICANON", ICANON, ICANON}, + {"XCASE", XCASE, XCASE}, {"ECHO", ECHO, ECHO}, + {"ECHOE", ECHOE, ECHOE}, {"ECHOK", ECHOK, ECHOK}, + {"ECHONL", ECHONL, ECHONL}, {"NOFLSH", NOFLSH, NOFLSH}, + {"TOSTOP", TOSTOP, TOSTOP}, {"ECHOCTL", ECHOCTL, ECHOCTL}, + {"ECHOPRT", ECHOPRT, ECHOPRT}, {"ECHOKE", ECHOKE, ECHOKE}, + {"FLUSHO", FLUSHO, FLUSHO}, {"PENDIN", PENDIN, PENDIN}, + {"IEXTEN", IEXTEN, IEXTEN}, {"EXTPROC", EXTPROC, EXTPROC}, +}; + +std::string FormatCC(char c) { + if (isgraph(c)) { + return std::string(1, c); + } else if (c == ' ') { + return " "; + } else if (c == '\t') { + return "\\t"; + } else if (c == '\r') { + return "\\r"; + } else if (c == '\n') { + return "\\n"; + } else if (c == '\0') { + return "\\0"; + } else if (IsControlCharacter(c)) { + return absl::StrCat("^", std::string(1, FromControlCharacter(c))); + } + return absl::StrCat("\\x", absl::Hex(c)); +} + +std::ostream& operator<<(std::ostream& 
os, struct kernel_termios const& a) { + os << "{ c_iflag = " + << ParseFields(kIflagFields, ABSL_ARRAYSIZE(kIflagFields), a.c_iflag); + os << ", c_oflag = " + << ParseFields(kOflagFields, ABSL_ARRAYSIZE(kOflagFields), a.c_oflag); + os << ", c_cflag = " + << ParseFields(kCflagFields, ABSL_ARRAYSIZE(kCflagFields), a.c_cflag); + os << ", c_lflag = " + << ParseFields(kLflagFields, ABSL_ARRAYSIZE(kLflagFields), a.c_lflag); + os << ", c_line = " << a.c_line; + os << ", c_cc = { [VINTR] = '" << FormatCC(a.c_cc[VINTR]); + os << "', [VQUIT] = '" << FormatCC(a.c_cc[VQUIT]); + os << "', [VERASE] = '" << FormatCC(a.c_cc[VERASE]); + os << "', [VKILL] = '" << FormatCC(a.c_cc[VKILL]); + os << "', [VEOF] = '" << FormatCC(a.c_cc[VEOF]); + os << "', [VTIME] = '" << static_cast(a.c_cc[VTIME]); + os << "', [VMIN] = " << static_cast(a.c_cc[VMIN]); + os << ", [VSWTC] = '" << FormatCC(a.c_cc[VSWTC]); + os << "', [VSTART] = '" << FormatCC(a.c_cc[VSTART]); + os << "', [VSTOP] = '" << FormatCC(a.c_cc[VSTOP]); + os << "', [VSUSP] = '" << FormatCC(a.c_cc[VSUSP]); + os << "', [VEOL] = '" << FormatCC(a.c_cc[VEOL]); + os << "', [VREPRINT] = '" << FormatCC(a.c_cc[VREPRINT]); + os << "', [VDISCARD] = '" << FormatCC(a.c_cc[VDISCARD]); + os << "', [VWERASE] = '" << FormatCC(a.c_cc[VWERASE]); + os << "', [VLNEXT] = '" << FormatCC(a.c_cc[VLNEXT]); + os << "', [VEOL2] = '" << FormatCC(a.c_cc[VEOL2]); + os << "'}"; + return os; +} + +// Return the default termios settings for a new terminal. +struct kernel_termios DefaultTermios() { + struct kernel_termios t = {}; + t.c_iflag = IXON | ICRNL; + t.c_oflag = OPOST | ONLCR; + t.c_cflag = B38400 | CSIZE | CS8 | CREAD; + t.c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN; + t.c_line = 0; + t.c_cc[VINTR] = ControlCharacter('C'); + t.c_cc[VQUIT] = ControlCharacter('\\'); + t.c_cc[VERASE] = '\x7f'; + t.c_cc[VKILL] = ControlCharacter('U'); + t.c_cc[VEOF] = ControlCharacter('D'); + t.c_cc[VTIME] = '\0'; + t.c_cc[VMIN] = 1; + t.c_cc[VSWTC] = '\0'; + t.c_cc[VSTART] = ControlCharacter('Q'); + t.c_cc[VSTOP] = ControlCharacter('S'); + t.c_cc[VSUSP] = ControlCharacter('Z'); + t.c_cc[VEOL] = '\0'; + t.c_cc[VREPRINT] = ControlCharacter('R'); + t.c_cc[VDISCARD] = ControlCharacter('O'); + t.c_cc[VWERASE] = ControlCharacter('W'); + t.c_cc[VLNEXT] = ControlCharacter('V'); + t.c_cc[VEOL2] = '\0'; + return t; +} + +// PollAndReadFd tries to read count bytes from buf within timeout. +// +// Returns a partial read if some bytes were read. +// +// fd must be non-blocking. +PosixErrorOr PollAndReadFd(int fd, void* buf, size_t count, + absl::Duration timeout) { + absl::Time end = absl::Now() + timeout; + + size_t completed = 0; + absl::Duration remaining; + while ((remaining = end - absl::Now()) > absl::ZeroDuration()) { + struct pollfd pfd = {fd, POLLIN, 0}; + int ret = RetryEINTR(poll)(&pfd, 1, absl::ToInt64Milliseconds(remaining)); + if (ret < 0) { + return PosixError(errno, "poll failed"); + } else if (ret == 0) { + // Timed out. + continue; + } else if (ret != 1) { + return PosixError(EINVAL, absl::StrCat("Bad poll ret ", ret)); + } + + ssize_t n = + ReadFd(fd, static_cast(buf) + completed, count - completed); + if (n < 0) { + return PosixError(errno, "read failed"); + } + completed += n; + if (completed >= count) { + return completed; + } + } + + if (completed) { + return completed; + } + return PosixError(ETIMEDOUT, "Poll timed out"); +} + +// Opens the slave end of the passed master as R/W and nonblocking. 
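// OpenSlave below does by hand what the libc helpers wrap: ptsname() queries
// the master's index with TIOCGPTN and unlockpt() clears the lock with
// TIOCSPTLCK. A rough equivalent using those helpers (a sketch only; the
// helper here issues the raw ioctls directly):
//
//   TEST_PCHECK(unlockpt(master.get()) == 0);
//   return Open(ptsname(master.get()), O_RDWR | O_NONBLOCK);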
+PosixErrorOr OpenSlave(const FileDescriptor& master) { + // Get pty index. + int n; + int ret = ioctl(master.get(), TIOCGPTN, &n); + if (ret < 0) { + return PosixError(errno, "ioctl(TIOCGPTN) failed"); + } + + // Unlock pts. + int unlock = 0; + ret = ioctl(master.get(), TIOCSPTLCK, &unlock); + if (ret < 0) { + return PosixError(errno, "ioctl(TIOSPTLCK) failed"); + } + + return Open(absl::StrCat("/dev/pts/", n), O_RDWR | O_NONBLOCK); +} + +TEST(BasicPtyTest, StatUnopenedMaster) { + struct stat s; + ASSERT_THAT(stat("/dev/ptmx", &s), SyscallSucceeds()); + + EXPECT_EQ(s.st_rdev, makedev(TTYAUX_MAJOR, kPtmxMinor)); + EXPECT_EQ(s.st_size, 0); + EXPECT_EQ(s.st_blocks, 0); + + // ptmx attached to a specific devpts mount uses block size 1024. See + // fs/devpts/inode.c:devpts_fill_super. + // + // The global ptmx device uses the block size of the filesystem it is created + // on (which is usually 4096 for disk filesystems). + EXPECT_THAT(s.st_blksize, AnyOf(Eq(1024), Eq(4096))); +} + +// Waits for count bytes to be readable from fd. Unlike poll, which can return +// before all data is moved into a pty's read buffer, this function waits for +// all count bytes to become readable. +PosixErrorOr WaitUntilReceived(int fd, int count) { + int buffered = -1; + absl::Duration remaining; + absl::Time end = absl::Now() + kTimeout; + while ((remaining = end - absl::Now()) > absl::ZeroDuration()) { + if (ioctl(fd, FIONREAD, &buffered) < 0) { + return PosixError(errno, "failed FIONREAD ioctl"); + } + if (buffered >= count) { + return buffered; + } + absl::SleepFor(absl::Milliseconds(500)); + } + return PosixError( + ETIMEDOUT, + absl::StrFormat( + "FIONREAD timed out, receiving only %d of %d expected bytes", + buffered, count)); +} + +// Verifies that there is nothing left to read from fd. +void ExpectFinished(const FileDescriptor& fd) { + // Nothing more to read. + char c; + EXPECT_THAT(ReadFd(fd.get(), &c, 1), SyscallFailsWithErrno(EAGAIN)); +} + +// Verifies that we can read expected bytes from fd into buf. +void ExpectReadable(const FileDescriptor& fd, int expected, char* buf) { + size_t n = ASSERT_NO_ERRNO_AND_VALUE( + PollAndReadFd(fd.get(), buf, expected, kTimeout)); + EXPECT_EQ(expected, n); +} + +TEST(BasicPtyTest, OpenMasterSlave) { + FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR)); + FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master)); +} + +// The slave entry in /dev/pts/ disappears when the master is closed, even if +// the slave is still open. +TEST(BasicPtyTest, SlaveEntryGoneAfterMasterClose) { + FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR)); + FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master)); + + // Get pty index. 
+ int index = -1; + ASSERT_THAT(ioctl(master.get(), TIOCGPTN, &index), SyscallSucceeds()); + + std::string path = absl::StrCat("/dev/pts/", index); + + struct stat st; + EXPECT_THAT(stat(path.c_str(), &st), SyscallSucceeds()); + + master.reset(); + + EXPECT_THAT(stat(path.c_str(), &st), SyscallFailsWithErrno(ENOENT)); +} + +TEST(BasicPtyTest, Getdents) { + FileDescriptor master1 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR)); + int index1 = -1; + ASSERT_THAT(ioctl(master1.get(), TIOCGPTN, &index1), SyscallSucceeds()); + FileDescriptor slave1 = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master1)); + + FileDescriptor master2 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR)); + int index2 = -1; + ASSERT_THAT(ioctl(master2.get(), TIOCGPTN, &index2), SyscallSucceeds()); + FileDescriptor slave2 = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master2)); + + // The directory contains ptmx, index1, and index2. (Plus any additional PTYs + // unrelated to this test.) + + std::vector contents = + ASSERT_NO_ERRNO_AND_VALUE(ListDir("/dev/pts/", true)); + EXPECT_THAT(contents, Contains(absl::StrCat(index1))); + EXPECT_THAT(contents, Contains(absl::StrCat(index2))); + + master2.reset(); + + // The directory contains ptmx and index1, but not index2 since the master is + // closed. (Plus any additional PTYs unrelated to this test.) + + contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/dev/pts/", true)); + EXPECT_THAT(contents, Contains(absl::StrCat(index1))); + EXPECT_THAT(contents, Not(Contains(absl::StrCat(index2)))); + + // N.B. devpts supports legacy "single-instance" mode and new "multi-instance" + // mode. In legacy mode, devpts does not contain a "ptmx" device (the distro + // must use mknod to create it somewhere, presumably /dev/ptmx). + // Multi-instance mode does include a "ptmx" device tied to that mount. + // + // We don't check for the presence or absence of "ptmx", as distros vary in + // their usage of the two modes. +} + +class PtyTest : public ::testing::Test { + protected: + void SetUp() override { + master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK)); + slave_ = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master_)); + } + + void DisableCanonical() { + struct kernel_termios t = {}; + EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds()); + t.c_lflag &= ~ICANON; + EXPECT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + } + + void EnableCanonical() { + struct kernel_termios t = {}; + EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds()); + t.c_lflag |= ICANON; + EXPECT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + } + + // Master and slave ends of the PTY. Non-blocking. + FileDescriptor master_; + FileDescriptor slave_; +}; + +// Master to slave sanity test. +TEST_F(PtyTest, WriteMasterToSlave) { + // N.B. by default, the slave reads nothing until the master writes a newline. + constexpr char kBuf[] = "hello\n"; + + EXPECT_THAT(WriteFd(master_.get(), kBuf, sizeof(kBuf) - 1), + SyscallSucceedsWithValue(sizeof(kBuf) - 1)); + + // Linux moves data from the master to the slave via async work scheduled via + // tty_flip_buffer_push. Since it is asynchronous, the data may not be + // available for reading immediately. Instead we must poll and assert that it + // becomes available "soon". + + char buf[sizeof(kBuf)] = {}; + ExpectReadable(slave_, sizeof(buf) - 1, buf); + + EXPECT_EQ(memcmp(buf, kBuf, sizeof(kBuf)), 0); +} + +// Slave to master sanity test. +TEST_F(PtyTest, WriteSlaveToMaster) { + // N.B. 
by default, the master reads nothing until the slave writes a newline, + // and the master gets a carriage return. + constexpr char kInput[] = "hello\n"; + constexpr char kExpected[] = "hello\r\n"; + + EXPECT_THAT(WriteFd(slave_.get(), kInput, sizeof(kInput) - 1), + SyscallSucceedsWithValue(sizeof(kInput) - 1)); + + // Linux moves data from the master to the slave via async work scheduled via + // tty_flip_buffer_push. Since it is asynchronous, the data may not be + // available for reading immediately. Instead we must poll and assert that it + // becomes available "soon". + + char buf[sizeof(kExpected)] = {}; + ExpectReadable(master_, sizeof(buf) - 1, buf); + + EXPECT_EQ(memcmp(buf, kExpected, sizeof(kExpected)), 0); +} + +// Both the master and slave report the standard default termios settings. +// +// Note that TCGETS on the master actually redirects to the slave (see comment +// on MasterTermiosUnchangable). +TEST_F(PtyTest, DefaultTermios) { + struct kernel_termios t = {}; + EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds()); + EXPECT_EQ(t, DefaultTermios()); + + EXPECT_THAT(ioctl(master_.get(), TCGETS, &t), SyscallSucceeds()); + EXPECT_EQ(t, DefaultTermios()); +} + +// Changing termios from the master actually affects the slave. +// +// TCSETS on the master actually redirects to the slave (see comment on +// MasterTermiosUnchangable). +TEST_F(PtyTest, TermiosAffectsSlave) { + struct kernel_termios master_termios = {}; + EXPECT_THAT(ioctl(master_.get(), TCGETS, &master_termios), SyscallSucceeds()); + master_termios.c_lflag ^= ICANON; + EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds()); + + struct kernel_termios slave_termios = {}; + EXPECT_THAT(ioctl(slave_.get(), TCGETS, &slave_termios), SyscallSucceeds()); + EXPECT_EQ(master_termios, slave_termios); +} + +// The master end of the pty has termios: +// +// struct kernel_termios t = { +// .c_iflag = 0; +// .c_oflag = 0; +// .c_cflag = B38400 | CS8 | CREAD; +// .c_lflag = 0; +// .c_cc = /* same as DefaultTermios */ +// } +// +// (From drivers/tty/pty.c:unix98_pty_init) +// +// All termios control ioctls on the master actually redirect to the slave +// (drivers/tty/tty_ioctl.c:tty_mode_ioctl), making it impossible to change the +// master termios. +// +// Verify this by setting ICRNL (which rewrites input \r to \n) and verify that +// it has no effect on the master. +TEST_F(PtyTest, MasterTermiosUnchangable) { + char c = '\r'; + ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + ExpectReadable(master_, 1, &c); + EXPECT_EQ(c, '\r'); // ICRNL had no effect! + + ExpectFinished(master_); +} + +// ICRNL rewrites input \r to \n. +TEST_F(PtyTest, TermiosICRNL) { + struct kernel_termios t = DefaultTermios(); + t.c_iflag |= ICRNL; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + + char c = '\r'; + ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + ExpectReadable(slave_, 1, &c); + EXPECT_EQ(c, '\n'); + + ExpectFinished(slave_); +} + +// ONLCR rewrites output \n to \r\n. +TEST_F(PtyTest, TermiosONLCR) { + struct kernel_termios t = DefaultTermios(); + t.c_oflag |= ONLCR; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + + char c = '\n'; + ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + // Extra byte for NUL for EXPECT_STREQ. 
+ char buf[3] = {}; + ExpectReadable(master_, 2, buf); + EXPECT_STREQ(buf, "\r\n"); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, TermiosIGNCR) { + struct kernel_termios t = DefaultTermios(); + t.c_iflag |= IGNCR; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + + char c = '\r'; + ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + // Nothing to read. + ASSERT_THAT(PollAndReadFd(slave_.get(), &c, 1, kTimeout), + PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out"))); +} + +// Test that we can successfully poll for readable data from the slave. +TEST_F(PtyTest, TermiosPollSlave) { + struct kernel_termios t = DefaultTermios(); + t.c_iflag |= IGNCR; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + + absl::Notification notify; + int sfd = slave_.get(); + ScopedThread th([sfd, ¬ify]() { + notify.Notify(); + + // Poll on the reader fd with POLLIN event. + struct pollfd poll_fd = {sfd, POLLIN, 0}; + EXPECT_THAT( + RetryEINTR(poll)(&poll_fd, 1, absl::ToInt64Milliseconds(kTimeout)), + SyscallSucceedsWithValue(1)); + + // Should trigger POLLIN event. + EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN); + }); + + notify.WaitForNotification(); + // Sleep ensures that poll begins waiting before we write to the FD. + absl::SleepFor(absl::Seconds(1)); + + char s[] = "foo\n"; + ASSERT_THAT(WriteFd(master_.get(), s, strlen(s) + 1), SyscallSucceeds()); +} + +// Test that we can successfully poll for readable data from the master. +TEST_F(PtyTest, TermiosPollMaster) { + struct kernel_termios t = DefaultTermios(); + t.c_iflag |= IGNCR; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(master_.get(), TCSETS, &t), SyscallSucceeds()); + + absl::Notification notify; + int mfd = master_.get(); + ScopedThread th([mfd, ¬ify]() { + notify.Notify(); + + // Poll on the reader fd with POLLIN event. + struct pollfd poll_fd = {mfd, POLLIN, 0}; + EXPECT_THAT( + RetryEINTR(poll)(&poll_fd, 1, absl::ToInt64Milliseconds(kTimeout)), + SyscallSucceedsWithValue(1)); + + // Should trigger POLLIN event. + EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN); + }); + + notify.WaitForNotification(); + // Sleep ensures that poll begins waiting before we write to the FD. + absl::SleepFor(absl::Seconds(1)); + + char s[] = "foo\n"; + ASSERT_THAT(WriteFd(slave_.get(), s, strlen(s) + 1), SyscallSucceeds()); +} + +TEST_F(PtyTest, TermiosINLCR) { + struct kernel_termios t = DefaultTermios(); + t.c_iflag |= INLCR; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + + char c = '\n'; + ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + ExpectReadable(slave_, 1, &c); + EXPECT_EQ(c, '\r'); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, TermiosONOCR) { + struct kernel_termios t = DefaultTermios(); + t.c_oflag |= ONOCR; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + + // The terminal is at column 0, so there should be no CR to read. + char c = '\r'; + ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + // Nothing to read. + ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout), + PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out"))); + + // This time the column is greater than 0, so we should be able to read the CR + // out of the other end. 
+ constexpr char kInput[] = "foo\r"; + constexpr int kInputSize = sizeof(kInput) - 1; + ASSERT_THAT(WriteFd(slave_.get(), kInput, kInputSize), + SyscallSucceedsWithValue(kInputSize)); + + char buf[kInputSize] = {}; + ExpectReadable(master_, kInputSize, buf); + + EXPECT_EQ(memcmp(buf, kInput, kInputSize), 0); + + ExpectFinished(master_); + + // Terminal should be at column 0 again, so no CR can be read. + ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + // Nothing to read. + ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout), + PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out"))); +} + +TEST_F(PtyTest, TermiosOCRNL) { + struct kernel_termios t = DefaultTermios(); + t.c_oflag |= OCRNL; + t.c_lflag &= ~ICANON; // for byte-by-byte reading. + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + + // The terminal is at column 0, so there should be no CR to read. + char c = '\r'; + ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1)); + + ExpectReadable(master_, 1, &c); + EXPECT_EQ(c, '\n'); + + ExpectFinished(master_); +} + +// Tests that VEOL is disabled when we start, and that we can set it to enable +// it. +TEST_F(PtyTest, VEOLTermination) { + // Write a few bytes ending with '\0', and confirm that we can't read. + constexpr char kInput[] = "hello"; + ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)), + SyscallSucceedsWithValue(sizeof(kInput))); + char buf[sizeof(kInput)] = {}; + ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(kInput), kTimeout), + PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out"))); + + // Set the EOL character to '=' and write it. + constexpr char delim = '='; + struct kernel_termios t = DefaultTermios(); + t.c_cc[VEOL] = delim; + ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds()); + ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1)); + + // Now we can read, as sending EOL caused the line to become available. + ExpectReadable(slave_, sizeof(kInput), buf); + EXPECT_EQ(memcmp(buf, kInput, sizeof(kInput)), 0); + + ExpectReadable(slave_, 1, buf); + EXPECT_EQ(buf[0], '='); + + ExpectFinished(slave_); +} + +// Tests that we can write more than the 4096 character limit, then a +// terminating character, then read out just the first 4095 bytes plus the +// terminator. +TEST_F(PtyTest, CanonBigWrite) { + constexpr int kWriteLen = kMaxLineSize + 4; + char input[kWriteLen]; + memset(input, 'M', kWriteLen - 1); + input[kWriteLen - 1] = '\n'; + ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen), + SyscallSucceedsWithValue(kWriteLen)); + + // We can read the line. + char buf[kMaxLineSize] = {}; + ExpectReadable(slave_, kMaxLineSize, buf); + + ExpectFinished(slave_); +} + +// Tests that data written in canonical mode can be read immediately once +// switched to noncanonical mode. +TEST_F(PtyTest, SwitchCanonToNoncanon) { + // Write a few bytes without a terminating character, switch to noncanonical + // mode, and read them. + constexpr char kInput[] = "hello"; + ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)), + SyscallSucceedsWithValue(sizeof(kInput))); + + // Nothing available yet. 
+ char buf[sizeof(kInput)] = {}; + ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(kInput), kTimeout), + PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out"))); + + DisableCanonical(); + + ExpectReadable(slave_, sizeof(kInput), buf); + EXPECT_STREQ(buf, kInput); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, SwitchCanonToNonCanonNewline) { + // Write a few bytes with a terminating character. + constexpr char kInput[] = "hello\n"; + ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)), + SyscallSucceedsWithValue(sizeof(kInput))); + + DisableCanonical(); + + // We can read the line. + char buf[sizeof(kInput)] = {}; + ExpectReadable(slave_, sizeof(kInput), buf); + EXPECT_STREQ(buf, kInput); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, SwitchNoncanonToCanonNewlineBig) { + DisableCanonical(); + + // Write more than the maximum line size, then write a delimiter. + constexpr int kWriteLen = 4100; + char input[kWriteLen]; + memset(input, 'M', kWriteLen); + ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen), + SyscallSucceedsWithValue(kWriteLen)); + // Wait for the input queue to fill. + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1)); + constexpr char delim = '\n'; + ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1)); + + EnableCanonical(); + + // We can read the line. + char buf[kMaxLineSize] = {}; + ExpectReadable(slave_, kMaxLineSize - 1, buf); + + // We can also read the remaining characters. + ExpectReadable(slave_, 6, buf); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, SwitchNoncanonToCanonNoNewline) { + DisableCanonical(); + + // Write a few bytes without a terminating character. + // mode, and read them. + constexpr char kInput[] = "hello"; + ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput) - 1), + SyscallSucceedsWithValue(sizeof(kInput) - 1)); + + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(kInput) - 1)); + EnableCanonical(); + + // We can read the line. + char buf[sizeof(kInput)] = {}; + ExpectReadable(slave_, sizeof(kInput) - 1, buf); + EXPECT_STREQ(buf, kInput); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, SwitchNoncanonToCanonNoNewlineBig) { + DisableCanonical(); + + // Write a few bytes without a terminating character. + // mode, and read them. + constexpr int kWriteLen = 4100; + char input[kWriteLen]; + memset(input, 'M', kWriteLen); + ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen), + SyscallSucceedsWithValue(kWriteLen)); + + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1)); + EnableCanonical(); + + // We can read the line. + char buf[kMaxLineSize] = {}; + ExpectReadable(slave_, kMaxLineSize - 1, buf); + + ExpectFinished(slave_); +} + +// Tests that we can write over the 4095 noncanonical limit, then read out +// everything. +TEST_F(PtyTest, NoncanonBigWrite) { + DisableCanonical(); + + // Write well over the 4095 internal buffer limit. + constexpr char kInput = 'M'; + constexpr int kInputSize = kMaxLineSize * 2; + for (int i = 0; i < kInputSize; i++) { + // This makes too many syscalls for save/restore. + const DisableSave ds; + ASSERT_THAT(WriteFd(master_.get(), &kInput, sizeof(kInput)), + SyscallSucceedsWithValue(sizeof(kInput))); + } + + // We should be able to read out everything. Sleep a bit so that Linux has a + // chance to move data from the master to the slave. + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1)); + for (int i = 0; i < kInputSize; i++) { + // This makes too many syscalls for save/restore. 
+ const DisableSave ds; + char c; + ExpectReadable(slave_, 1, &c); + ASSERT_EQ(c, kInput); + } + + ExpectFinished(slave_); +} + +// ICANON doesn't make input available until a line delimiter is typed. +// +// Test newline. +TEST_F(PtyTest, TermiosICANONNewline) { + char input[3] = {'a', 'b', 'c'}; + ASSERT_THAT(WriteFd(master_.get(), input, sizeof(input)), + SyscallSucceedsWithValue(sizeof(input))); + + // Extra bytes for newline (written later) and NUL for EXPECT_STREQ. + char buf[5] = {}; + + // Nothing available yet. + ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(input), kTimeout), + PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out"))); + + char delim = '\n'; + ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1)); + + // Now it is available. + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(input) + 1)); + ExpectReadable(slave_, sizeof(input) + 1, buf); + EXPECT_STREQ(buf, "abc\n"); + + ExpectFinished(slave_); +} + +// ICANON doesn't make input available until a line delimiter is typed. +// +// Test EOF (^D). +TEST_F(PtyTest, TermiosICANONEOF) { + char input[3] = {'a', 'b', 'c'}; + ASSERT_THAT(WriteFd(master_.get(), input, sizeof(input)), + SyscallSucceedsWithValue(sizeof(input))); + + // Extra byte for NUL for EXPECT_STREQ. + char buf[4] = {}; + + // Nothing available yet. + ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(input), kTimeout), + PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out"))); + char delim = ControlCharacter('D'); + ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1)); + + // Now it is available. Note that ^D is not included. + ExpectReadable(slave_, sizeof(input), buf); + EXPECT_STREQ(buf, "abc"); + + ExpectFinished(slave_); +} + +// ICANON limits us to 4096 bytes including a terminating character. Anything +// after and 4095th character is discarded (although still processed for +// signals and echoing). +TEST_F(PtyTest, CanonDiscard) { + constexpr char kInput = 'M'; + constexpr int kInputSize = 4100; + constexpr int kIter = 3; + + // A few times write more than the 4096 character maximum, then a newline. + constexpr char delim = '\n'; + for (int i = 0; i < kIter; i++) { + // This makes too many syscalls for save/restore. + const DisableSave ds; + for (int i = 0; i < kInputSize; i++) { + ASSERT_THAT(WriteFd(master_.get(), &kInput, sizeof(kInput)), + SyscallSucceedsWithValue(sizeof(kInput))); + } + ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1)); + } + + // There should be multiple truncated lines available to read. + for (int i = 0; i < kIter; i++) { + char buf[kInputSize] = {}; + ExpectReadable(slave_, kMaxLineSize, buf); + EXPECT_EQ(buf[kMaxLineSize - 1], delim); + EXPECT_EQ(buf[kMaxLineSize - 2], kInput); + } + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, CanonMultiline) { + constexpr char kInput1[] = "GO\n"; + constexpr char kInput2[] = "BLUE\n"; + + // Write both lines. + ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1), + SyscallSucceedsWithValue(sizeof(kInput1) - 1)); + ASSERT_THAT(WriteFd(master_.get(), kInput2, sizeof(kInput2) - 1), + SyscallSucceedsWithValue(sizeof(kInput2) - 1)); + + // Get the first line. + char line1[8] = {}; + ExpectReadable(slave_, sizeof(kInput1) - 1, line1); + EXPECT_STREQ(line1, kInput1); + + // Get the second line. 
+ char line2[8] = {}; + ExpectReadable(slave_, sizeof(kInput2) - 1, line2); + EXPECT_STREQ(line2, kInput2); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, SwitchNoncanonToCanonMultiline) { + DisableCanonical(); + + constexpr char kInput1[] = "GO\n"; + constexpr char kInput2[] = "BLUE\n"; + constexpr char kExpected[] = "GO\nBLUE\n"; + + // Write both lines. + ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1), + SyscallSucceedsWithValue(sizeof(kInput1) - 1)); + ASSERT_THAT(WriteFd(master_.get(), kInput2, sizeof(kInput2) - 1), + SyscallSucceedsWithValue(sizeof(kInput2) - 1)); + + ASSERT_NO_ERRNO( + WaitUntilReceived(slave_.get(), sizeof(kInput1) + sizeof(kInput2) - 2)); + EnableCanonical(); + + // Get all together as one line. + char line[9] = {}; + ExpectReadable(slave_, 8, line); + EXPECT_STREQ(line, kExpected); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, SwitchTwiceMultiline) { + std::string kInputs[] = {"GO\n", "BLUE\n", "!"}; + std::string kExpected = "GO\nBLUE\n!"; + + // Write each line. + for (std::string input : kInputs) { + ASSERT_THAT(WriteFd(master_.get(), input.c_str(), input.size()), + SyscallSucceedsWithValue(input.size())); + } + + DisableCanonical(); + // All written characters have to make it into the input queue before + // canonical mode is re-enabled. If the final '!' character hasn't been + // enqueued before canonical mode is re-enabled, it won't be readable. + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kExpected.size())); + EnableCanonical(); + + // Get all together as one line. + char line[10] = {}; + ExpectReadable(slave_, 9, line); + EXPECT_STREQ(line, kExpected.c_str()); + + ExpectFinished(slave_); +} + +TEST_F(PtyTest, QueueSize) { + // Write the line. + constexpr char kInput1[] = "GO\n"; + ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1), + SyscallSucceedsWithValue(sizeof(kInput1) - 1)); + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(kInput1) - 1)); + + // Ensure that writing more (beyond what is readable) does not impact the + // readable size. + char input[kMaxLineSize]; + memset(input, 'M', kMaxLineSize); + ASSERT_THAT(WriteFd(master_.get(), input, kMaxLineSize), + SyscallSucceedsWithValue(kMaxLineSize)); + int inputBufSize = ASSERT_NO_ERRNO_AND_VALUE( + WaitUntilReceived(slave_.get(), sizeof(kInput1) - 1)); + EXPECT_EQ(inputBufSize, sizeof(kInput1) - 1); +} + +TEST_F(PtyTest, PartialBadBuffer) { + // Allocate 2 pages. + void* addr = mmap(nullptr, 2 * kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(addr, MAP_FAILED); + char* buf = reinterpret_cast(addr); + + // Guard the 2nd page for our read to run into. + ASSERT_THAT( + mprotect(reinterpret_cast(buf + kPageSize), kPageSize, PROT_NONE), + SyscallSucceeds()); + + // Leave only one free byte in the buffer. + char* bad_buffer = buf + kPageSize - 1; + + // Write to the master. + constexpr char kBuf[] = "hello\n"; + constexpr size_t size = sizeof(kBuf) - 1; + EXPECT_THAT(WriteFd(master_.get(), kBuf, size), + SyscallSucceedsWithValue(size)); + + // Read from the slave into bad_buffer. + ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), size)); + EXPECT_THAT(ReadFd(slave_.get(), bad_buffer, size), + SyscallFailsWithErrno(EFAULT)); + + EXPECT_THAT(munmap(addr, 2 * kPageSize), SyscallSucceeds()) << addr; +} + +TEST_F(PtyTest, SimpleEcho) { + constexpr char kInput[] = "Mr. 
Eko"; + EXPECT_THAT(WriteFd(master_.get(), kInput, strlen(kInput)), + SyscallSucceedsWithValue(strlen(kInput))); + + char buf[100] = {}; + ExpectReadable(master_, strlen(kInput), buf); + + EXPECT_STREQ(buf, kInput); + ExpectFinished(master_); +} + +TEST_F(PtyTest, GetWindowSize) { + struct winsize ws; + ASSERT_THAT(ioctl(slave_.get(), TIOCGWINSZ, &ws), SyscallSucceeds()); + EXPECT_EQ(ws.ws_row, 0); + EXPECT_EQ(ws.ws_col, 0); +} + +TEST_F(PtyTest, SetSlaveWindowSize) { + constexpr uint16_t kRows = 343; + constexpr uint16_t kCols = 2401; + struct winsize ws = {.ws_row = kRows, .ws_col = kCols}; + ASSERT_THAT(ioctl(slave_.get(), TIOCSWINSZ, &ws), SyscallSucceeds()); + + struct winsize retrieved_ws = {}; + ASSERT_THAT(ioctl(master_.get(), TIOCGWINSZ, &retrieved_ws), + SyscallSucceeds()); + EXPECT_EQ(retrieved_ws.ws_row, kRows); + EXPECT_EQ(retrieved_ws.ws_col, kCols); +} + +TEST_F(PtyTest, SetMasterWindowSize) { + constexpr uint16_t kRows = 343; + constexpr uint16_t kCols = 2401; + struct winsize ws = {.ws_row = kRows, .ws_col = kCols}; + ASSERT_THAT(ioctl(master_.get(), TIOCSWINSZ, &ws), SyscallSucceeds()); + + struct winsize retrieved_ws = {}; + ASSERT_THAT(ioctl(slave_.get(), TIOCGWINSZ, &retrieved_ws), + SyscallSucceeds()); + EXPECT_EQ(retrieved_ws.ws_row, kRows); + EXPECT_EQ(retrieved_ws.ws_col, kCols); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc new file mode 100644 index 000000000..60ae6de1f --- /dev/null +++ b/test/syscalls/linux/pwrite64.cc @@ -0,0 +1,79 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// This test is currently very rudimentary. +// +// TODO: +// * bad buffer states (EFAULT). +// * bad fds (wrong permission, wrong type of file, EBADF). +// * check offset is not incremented. +// * check for EOF. +// * writing to pipes, symlinks, special files. 
+class Pwrite64 : public ::testing::Test { + void SetUp() override { + name_ = NewTempAbsPath(); + int fd; + ASSERT_THAT(fd = open(name_.c_str(), O_CREAT, 0644), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + } + + void TearDown() override { unlink(name_.c_str()); } + + public: + std::string name_; +}; + +TEST_F(Pwrite64, AppendOnly) { + int fd; + ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds()); + constexpr int64_t kBufSize = 1024; + std::vector buf(kBufSize); + std::fill(buf.begin(), buf.end(), 'a'); + EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0), + SyscallSucceedsWithValue(buf.size())); + EXPECT_THAT(lseek(fd, 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST_F(Pwrite64, InvalidArgs) { + int fd; + ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds()); + constexpr int64_t kBufSize = 1024; + std::vector buf(kBufSize); + std::fill(buf.begin(), buf.end(), 'a'); + EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), -1), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/read.cc b/test/syscalls/linux/read.cc new file mode 100644 index 000000000..eb1b5bc10 --- /dev/null +++ b/test/syscalls/linux/read.cc @@ -0,0 +1,117 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class ReadTest : public ::testing::Test { + void SetUp() override { + name_ = NewTempAbsPath(); + int fd; + ASSERT_THAT(fd = open(name_.c_str(), O_CREAT, 0644), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + } + + void TearDown() override { unlink(name_.c_str()); } + + public: + std::string name_; +}; + +TEST_F(ReadTest, ZeroBuffer) { + int fd; + ASSERT_THAT(fd = open(name_.c_str(), O_RDWR), SyscallSucceeds()); + + char msg[] = "hello world"; + EXPECT_THAT(PwriteFd(fd, msg, strlen(msg), 0), + SyscallSucceedsWithValue(strlen(msg))); + + char buf[10]; + EXPECT_THAT(ReadFd(fd, buf, 0), SyscallSucceedsWithValue(0)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST_F(ReadTest, EmptyFileReturnsZeroAtEOF) { + int fd; + ASSERT_THAT(fd = open(name_.c_str(), O_RDWR), SyscallSucceeds()); + + char eof_buf[10]; + EXPECT_THAT(ReadFd(fd, eof_buf, 10), SyscallSucceedsWithValue(0)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST_F(ReadTest, EofAfterRead) { + int fd; + ASSERT_THAT(fd = open(name_.c_str(), O_RDWR), SyscallSucceeds()); + + // Write some bytes to be read. + constexpr char kMessage[] = "hello world"; + EXPECT_THAT(PwriteFd(fd, kMessage, sizeof(kMessage), 0), + SyscallSucceedsWithValue(sizeof(kMessage))); + + // Read all of the bytes at once. 
+ char buf[sizeof(kMessage)]; + EXPECT_THAT(ReadFd(fd, buf, sizeof(kMessage)), + SyscallSucceedsWithValue(sizeof(kMessage))); + + // Read again with a non-zero buffer and expect EOF. + char eof_buf[10]; + EXPECT_THAT(ReadFd(fd, eof_buf, 10), SyscallSucceedsWithValue(0)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST_F(ReadTest, DevNullReturnsEof) { + int fd; + ASSERT_THAT(fd = open("/dev/null", O_RDONLY), SyscallSucceeds()); + std::vector buf(1); + EXPECT_THAT(ReadFd(fd, buf.data(), 1), SyscallSucceedsWithValue(0)); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +const int kReadSize = 128 * 1024; + +// Do not allow random save as it could lead to partial reads. +TEST_F(ReadTest, CanReadFullyFromDevZero_NoRandomSave) { + int fd; + ASSERT_THAT(fd = open("/dev/zero", O_RDONLY), SyscallSucceeds()); + + std::vector buf(kReadSize, 1); + EXPECT_THAT(ReadFd(fd, buf.data(), kReadSize), + SyscallSucceedsWithValue(kReadSize)); + EXPECT_THAT(close(fd), SyscallSucceeds()); + EXPECT_EQ(std::vector(kReadSize, 0), buf); +} + +TEST_F(ReadTest, ReadDirectoryFails) { + const FileDescriptor file = + ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), O_RDONLY)); + std::vector buf(1); + EXPECT_THAT(ReadFd(file.get(), buf.data(), 1), SyscallFailsWithErrno(EISDIR)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc new file mode 100644 index 000000000..0b933673a --- /dev/null +++ b/test/syscalls/linux/readv.cc @@ -0,0 +1,293 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
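// readv(2) scatters one sequential read across the supplied iovec array:
// iov[0] is filled completely before any bytes land in iov[1], and the return
// value is the total count across all buffers. A minimal sketch of that
// contract, assuming fd is open and readable:
//
//   char a[3], b[3];
//   struct iovec iov[2] = {{a, sizeof(a)}, {b, sizeof(b)}};
//   ssize_t n = readv(fd, iov, 2);  // 0 <= n <= 6; short reads are allowed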
+ +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/syscalls/linux/readv_common.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class ReadvTest : public FileTest { + void SetUp() override { + FileTest::SetUp(); + + ASSERT_THAT(write(test_file_fd_.get(), kReadvTestData, kReadvTestDataSize), + SyscallSucceedsWithValue(kReadvTestDataSize)); + ASSERT_THAT(lseek(test_file_fd_.get(), 0, SEEK_SET), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(write(test_pipe_[1], kReadvTestData, kReadvTestDataSize), + SyscallSucceedsWithValue(kReadvTestDataSize)); + } +}; + +TEST_F(ReadvTest, ReadOneBufferPerByte_File) { + ReadOneBufferPerByte(test_file_fd_.get()); +} + +TEST_F(ReadvTest, ReadOneBufferPerByte_Pipe) { + ReadOneBufferPerByte(test_pipe_[0]); +} + +TEST_F(ReadvTest, ReadOneHalfAtATime_File) { + ReadOneHalfAtATime(test_file_fd_.get()); +} + +TEST_F(ReadvTest, ReadOneHalfAtATime_Pipe) { + ReadOneHalfAtATime(test_pipe_[0]); +} + +TEST_F(ReadvTest, ReadAllOneBuffer_File) { + ReadAllOneBuffer(test_file_fd_.get()); +} + +TEST_F(ReadvTest, ReadAllOneBuffer_Pipe) { ReadAllOneBuffer(test_pipe_[0]); } + +TEST_F(ReadvTest, ReadAllOneLargeBuffer_File) { + ReadAllOneLargeBuffer(test_file_fd_.get()); +} + +TEST_F(ReadvTest, ReadAllOneLargeBuffer_Pipe) { + ReadAllOneLargeBuffer(test_pipe_[0]); +} + +TEST_F(ReadvTest, ReadBuffersOverlapping_File) { + ReadBuffersOverlapping(test_file_fd_.get()); +} + +TEST_F(ReadvTest, ReadBuffersOverlapping_Pipe) { + ReadBuffersOverlapping(test_pipe_[0]); +} + +TEST_F(ReadvTest, ReadBuffersDiscontinuous_File) { + ReadBuffersDiscontinuous(test_file_fd_.get()); +} + +TEST_F(ReadvTest, ReadBuffersDiscontinuous_Pipe) { + ReadBuffersDiscontinuous(test_pipe_[0]); +} + +TEST_F(ReadvTest, ReadIovecsCompletelyFilled_File) { + ReadIovecsCompletelyFilled(test_file_fd_.get()); +} + +TEST_F(ReadvTest, ReadIovecsCompletelyFilled_Pipe) { + ReadIovecsCompletelyFilled(test_pipe_[0]); +} + +TEST_F(ReadvTest, BadFileDescriptor) { + char buffer[1024]; + struct iovec iov[1]; + iov[0].iov_base = buffer; + iov[0].iov_len = 1024; + + ASSERT_THAT(readv(-1, iov, 1024), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(ReadvTest, BadIovecsPointer_File) { + ASSERT_THAT(readv(test_file_fd_.get(), nullptr, 1), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvTest, BadIovecsPointer_Pipe) { + ASSERT_THAT(readv(test_pipe_[0], nullptr, 1), SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvTest, BadIovecBase_File) { + struct iovec iov[1]; + iov[0].iov_base = nullptr; + iov[0].iov_len = 1024; + ASSERT_THAT(readv(test_file_fd_.get(), iov, 1), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvTest, BadIovecBase_Pipe) { + struct iovec iov[1]; + iov[0].iov_base = nullptr; + iov[0].iov_len = 1024; + ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvTest, ZeroIovecs_File) { + struct iovec iov[1]; + iov[0].iov_base = 0; + iov[0].iov_len = 0; + ASSERT_THAT(readv(test_file_fd_.get(), iov, 1), SyscallSucceeds()); +} + +TEST_F(ReadvTest, ZeroIovecs_Pipe) { + struct iovec iov[1]; + iov[0].iov_base = 0; + iov[0].iov_len = 0; + ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallSucceeds()); +} + +TEST_F(ReadvTest, NotReadable_File) { + char buffer[1024]; + struct iovec iov[1]; + iov[0].iov_base = buffer; + iov[0].iov_len = 
1024; + + std::string wronly_file = NewTempAbsPath(); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(wronly_file, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR)); + ASSERT_THAT(readv(fd.get(), iov, 1), SyscallFailsWithErrno(EBADF)); + fd.reset(); // Close before unlinking. + ASSERT_THAT(unlink(wronly_file.c_str()), SyscallSucceeds()); +} + +TEST_F(ReadvTest, NotReadable_Pipe) { + char buffer[1024]; + struct iovec iov[1]; + iov[0].iov_base = buffer; + iov[0].iov_len = 1024; + ASSERT_THAT(readv(test_pipe_[1], iov, 1), SyscallFailsWithErrno(EBADF)); +} + +TEST_F(ReadvTest, DirNotReadable) { + char buffer[1024]; + struct iovec iov[1]; + iov[0].iov_base = buffer; + iov[0].iov_len = 1024; + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), O_RDONLY)); + ASSERT_THAT(readv(fd.get(), iov, 1), SyscallFailsWithErrno(EISDIR)); +} + +TEST_F(ReadvTest, OffsetIncremented) { + char* buffer = reinterpret_cast(malloc(kReadvTestDataSize)); + struct iovec iov[1]; + iov[0].iov_base = buffer; + iov[0].iov_len = kReadvTestDataSize; + + ASSERT_THAT(readv(test_file_fd_.get(), iov, 1), + SyscallSucceedsWithValue(kReadvTestDataSize)); + ASSERT_THAT(lseek(test_file_fd_.get(), 0, SEEK_CUR), + SyscallSucceedsWithValue(kReadvTestDataSize)); + + free(buffer); +} + +TEST_F(ReadvTest, EndOfFile) { + char* buffer = reinterpret_cast(malloc(kReadvTestDataSize)); + struct iovec iov[1]; + iov[0].iov_base = buffer; + iov[0].iov_len = kReadvTestDataSize; + ASSERT_THAT(readv(test_file_fd_.get(), iov, 1), + SyscallSucceedsWithValue(kReadvTestDataSize)); + free(buffer); + + buffer = reinterpret_cast(malloc(kReadvTestDataSize)); + iov[0].iov_base = buffer; + iov[0].iov_len = kReadvTestDataSize; + ASSERT_THAT(readv(test_file_fd_.get(), iov, 1), SyscallSucceedsWithValue(0)); + free(buffer); +} + +TEST_F(ReadvTest, WouldBlock_Pipe) { + struct iovec iov[1]; + iov[0].iov_base = reinterpret_cast(malloc(kReadvTestDataSize)); + iov[0].iov_len = kReadvTestDataSize; + ASSERT_THAT(readv(test_pipe_[0], iov, 1), + SyscallSucceedsWithValue(kReadvTestDataSize)); + free(iov[0].iov_base); + + iov[0].iov_base = reinterpret_cast(malloc(kReadvTestDataSize)); + ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallFailsWithErrno(EAGAIN)); + free(iov[0].iov_base); +} + +TEST_F(ReadvTest, ZeroBuffer) { + char buf[10]; + struct iovec iov[1]; + iov[0].iov_base = buf; + iov[0].iov_len = 0; + ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallSucceedsWithValue(0)); +} + +TEST_F(ReadvTest, NullIovecInNonemptyArray) { + std::vector buf(kReadvTestDataSize); + struct iovec iov[2]; + iov[0].iov_base = nullptr; + iov[0].iov_len = 0; + iov[1].iov_base = buf.data(); + iov[1].iov_len = kReadvTestDataSize; + ASSERT_THAT(readv(test_file_fd_.get(), iov, 2), + SyscallSucceedsWithValue(kReadvTestDataSize)); +} + +TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) { + std::vector buf(kReadvTestDataSize); + struct iovec iov[2]; + iov[0].iov_base = reinterpret_cast(~static_cast(0)); + iov[0].iov_len = 0; + iov[1].iov_base = buf.data(); + iov[1].iov_len = kReadvTestDataSize; + ASSERT_THAT(readv(test_file_fd_.get(), iov, 2), + SyscallFailsWithErrno(EFAULT)); +} + +// This test depends on the maximum extent of a single readv() syscall, so +// we can't tolerate interruption from saving. +TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) { + // Ensure that we won't be interrupted by ITIMER_PROF. 
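// An all-zero itimerval passed to setitimer() disarms the timer, so no SIGPROF
// can arrive mid-readv and shorten the single large transfer this test
// measures. The equivalent raw call (ScopedItimer presumably wraps setitimer
// and restores the previous value on destruction) would be roughly:
//
//   struct itimerval zero = {};
//   struct itimerval old = {};
//   ASSERT_THAT(setitimer(ITIMER_PROF, &zero, &old), SyscallSucceeds());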
+ struct itimerval itv = {}; + auto const cleanup_itimer = + ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv)); + + // From Linux's include/linux/fs.h. + size_t const MAX_RW_COUNT = INT_MAX & ~(kPageSize - 1); + + // Create an iovec array with 3 segments pointing to consecutive parts of a + // buffer. The first covers all but the last three pages, and should be + // written to in its entirety. The second covers the last page before + // MAX_RW_COUNT and the first page after; only the first page should be + // written to. The third covers the last page of the buffer, and should be + // skipped entirely. + size_t const kBufferSize = MAX_RW_COUNT + 2 * kPageSize; + size_t const kFirstOffset = MAX_RW_COUNT - kPageSize; + size_t const kSecondOffset = MAX_RW_COUNT + kPageSize; + // The buffer is too big to fit on the stack. + std::vector buf(kBufferSize); + struct iovec iov[3]; + iov[0].iov_base = buf.data(); + iov[0].iov_len = kFirstOffset; + iov[1].iov_base = buf.data() + kFirstOffset; + iov[1].iov_len = kSecondOffset - kFirstOffset; + iov[2].iov_base = buf.data() + kSecondOffset; + iov[2].iov_len = kBufferSize - kSecondOffset; + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY)); + EXPECT_THAT(readv(fd.get(), iov, 3), SyscallSucceedsWithValue(MAX_RW_COUNT)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc new file mode 100644 index 000000000..349b80d7f --- /dev/null +++ b/test/syscalls/linux/readv_common.cc @@ -0,0 +1,180 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
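// For the MAX_RW_COUNT arithmetic in the truncation test above: with 4 KiB
// pages, INT_MAX & ~(kPageSize - 1) == 0x7fffffff & ~0xfff == 0x7ffff000
// == 2147479552 bytes, i.e. INT_MAX rounded down to a page boundary, which is
// the most a single read/readv call will transfer.
//
//   static_assert((0x7fffffffu & ~0xfffu) == 0x7ffff000u,
//                 "INT_MAX rounded down to a 4 KiB page boundary");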
+ +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +extern const char kReadvTestData[] = + "127.0.0.1 localhost" + "" + "# The following lines are desirable for IPv6 capable hosts" + "::1 ip6-localhost ip6-loopback" + "fe00::0 ip6-localnet" + "ff00::0 ip6-mcastprefix" + "ff02::1 ip6-allnodes" + "ff02::2 ip6-allrouters" + "ff02::3 ip6-allhosts" + "192.168.1.100 a" + "93.184.216.34 foo.bar.example.com xcpu"; +extern const size_t kReadvTestDataSize = sizeof(kReadvTestData); + +static void ReadAllOneProvidedBuffer(int fd, std::vector* buffer) { + struct iovec iovs[1]; + iovs[0].iov_base = buffer->data(); + iovs[0].iov_len = kReadvTestDataSize; + + ASSERT_THAT(readv(fd, iovs, 1), SyscallSucceedsWithValue(kReadvTestDataSize)); + + std::pair iovec_desc(iovs, 1); + EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize)); + EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData)); +} + +void ReadAllOneBuffer(int fd) { + std::vector buffer(kReadvTestDataSize); + ReadAllOneProvidedBuffer(fd, &buffer); +} + +void ReadAllOneLargeBuffer(int fd) { + std::vector buffer(10 * kReadvTestDataSize); + ReadAllOneProvidedBuffer(fd, &buffer); +} + +void ReadOneHalfAtATime(int fd) { + int len0 = kReadvTestDataSize / 2; + int len1 = kReadvTestDataSize - len0; + std::vector buffer0(len0); + std::vector buffer1(len1); + + struct iovec iovs[2]; + iovs[0].iov_base = buffer0.data(); + iovs[0].iov_len = len0; + iovs[1].iov_base = buffer1.data(); + iovs[1].iov_len = len1; + + ASSERT_THAT(readv(fd, iovs, 2), SyscallSucceedsWithValue(kReadvTestDataSize)); + + std::pair iovec_desc(iovs, 2); + EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize)); + EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData)); +} + +void ReadOneBufferPerByte(int fd) { + std::vector buffer(kReadvTestDataSize); + std::vector iovs(kReadvTestDataSize); + char* buffer_ptr = buffer.data(); + struct iovec* iovs_ptr = iovs.data(); + + for (int i = 0; i < static_cast(kReadvTestDataSize); i++) { + struct iovec iov = { + .iov_base = &buffer_ptr[i], + .iov_len = 1, + }; + iovs_ptr[i] = iov; + } + + ASSERT_THAT(readv(fd, iovs_ptr, kReadvTestDataSize), + SyscallSucceedsWithValue(kReadvTestDataSize)); + + std::pair iovec_desc(iovs.data(), kReadvTestDataSize); + EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize)); + EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData)); +} + +void ReadBuffersOverlapping(int fd) { + // overlap the first overlap_bytes. + int overlap_bytes = 8; + std::vector buffer(kReadvTestDataSize); + + // overlapping causes us to get more data. 
+ int expected_size = kReadvTestDataSize + overlap_bytes; + std::vector expected(expected_size); + char* expected_ptr = expected.data(); + memcpy(expected_ptr, &kReadvTestData[overlap_bytes], overlap_bytes); + memcpy(&expected_ptr[overlap_bytes], &kReadvTestData[overlap_bytes], + kReadvTestDataSize); + + struct iovec iovs[2]; + iovs[0].iov_base = buffer.data(); + iovs[0].iov_len = overlap_bytes; + iovs[1].iov_base = buffer.data(); + iovs[1].iov_len = kReadvTestDataSize; + + ASSERT_THAT(readv(fd, iovs, 2), SyscallSucceedsWithValue(kReadvTestDataSize)); + + std::pair iovec_desc(iovs, 2); + EXPECT_THAT(iovec_desc, MatchesStringLength(expected_size)); + EXPECT_THAT(iovec_desc, MatchesStringValue(expected_ptr)); +} + +void ReadBuffersDiscontinuous(int fd) { + // Each iov is 1 byte separated by 1 byte. + std::vector buffer(kReadvTestDataSize * 2); + std::vector iovs(kReadvTestDataSize); + + char* buffer_ptr = buffer.data(); + struct iovec* iovs_ptr = iovs.data(); + + for (int i = 0; i < static_cast(kReadvTestDataSize); i++) { + struct iovec iov = { + .iov_base = &buffer_ptr[i * 2], + .iov_len = 1, + }; + iovs_ptr[i] = iov; + } + + ASSERT_THAT(readv(fd, iovs_ptr, kReadvTestDataSize), + SyscallSucceedsWithValue(kReadvTestDataSize)); + + std::pair iovec_desc(iovs.data(), kReadvTestDataSize); + EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize)); + EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData)); +} + +void ReadIovecsCompletelyFilled(int fd) { + int half = kReadvTestDataSize / 2; + std::vector buffer(kReadvTestDataSize); + char* buffer_ptr = buffer.data(); + memset(buffer.data(), '\0', kReadvTestDataSize); + + struct iovec iovs[2]; + iovs[0].iov_base = buffer.data(); + iovs[0].iov_len = half; + iovs[1].iov_base = &buffer_ptr[half]; + iovs[1].iov_len = half; + + ASSERT_THAT(readv(fd, iovs, 2), SyscallSucceedsWithValue(half * 2)); + + std::pair iovec_desc(iovs, 2); + EXPECT_THAT(iovec_desc, MatchesStringLength(half * 2)); + EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData)); + + char* str = static_cast(iovs[0].iov_base); + str[iovs[0].iov_len - 1] = '\0'; + ASSERT_EQ(half - 1, strlen(str)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/readv_common.h b/test/syscalls/linux/readv_common.h new file mode 100644 index 000000000..e261d545a --- /dev/null +++ b/test/syscalls/linux/readv_common.h @@ -0,0 +1,61 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_READV_COMMON_H_ +#define GVISOR_TEST_SYSCALLS_READV_COMMON_H_ + +#include + +namespace gvisor { +namespace testing { + +// A NUL-terminated std::string containing the data used by tests using the following +// test helpers. +extern const char kReadvTestData[]; + +// The size of kReadvTestData, including the terminating NUL. +extern const size_t kReadvTestDataSize; + +// ReadAllOneBuffer asserts that it can read kReadvTestData from an fd using +// exactly one iovec. 
+void ReadAllOneBuffer(int fd); + +// ReadAllOneLargeBuffer asserts that it can read kReadvTestData from an fd +// using exactly one iovec containing an overly large buffer. +void ReadAllOneLargeBuffer(int fd); + +// ReadOneHalfAtATime asserts that it can read test_data_from an fd using +// exactly two iovecs that are roughly equivalent in size. +void ReadOneHalfAtATime(int fd); + +// ReadOneBufferPerByte asserts that it can read kReadvTestData from an fd +// using one iovec per byte. +void ReadOneBufferPerByte(int fd); + +// ReadBuffersOverlapping asserts that it can read kReadvTestData from an fd +// where two iovecs are overlapping. +void ReadBuffersOverlapping(int fd); + +// ReadBuffersDiscontinuous asserts that it can read kReadvTestData from an fd +// where each iovec is discontinuous from the next by 1 byte. +void ReadBuffersDiscontinuous(int fd); + +// ReadIovecsCompletelyFilled asserts that the previous iovec is completely +// filled before moving onto the next. +void ReadIovecsCompletelyFilled(int fd); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_READV_COMMON_H_ diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc new file mode 100644 index 000000000..2c129b7e8 --- /dev/null +++ b/test/syscalls/linux/readv_socket.cc @@ -0,0 +1,182 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/syscalls/linux/readv_common.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class ReadvSocketTest : public SocketTest { + void SetUp() override { + SocketTest::SetUp(); + ASSERT_THAT( + write(test_unix_stream_socket_[1], kReadvTestData, kReadvTestDataSize), + SyscallSucceedsWithValue(kReadvTestDataSize)); + ASSERT_THAT( + write(test_unix_dgram_socket_[1], kReadvTestData, kReadvTestDataSize), + SyscallSucceedsWithValue(kReadvTestDataSize)); + ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData, + kReadvTestDataSize), + SyscallSucceedsWithValue(kReadvTestDataSize)); + // FIXME: Enable when possible. 
+ // ASSERT_THAT(write(test_tcp_socket_[1], kReadvTestData, + // kReadvTestDataSize), + // SyscallSucceedsWithValue(kReadvTestDataSize)); + } +}; + +TEST_F(ReadvSocketTest, ReadOneBufferPerByte_StreamSocket) { + ReadOneBufferPerByte(test_unix_stream_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadOneBufferPerByte_DgramSocket) { + ReadOneBufferPerByte(test_unix_dgram_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadOneBufferPerByte_SeqPacketSocket) { + ReadOneBufferPerByte(test_unix_seqpacket_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadOneHalfAtATime_StreamSocket) { + ReadOneHalfAtATime(test_unix_stream_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadOneHalfAtATime_DgramSocket) { + ReadOneHalfAtATime(test_unix_dgram_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadAllOneBuffer_StreamSocket) { + ReadAllOneBuffer(test_unix_stream_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadAllOneBuffer_DgramSocket) { + ReadAllOneBuffer(test_unix_dgram_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadAllOneLargeBuffer_StreamSocket) { + ReadAllOneLargeBuffer(test_unix_stream_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadAllOneLargeBuffer_DgramSocket) { + ReadAllOneLargeBuffer(test_unix_dgram_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadBuffersOverlapping_StreamSocket) { + ReadBuffersOverlapping(test_unix_stream_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadBuffersOverlapping_DgramSocket) { + ReadBuffersOverlapping(test_unix_dgram_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadBuffersDiscontinuous_StreamSocket) { + ReadBuffersDiscontinuous(test_unix_stream_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadBuffersDiscontinuous_DgramSocket) { + ReadBuffersDiscontinuous(test_unix_dgram_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadIovecsCompletelyFilled_StreamSocket) { + ReadIovecsCompletelyFilled(test_unix_stream_socket_[0]); +} + +TEST_F(ReadvSocketTest, ReadIovecsCompletelyFilled_DgramSocket) { + ReadIovecsCompletelyFilled(test_unix_dgram_socket_[0]); +} + +TEST_F(ReadvSocketTest, BadIovecsPointer_StreamSocket) { + ASSERT_THAT(readv(test_unix_stream_socket_[0], nullptr, 1), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvSocketTest, BadIovecsPointer_DgramSocket) { + ASSERT_THAT(readv(test_unix_dgram_socket_[0], nullptr, 1), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvSocketTest, BadIovecBase_StreamSocket) { + struct iovec iov[1]; + iov[0].iov_base = nullptr; + iov[0].iov_len = 1024; + ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvSocketTest, BadIovecBase_DgramSocket) { + struct iovec iov[1]; + iov[0].iov_base = nullptr; + iov[0].iov_len = 1024; + ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(ReadvSocketTest, ZeroIovecs_StreamSocket) { + struct iovec iov[1]; + iov[0].iov_base = 0; + iov[0].iov_len = 0; + ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1), SyscallSucceeds()); +} + +TEST_F(ReadvSocketTest, ZeroIovecs_DgramSocket) { + struct iovec iov[1]; + iov[0].iov_base = 0; + iov[0].iov_len = 0; + ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1), SyscallSucceeds()); +} + +TEST_F(ReadvSocketTest, WouldBlock_StreamSocket) { + struct iovec iov[1]; + iov[0].iov_base = reinterpret_cast(malloc(kReadvTestDataSize)); + iov[0].iov_len = kReadvTestDataSize; + ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1), + SyscallSucceedsWithValue(kReadvTestDataSize)); + free(iov[0].iov_base); + + iov[0].iov_base = reinterpret_cast(malloc(kReadvTestDataSize)); + 
ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1), + SyscallFailsWithErrno(EAGAIN)); + free(iov[0].iov_base); +} + +TEST_F(ReadvSocketTest, WouldBlock_DgramSocket) { + struct iovec iov[1]; + iov[0].iov_base = reinterpret_cast(malloc(kReadvTestDataSize)); + iov[0].iov_len = kReadvTestDataSize; + ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1), + SyscallSucceedsWithValue(kReadvTestDataSize)); + free(iov[0].iov_base); + + iov[0].iov_base = reinterpret_cast(malloc(kReadvTestDataSize)); + ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1), + SyscallFailsWithErrno(EAGAIN)); + free(iov[0].iov_base); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc new file mode 100644 index 000000000..f4c877a00 --- /dev/null +++ b/test/syscalls/linux/rename.cc @@ -0,0 +1,373 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(RenameTest, RootToAnything) { + ASSERT_THAT(rename("/", "/bin"), SyscallFailsWithErrno(EBUSY)); +} + +TEST(RenameTest, AnythingToRoot) { + ASSERT_THAT(rename("/bin", "/"), SyscallFailsWithErrno(EBUSY)); +} + +TEST(RenameTest, SourceIsAncestorOfTarget) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto subdir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path())); + ASSERT_THAT(rename(dir.path().c_str(), subdir.path().c_str()), + SyscallFailsWithErrno(EINVAL)); + + // Try an even deeper directory. + auto deep_subdir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(subdir.path())); + ASSERT_THAT(rename(dir.path().c_str(), deep_subdir.path().c_str()), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(RenameTest, TargetIsAncestorOfSource) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto subdir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path())); + ASSERT_THAT(rename(subdir.path().c_str(), dir.path().c_str()), + SyscallFailsWithErrno(ENOTEMPTY)); + + // Try an even deeper directory. 
+ auto deep_subdir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(subdir.path())); + ASSERT_THAT(rename(deep_subdir.path().c_str(), dir.path().c_str()), + SyscallFailsWithErrno(ENOTEMPTY)); +} + +TEST(RenameTest, FileToSelf) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + EXPECT_THAT(rename(f.path().c_str(), f.path().c_str()), SyscallSucceeds()); +} + +TEST(RenameTest, DirectoryToSelf) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(rename(f.path().c_str(), f.path().c_str()), SyscallSucceeds()); +} + +TEST(RenameTest, FileToSameDirectory) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + std::string const newpath = NewTempAbsPath(); + ASSERT_THAT(rename(f.path().c_str(), newpath.c_str()), SyscallSucceeds()); + std::string const oldpath = f.release(); + f.reset(newpath); + EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false)); + EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true)); +} + +TEST(RenameTest, DirectoryToSameDirectory) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + std::string const newpath = NewTempAbsPath(); + ASSERT_THAT(rename(dir.path().c_str(), newpath.c_str()), SyscallSucceeds()); + std::string const oldpath = dir.release(); + dir.reset(newpath); + EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false)); + EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true)); +} + +TEST(RenameTest, FileToParentDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path())); + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir2.path())); + std::string const newpath = NewTempAbsPathInDir(dir1.path()); + ASSERT_THAT(rename(f.path().c_str(), newpath.c_str()), SyscallSucceeds()); + std::string const oldpath = f.release(); + f.reset(newpath); + EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false)); + EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true)); +} + +TEST(RenameTest, DirectoryToParentDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path())); + auto dir3 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir2.path())); + EXPECT_THAT(IsDirectory(dir3.path()), IsPosixErrorOkAndHolds(true)); + std::string const newpath = NewTempAbsPathInDir(dir1.path()); + ASSERT_THAT(rename(dir3.path().c_str(), newpath.c_str()), SyscallSucceeds()); + std::string const oldpath = dir3.release(); + dir3.reset(newpath); + EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false)); + EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true)); + EXPECT_THAT(IsDirectory(newpath), IsPosixErrorOkAndHolds(true)); +} + +TEST(RenameTest, FileToChildDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path())); + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + std::string const newpath = NewTempAbsPathInDir(dir2.path()); + ASSERT_THAT(rename(f.path().c_str(), newpath.c_str()), SyscallSucceeds()); + std::string const oldpath = f.release(); + f.reset(newpath); + EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false)); + EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true)); +} + +TEST(RenameTest, DirectoryToChildDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path())); + auto dir3 = 
ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path())); + std::string const newpath = NewTempAbsPathInDir(dir2.path()); + ASSERT_THAT(rename(dir3.path().c_str(), newpath.c_str()), SyscallSucceeds()); + std::string const oldpath = dir3.release(); + dir3.reset(newpath); + EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false)); + EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true)); + EXPECT_THAT(IsDirectory(newpath), IsPosixErrorOkAndHolds(true)); +} + +TEST(RenameTest, DirectoryToOwnChildDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path())); + std::string const newpath = NewTempAbsPathInDir(dir2.path()); + ASSERT_THAT(rename(dir1.path().c_str(), newpath.c_str()), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(RenameTest, FileOverwritesFile) { + auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), "first", TempPath::kDefaultFileMode)); + auto f2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), "second", TempPath::kDefaultFileMode)); + ASSERT_THAT(rename(f1.path().c_str(), f2.path().c_str()), SyscallSucceeds()); + EXPECT_THAT(Exists(f1.path()), IsPosixErrorOkAndHolds(false)); + + f1.release(); + std::string f2_contents; + ASSERT_NO_ERRNO(GetContents(f2.path(), &f2_contents)); + EXPECT_EQ("first", f2_contents); +} + +TEST(RenameTest, FileDoesNotExist) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string source = JoinPath(dir.path(), "source"); + const std::string dest = JoinPath(dir.path(), "dest"); + ASSERT_THAT(rename(source.c_str(), dest.c_str()), + SyscallFailsWithErrno(ENOENT)); +} + +TEST(RenameTest, FileDoesNotOverwriteDirectory) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_THAT(rename(f.path().c_str(), dir.path().c_str()), + SyscallFailsWithErrno(EISDIR)); +} + +TEST(RenameTest, DirectoryDoesNotOverwriteFile) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_THAT(rename(dir.path().c_str(), f.path().c_str()), + SyscallFailsWithErrno(ENOTDIR)); +} + +TEST(RenameTest, DirectoryOverwritesEmptyDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()), + SyscallSucceeds()); + EXPECT_THAT(Exists(dir1.path()), IsPosixErrorOkAndHolds(false)); + dir1.release(); + EXPECT_THAT(Exists(JoinPath(dir2.path(), Basename(f.path()))), + IsPosixErrorOkAndHolds(true)); + f.release(); +} + +TEST(RenameTest, FailsWithDots) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto dir1_dot = absl::StrCat(dir1.path(), "/."); + auto dir2_dot = absl::StrCat(dir2.path(), "/."); + auto dir1_dot_dot = absl::StrCat(dir1.path(), "/.."); + auto dir2_dot_dot = absl::StrCat(dir2.path(), "/.."); + + // Try with dot paths in the first argument + EXPECT_THAT(rename(dir1_dot.c_str(), dir2.path().c_str()), + SyscallFailsWithErrno(EBUSY)); + EXPECT_THAT(rename(dir1_dot_dot.c_str(), dir2.path().c_str()), + SyscallFailsWithErrno(EBUSY)); + + // Try with dot paths in the second argument + EXPECT_THAT(rename(dir1.path().c_str(), dir2_dot.c_str()), + 
SyscallFailsWithErrno(EBUSY)); + EXPECT_THAT(rename(dir1.path().c_str(), dir2_dot_dot.c_str()), + SyscallFailsWithErrno(EBUSY)); +} + +TEST(RenameTest, DirectoryDoesNotOverwriteNonemptyDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir2.path())); + ASSERT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()), + SyscallFailsWithErrno(ENOTEMPTY)); +} + +TEST(RenameTest, FailsWhenOldParentNotWritable) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + // dir1 is not writable. + ASSERT_THAT(chmod(dir1.path().c_str(), 0555), SyscallSucceeds()); + + std::string const newpath = NewTempAbsPathInDir(dir2.path()); + EXPECT_THAT(rename(f1.path().c_str(), newpath.c_str()), + SyscallFailsWithErrno(EACCES)); +} + +TEST(RenameTest, FailsWhenNewParentNotWritable) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + // dir2 is not writable. + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555)); + + std::string const newpath = NewTempAbsPathInDir(dir2.path()); + EXPECT_THAT(rename(f1.path().c_str(), newpath.c_str()), + SyscallFailsWithErrno(EACCES)); +} + +// Equivalent to FailsWhenNewParentNotWritable, but with a destination file +// to overwrite. +TEST(RenameTest, OverwriteFailsWhenNewParentNotWritable) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + + // dir2 is not writable. + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir2.path())); + ASSERT_THAT(chmod(dir2.path().c_str(), 0555), SyscallSucceeds()); + + EXPECT_THAT(rename(f1.path().c_str(), f2.path().c_str()), + SyscallFailsWithErrno(EACCES)); +} + +// If the parent directory of source is not accessible, rename returns EACCES +// because the user cannot determine if source exists. +TEST(RenameTest, FileDoesNotExistWhenNewParentNotExecutable) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + // No execute permission. 
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0400)); + + const std::string source = JoinPath(dir.path(), "source"); + const std::string dest = JoinPath(dir.path(), "dest"); + ASSERT_THAT(rename(source.c_str(), dest.c_str()), + SyscallFailsWithErrno(EACCES)); +} + +TEST(RenameTest, DirectoryWithOpenFdOverwritesEmptyDirectory) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Get an fd on dir1 + int fd; + ASSERT_THAT(fd = open(dir1.path().c_str(), O_DIRECTORY), SyscallSucceeds()); + auto close_f = Cleanup([fd] { + // Close the fd on f. + EXPECT_THAT(close(fd), SyscallSucceeds()); + }); + + EXPECT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()), + SyscallSucceeds()); + + const std::string new_f_path = JoinPath(dir2.path(), Basename(f.path())); + + auto remove_f = Cleanup([&] { + // Delete f in its new location. + ASSERT_NO_ERRNO(Delete(new_f_path)); + f.release(); + }); + + EXPECT_THAT(Exists(dir1.path()), IsPosixErrorOkAndHolds(false)); + dir1.release(); + EXPECT_THAT(Exists(new_f_path), IsPosixErrorOkAndHolds(true)); +} + +TEST(RenameTest, FileWithOpenFd) { + TempPath root_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath dir1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root_dir.path())); + TempPath dir2 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root_dir.path())); + TempPath dir3 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root_dir.path())); + + // Create file in dir1. + constexpr char kContents[] = "foo"; + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + dir1.path(), kContents, TempPath::kDefaultFileMode)); + + // Get fd on file. + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); + + // Move f to dir2. + const std::string path2 = NewTempAbsPathInDir(dir2.path()); + ASSERT_THAT(rename(f.path().c_str(), path2.c_str()), SyscallSucceeds()); + + // Read f's kContents. + char buf[sizeof(kContents)]; + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(kContents), 0), + SyscallSucceedsWithValue(sizeof(kContents) - 1)); + EXPECT_EQ(absl::string_view(buf, sizeof(buf) - 1), kContents); + + // Move f to dir3. + const std::string path3 = NewTempAbsPathInDir(dir3.path()); + ASSERT_THAT(rename(path2.c_str(), path3.c_str()), SyscallSucceeds()); + + // Read f's kContents. + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(kContents), 0), + SyscallSucceedsWithValue(sizeof(kContents) - 1)); + EXPECT_EQ(absl::string_view(buf, sizeof(buf) - 1), kContents); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/rlimits.cc b/test/syscalls/linux/rlimits.cc new file mode 100644 index 000000000..0072285f9 --- /dev/null +++ b/test/syscalls/linux/rlimits.cc @@ -0,0 +1,61 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
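A standalone sketch (paths and file contents below are arbitrary, not taken from the patch) of the property the FileWithOpenFd test above relies on: an open descriptor refers to the underlying file, so rename() leaves it fully usable.

// Sketch only: an fd opened before rename() keeps working afterwards.
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main() {
  const char* src = "/tmp/rename_fd_demo_src";  // arbitrary demo paths
  const char* dst = "/tmp/rename_fd_demo_dst";

  int fd = open(src, O_CREAT | O_RDWR | O_TRUNC, 0644);
  if (fd < 0) return 1;
  write(fd, "foo", 3);

  std::rename(src, dst);  // the old path disappears, the inode does not
  char buf[4] = {};
  pread(fd, buf, 3, 0);       // still readable through the same descriptor
  std::printf("%s\n", buf);   // expected to print "foo"

  close(fd);
  unlink(dst);
  return 0;
}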
+ +#include +#include + +#include "test/util/capability_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(RlimitTest, SetRlimitHigher) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE))); + SKIP_IF(!IsRunningOnGvisor()); + + struct rlimit rl = {}; + EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + + // Even with CAP_SYS_RESOURCE, gVisor does not allow setting a higher rlimit. + rl.rlim_max++; + EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallFailsWithErrno(EPERM)); +} + +TEST(RlimitTest, UnprivilegedSetRlimit) { + // Drop privileges if necessary. + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE))) { + EXPECT_NO_ERRNO(SetCapability(CAP_SYS_RESOURCE, false)); + } + + struct rlimit rl = {}; + rl.rlim_cur = 1000; + rl.rlim_max = 20000; + EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + + struct rlimit rl2 = {}; + EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl2), SyscallSucceeds()); + EXPECT_EQ(rl.rlim_cur, rl2.rlim_cur); + EXPECT_EQ(rl.rlim_max, rl2.rlim_max); + + rl.rlim_max = 100000; + EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallFailsWithErrno(EPERM)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc new file mode 100644 index 000000000..1f2fed7cc --- /dev/null +++ b/test/syscalls/linux/rtsignal.cc @@ -0,0 +1,172 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "test/util/cleanup.h" +#include "test/util/logging.h" +#include "test/util/posix_error.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// saved_info is set by the handler. +siginfo_t saved_info; + +// has_saved_info is set to true by the handler. +volatile bool has_saved_info; + +void SigHandler(int sig, siginfo_t* info, void* context) { + // Copy to the given info. + saved_info = *info; + has_saved_info = true; +} + +void ClearSavedInfo() { + // Clear the cached info. + memset(&saved_info, 0, sizeof(saved_info)); + has_saved_info = false; +} + +PosixErrorOr SetupSignalHandler(int sig) { + struct sigaction sa; + sa.sa_sigaction = SigHandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + return ScopedSigaction(sig, sa); +} + +class RtSignalTest : public ::testing::Test { + protected: + void SetUp() override { + action_cleanup_ = ASSERT_NO_ERRNO_AND_VALUE(SetupSignalHandler(SIGUSR1)); + mask_cleanup_ = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGUSR1)); + } + + void TearDown() override { ClearSavedInfo(); } + + private: + Cleanup action_cleanup_; + Cleanup mask_cleanup_; +}; + +static int rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t* uinfo) { + int ret; + do { + // NOTE: rt_sigqueueinfo(2) could return EAGAIN for RT signals. 
+ ret = syscall(SYS_rt_sigqueueinfo, tgid, sig, uinfo); + } while (ret == -1 && errno == EAGAIN); + return ret; +} + +TEST_F(RtSignalTest, InvalidTID) { + siginfo_t uinfo; + // Depending on the kernel version, these calls may fail with + // ESRCH (goobunutu machines) or EPERM (production machines). Thus, + // the test simply ensures that they do fail. + EXPECT_THAT(rt_sigqueueinfo(-1, SIGUSR1, &uinfo), SyscallFails()); + EXPECT_FALSE(has_saved_info); + EXPECT_THAT(rt_sigqueueinfo(0, SIGUSR1, &uinfo), SyscallFails()); + EXPECT_FALSE(has_saved_info); +} + +TEST_F(RtSignalTest, InvalidCodes) { + siginfo_t uinfo; + + // We need a child for the code checks to apply. If the process is delivering + // to itself, then it can use whatever codes it wants and they will go + // through. + pid_t child = fork(); + if (child == 0) { + _exit(1); + } + ASSERT_THAT(child, SyscallSucceeds()); + + // These are not allowed for child processes. + uinfo.si_code = 0; // SI_USER. + EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo), + SyscallFailsWithErrno(EPERM)); + uinfo.si_code = 0x80; // SI_KERNEL. + EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo), + SyscallFailsWithErrno(EPERM)); + uinfo.si_code = -6; // SI_TKILL. + EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo), + SyscallFailsWithErrno(EPERM)); + uinfo.si_code = -1; // SI_QUEUE (allowed). + EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo), SyscallSucceeds()); + + // Join the child process. + EXPECT_THAT(waitpid(child, nullptr, 0), SyscallSucceeds()); +} + +TEST_F(RtSignalTest, ValueDelivered) { + siginfo_t uinfo; + uinfo.si_code = -1; // SI_QUEUE (allowed). + uinfo.si_errno = 0x1234; + + EXPECT_EQ(saved_info.si_errno, 0x0); + EXPECT_THAT(rt_sigqueueinfo(getpid(), SIGUSR1, &uinfo), SyscallSucceeds()); + EXPECT_TRUE(has_saved_info); + EXPECT_EQ(saved_info.si_errno, 0x1234); +} + +TEST_F(RtSignalTest, SignoMatch) { + auto action2_cleanup = ASSERT_NO_ERRNO_AND_VALUE(SetupSignalHandler(SIGUSR2)); + auto mask2_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGUSR2)); + + siginfo_t uinfo; + uinfo.si_code = -1; // SI_QUEUE (allowed). + + EXPECT_THAT(rt_sigqueueinfo(getpid(), SIGUSR1, &uinfo), SyscallSucceeds()); + EXPECT_TRUE(has_saved_info); + EXPECT_EQ(saved_info.si_signo, SIGUSR1); + + ClearSavedInfo(); + + EXPECT_THAT(rt_sigqueueinfo(getpid(), SIGUSR2, &uinfo), SyscallSucceeds()); + EXPECT_TRUE(has_saved_info); + EXPECT_EQ(saved_info.si_signo, SIGUSR2); +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + // These tests depend on delivering SIGUSR1/2 to the main thread (so they can + // synchronously check has_saved_info). Block these so that any other threads + // created by TestInit will also have them blocked. + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGUSR2); + TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0); + + gvisor::testing::TestInit(&argc, &argv); + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/sched.cc b/test/syscalls/linux/sched.cc new file mode 100644 index 000000000..60cb6c443 --- /dev/null +++ b/test/syscalls/linux/sched.cc @@ -0,0 +1,71 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// In linux, pid is limited to 29 bits because how futex is implemented. +constexpr int kImpossiblePID = (1 << 29) + 1; + +TEST(SchedGetparamTest, ReturnsZero) { + struct sched_param param; + EXPECT_THAT(sched_getparam(getpid(), ¶m), SyscallSucceeds()); + EXPECT_EQ(param.sched_priority, 0); + EXPECT_THAT(sched_getparam(/*pid=*/0, ¶m), SyscallSucceeds()); + EXPECT_EQ(param.sched_priority, 0); +} + +TEST(SchedGetparamTest, InvalidPIDReturnsEINVAL) { + struct sched_param param; + EXPECT_THAT(sched_getparam(/*pid=*/-1, ¶m), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SchedGetparamTest, ImpossiblePIDReturnsESRCH) { + struct sched_param param; + EXPECT_THAT(sched_getparam(kImpossiblePID, ¶m), + SyscallFailsWithErrno(ESRCH)); +} + +TEST(SchedGetparamTest, NullParamReturnsEINVAL) { + EXPECT_THAT(sched_getparam(0, nullptr), SyscallFailsWithErrno(EINVAL)); +} + +TEST(SchedGetschedulerTest, ReturnsSchedOther) { + EXPECT_THAT(sched_getscheduler(getpid()), + SyscallSucceedsWithValue(SCHED_OTHER)); + EXPECT_THAT(sched_getscheduler(/*pid=*/0), + SyscallSucceedsWithValue(SCHED_OTHER)); +} + +TEST(SchedGetschedulerTest, ReturnsEINVAL) { + EXPECT_THAT(sched_getscheduler(/*pid=*/-1), SyscallFailsWithErrno(EINVAL)); +} + +TEST(SchedGetschedulerTest, ReturnsESRCH) { + EXPECT_THAT(sched_getscheduler(kImpossiblePID), SyscallFailsWithErrno(ESRCH)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sched_yield.cc b/test/syscalls/linux/sched_yield.cc new file mode 100644 index 000000000..fc45aa5c2 --- /dev/null +++ b/test/syscalls/linux/sched_yield.cc @@ -0,0 +1,33 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(SchedYieldTest, Success) { + EXPECT_THAT(sched_yield(), SyscallSucceeds()); + EXPECT_THAT(sched_yield(), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc new file mode 100644 index 000000000..d6ac166a4 --- /dev/null +++ b/test/syscalls/linux/seccomp.cc @@ -0,0 +1,374 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/macros.h" +#include "test/util/logging.h" +#include "test/util/memory_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/proc_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +#ifndef SYS_SECCOMP +#define SYS_SECCOMP 1 +#endif + +namespace gvisor { +namespace testing { + +namespace { + +// A syscall not implemented by Linux that we don't expect to be called. +constexpr uint32_t kFilteredSyscall = SYS_vserver; + +// Applies a seccomp-bpf filter that returns `filtered_result` for +// `sysno` and allows all other syscalls. Async-signal-safe. +void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result, + uint32_t flags = 0) { + // "Prior to [PR_SET_SECCOMP], the task must call prctl(PR_SET_NO_NEW_PRIVS, + // 1) or run with CAP_SYS_ADMIN privileges in its namespace." - + // Documentation/prctl/seccomp_filter.txt + // + // prctl(PR_SET_NO_NEW_PRIVS, 1) may be called repeatedly; calls after the + // first are no-ops. + TEST_PCHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0); + MaybeSave(); + + struct sock_filter filter[] = { + // A = seccomp_data.arch + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4), + // if (A != AUDIT_ARCH_X86_64) goto kill + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4), + // A = seccomp_data.nr + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0), + // if (A != sysno) goto allow + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1), + // return filtered_result + BPF_STMT(BPF_RET | BPF_K, filtered_result), + // allow: return SECCOMP_RET_ALLOW + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + // kill: return SECCOMP_RET_KILL + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL), + }; + struct sock_fprog prog; + prog.len = ABSL_ARRAYSIZE(filter); + prog.filter = filter; + if (flags) { + TEST_CHECK(syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &prog) == + 0); + } else { + TEST_PCHECK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == 0); + } + MaybeSave(); +} + +// Wrapper for sigaction. Async-signal-safe. +void RegisterSignalHandler(int signum, + void (*handler)(int, siginfo_t*, void*)) { + struct sigaction sa = {}; + sa.sa_sigaction = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + TEST_PCHECK(sigaction(signum, &sa, nullptr) == 0); + MaybeSave(); +} + +// All of the following tests execute in a subprocess to ensure that each test +// is run in a separate process. This avoids cross-contamination of seccomp +// state between tests, and is necessary to ensure that test processes killed +// by SECCOMP_RET_KILL are single-threaded (since SECCOMP_RET_KILL only kills +// the offending thread, not the whole thread group). + +TEST(SeccompTest, RetKillCausesDeathBySIGSYS) { + pid_t const pid = fork(); + if (pid == 0) { + // Register a signal handler for SIGSYS that we don't expect to be invoked. 
+ RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); + syscall(kFilteredSyscall); + TEST_CHECK_MSG(false, "Survived invocation of test syscall"); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS) + << "status " << status; +} + +TEST(SeccompTest, RetKillOnlyKillsOneThread) { + Mapping stack = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + + pid_t const pid = fork(); + if (pid == 0) { + // Register a signal handler for SIGSYS that we don't expect to be invoked. + RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); + // Pass CLONE_VFORK to block the original thread in the child process until + // the clone thread exits with SIGSYS. + // + // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's + // x86_64 implementation is safe. See glibc + // sysdeps/unix/sysv/linux/x86_64/clone.S. + clone( + +[](void* arg) { + syscall(kFilteredSyscall); // should kill the thread + _exit(1); // should be unreachable + return 2; // should be very unreachable, shut up the compiler + }, + stack.endptr(), + CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM | + CLONE_VFORK, + nullptr); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +TEST(SeccompTest, RetTrapCausesSIGSYS) { + pid_t const pid = fork(); + if (pid == 0) { + constexpr uint16_t kTrapValue = 0xdead; + RegisterSignalHandler(SIGSYS, +[](int signo, siginfo_t* info, void*) { + // This is a signal handler, so we must stay async-signal-safe. + TEST_CHECK(info->si_signo == SIGSYS); + TEST_CHECK(info->si_code == SYS_SECCOMP); + TEST_CHECK(info->si_errno == kTrapValue); + TEST_CHECK(info->si_call_addr != nullptr); + TEST_CHECK(info->si_syscall == kFilteredSyscall); + TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); + _exit(0); + }); + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRAP | kTrapValue); + syscall(kFilteredSyscall); + TEST_CHECK_MSG(false, "Survived invocation of test syscall"); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400; + +time_t vsyscall_time(time_t* t) { + return reinterpret_cast(kVsyscallTimeEntry)(t); +} + +TEST(SeccompTest, SeccompAppliesToVsyscall) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); + + pid_t const pid = fork(); + if (pid == 0) { + constexpr uint16_t kTrapValue = 0xdead; + RegisterSignalHandler(SIGSYS, +[](int signo, siginfo_t* info, void*) { + // This is a signal handler, so we must stay async-signal-safe. + TEST_CHECK(info->si_signo == SIGSYS); + TEST_CHECK(info->si_code == SYS_SECCOMP); + TEST_CHECK(info->si_errno == kTrapValue); + TEST_CHECK(info->si_call_addr != nullptr); + TEST_CHECK(info->si_syscall == SYS_time); + TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); + _exit(0); + }); + ApplySeccompFilter(SYS_time, SECCOMP_RET_TRAP | kTrapValue); + vsyscall_time(nullptr); // Should result in death. 
+ TEST_CHECK_MSG(false, "Survived invocation of test syscall"); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +TEST(SeccompTest, RetTraceWithoutPtracerReturnsENOSYS) { + pid_t const pid = fork(); + if (pid == 0) { + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE); + TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +TEST(SeccompTest, RetErrnoReturnsErrno) { + pid_t const pid = fork(); + if (pid == 0) { + // ENOTNAM: "Not a XENIX named type file" + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM); + TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +TEST(SeccompTest, RetAllowAllowsSyscall) { + pid_t const pid = fork(); + if (pid == 0) { + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ALLOW); + TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +// This test will validate that TSYNC will apply to all threads. +TEST(SeccompTest, TsyncAppliesToAllThreads) { + Mapping stack = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + + // We don't want to apply this policy to other test runner threads, so fork. + const pid_t pid = fork(); + + if (pid == 0) { + // First check that we receive a ENOSYS before the policy is applied. + TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS); + + // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's + // x86_64 implementation is safe. See glibc + // sysdeps/unix/sysv/linux/x86_64/clone.S. + clone( + +[](void* arg) { + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM, + SECCOMP_FILTER_FLAG_TSYNC); + return 0; + }, + stack.endptr(), + CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM | + CLONE_VFORK, + nullptr); + + // Because we're using CLONE_VFORK this thread will be blocked until + // the second thread has released resources to our virtual memory, since + // we're not execing that will happen on _exit. + + // Now verify that the policy applied to this thread too. + TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM); + _exit(0); + } + + ASSERT_THAT(pid, SyscallSucceeds()); + int status = 0; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +// This test will validate that seccomp(2) rejects unsupported flags. 
+TEST(SeccompTest, SeccompRejectsUnknownFlags) { + constexpr uint32_t kInvalidFlag = 123; + ASSERT_THAT( + syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, kInvalidFlag, nullptr), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SeccompTest, LeastPermissiveFilterReturnValueApplies) { + // This is RetKillCausesDeathBySIGSYS, plus extra filters before and after the + // one that causes the kill that should be ignored. + pid_t const pid = fork(); + if (pid == 0) { + RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE); + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM); + syscall(kFilteredSyscall); + TEST_CHECK_MSG(false, "Survived invocation of test syscall"); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS) + << "status " << status; +} + +// Passed as argv[1] to cause the test binary to invoke kFilteredSyscall and +// exit. Not a real flag since flag parsing happens during initialization, +// which may create threads. +constexpr char kInvokeFilteredSyscallFlag[] = "--seccomp_test_child"; + +TEST(SeccompTest, FiltersPreservedAcrossForkAndExecve) { + ExecveArray const grandchild_argv( + {"/proc/self/exe", kInvokeFilteredSyscallFlag}); + + pid_t const pid = fork(); + if (pid == 0) { + ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); + pid_t const grandchild_pid = fork(); + if (grandchild_pid == 0) { + execve(grandchild_argv.get()[0], grandchild_argv.get(), + /* envp = */ nullptr); + TEST_PCHECK_MSG(false, "execve failed"); + } + int status; + TEST_PCHECK(waitpid(grandchild_pid, &status, 0) == grandchild_pid); + TEST_CHECK(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS); + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status " << status; +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + if (argc >= 2 && + strcmp(argv[1], gvisor::testing::kInvokeFilteredSyscallFlag) == 0) { + syscall(gvisor::testing::kFilteredSyscall); + exit(0); + } + + gvisor::testing::TestInit(&argc, &argv); + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc new file mode 100644 index 000000000..6b6fa9217 --- /dev/null +++ b/test/syscalls/linux/select.cc @@ -0,0 +1,128 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
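A minimal sketch (the timeout value is arbitrary) of the behavior the NullFds test below verifies: with no fd sets, select() simply waits out the timeout, and Linux rewrites the timeval with the time remaining.

// Sketch only: select() with no fd sets acts as a sub-second sleep.
#include <sys/select.h>
#include <cstdio>

int main() {
  struct timeval tv = {0, 10000};  // 10 ms
  int n = select(0, nullptr, nullptr, nullptr, &tv);
  // Expected: n == 0 (nothing ready) and tv rewritten to the time left,
  // i.e. {0, 0} once the full timeout has elapsed.
  std::printf("ready=%d remaining=%ld.%06ld\n", n, (long)tv.tv_sec,
              (long)tv.tv_usec);
  return 0;
}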
+ +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/base_poll_test.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +class SelectTest : public BasePollTest { + protected: + void SetUp() override { BasePollTest::SetUp(); } + void TearDown() override { BasePollTest::TearDown(); } +}; + +// See that when there are no FD sets, select behaves like sleep. +TEST_F(SelectTest, NullFds) { + struct timeval timeout = absl::ToTimeval(absl::Milliseconds(10)); + ASSERT_THAT(select(0, nullptr, nullptr, nullptr, &timeout), + SyscallSucceeds()); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_usec, 0); + + timeout = absl::ToTimeval(absl::Milliseconds(10)); + ASSERT_THAT(select(1, nullptr, nullptr, nullptr, &timeout), + SyscallSucceeds()); + EXPECT_EQ(timeout.tv_sec, 0); + EXPECT_EQ(timeout.tv_usec, 0); +} + +TEST_F(SelectTest, NegativeNfds) { + EXPECT_THAT(select(-1, nullptr, nullptr, nullptr, nullptr), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(select(-100000, nullptr, nullptr, nullptr, nullptr), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(select(INT_MIN, nullptr, nullptr, nullptr, nullptr), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(SelectTest, ClosedFds) { + fd_set read_set; + FD_ZERO(&read_set); + int fd; + ASSERT_THAT(fd = dup(1), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + FD_SET(fd, &read_set); + struct timeval timeout = absl::ToTimeval(absl::Milliseconds(10)); + EXPECT_THAT(select(fd + 1, &read_set, nullptr, nullptr, &timeout), + SyscallFailsWithErrno(EBADF)); +} + +TEST_F(SelectTest, ZeroTimeout) { + struct timeval timeout = {}; + EXPECT_THAT(select(1, nullptr, nullptr, nullptr, &timeout), + SyscallSucceeds()); + // Ignore timeout as its value is now undefined. +} + +// If random S/R interrupts the select, SIGALRM may be delivered before select +// restarts, causing the select to hang forever. +TEST_F(SelectTest, NoTimeout_NoRandomSave) { + // When there's no timeout, select may never return so set a timer. + SetTimer(absl::Milliseconds(100)); + // See that we get interrupted by the timer. + ASSERT_THAT(select(1, nullptr, nullptr, nullptr, nullptr), + SyscallFailsWithErrno(EINTR)); + EXPECT_TRUE(TimerFired()); +} + +TEST_F(SelectTest, InvalidTimeoutNegative) { + struct timeval timeout = absl::ToTimeval(absl::Microseconds(-1)); + EXPECT_THAT(select(1, nullptr, nullptr, nullptr, &timeout), + SyscallFailsWithErrno(EINVAL)); + // Ignore timeout as its value is now undefined. +} + +// Verify that a signal interrupts select. +// +// If random S/R interrupts the select, SIGALRM may be delivered before select +// restarts, causing the select to hang forever. +TEST_F(SelectTest, InterruptedBySignal_NoRandomSave) { + absl::Duration duration(absl::Seconds(5)); + struct timeval timeout = absl::ToTimeval(duration); + SetTimer(absl::Milliseconds(100)); + ASSERT_FALSE(TimerFired()); + ASSERT_THAT(select(1, nullptr, nullptr, nullptr, &timeout), + SyscallFailsWithErrno(EINTR)); + EXPECT_TRUE(TimerFired()); + // Ignore timeout as its value is now undefined. +} + +TEST_F(SelectTest, IgnoreBitsAboveNfds) { + // fd_set is a bit array with at least FD_SETSIZE bits. Test that bits + // corresponding to file descriptors above nfds are ignored. 
+ fd_set read_set; + FD_ZERO(&read_set); + constexpr int kNfds = 1; + for (int fd = kNfds; fd < FD_SETSIZE; fd++) { + FD_SET(fd, &read_set); + } + // Pass a zero timeout so that select returns immediately. + struct timeval timeout = {}; + EXPECT_THAT(select(kNfds, &read_set, nullptr, nullptr, &timeout), + SyscallSucceedsWithValue(0)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc new file mode 100644 index 000000000..12e33732d --- /dev/null +++ b/test/syscalls/linux/semaphore.cc @@ -0,0 +1,438 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/macros.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/clock.h" +#include "test/util/capability_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { +namespace { + +class AutoSem { + public: + explicit AutoSem(int id) : id_(id) {} + ~AutoSem() { + if (id_ >= 0) { + EXPECT_THAT(semctl(id_, 0, IPC_RMID), SyscallSucceeds()); + } + } + + int release() { + int old = id_; + id_ = -1; + return old; + } + + int get() { return id_; } + + private: + int id_ = -1; +}; + +TEST(SemaphoreTest, SemGet) { + // Test creation and lookup. + AutoSem sem(semget(1, 10, IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + EXPECT_THAT(semget(1, 10, IPC_CREAT), SyscallSucceedsWithValue(sem.get())); + EXPECT_THAT(semget(1, 9, IPC_CREAT), SyscallSucceedsWithValue(sem.get())); + + // Creation and lookup failure cases. + EXPECT_THAT(semget(1, 11, IPC_CREAT), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(semget(1, -1, IPC_CREAT), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(semget(1, 10, IPC_CREAT | IPC_EXCL), + SyscallFailsWithErrno(EEXIST)); + EXPECT_THAT(semget(2, 1, 0), SyscallFailsWithErrno(ENOENT)); + EXPECT_THAT(semget(2, 0, IPC_CREAT), SyscallFailsWithErrno(EINVAL)); + + // Private semaphores never conflict. + AutoSem sem2(semget(IPC_PRIVATE, 1, 0)); + AutoSem sem3(semget(IPC_PRIVATE, 1, 0)); + ASSERT_THAT(sem2.get(), SyscallSucceeds()); + EXPECT_NE(sem.get(), sem2.get()); + ASSERT_THAT(sem3.get(), SyscallSucceeds()); + EXPECT_NE(sem3.get(), sem2.get()); +} + +// Tests simple operations that shouldn't block in a single-thread. +TEST(SemaphoreTest, SemOpSingleNoBlock) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + struct sembuf buf = {}; + buf.sem_op = 1; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds()); + + buf.sem_op = -1; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds()); + + buf.sem_op = 0; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds()); + + // Error cases with invalid values. 
+ ASSERT_THAT(semop(sem.get() + 1, &buf, 1), SyscallFailsWithErrno(EINVAL)); + + buf.sem_num = 1; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EFBIG)); + + ASSERT_THAT(semop(sem.get(), nullptr, 0), SyscallFailsWithErrno(EINVAL)); +} + +// Tests multiple operations that shouldn't block in a single-thread. +TEST(SemaphoreTest, SemOpMultiNoBlock) { + AutoSem sem(semget(IPC_PRIVATE, 4, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + struct sembuf bufs[5] = {}; + bufs[0].sem_num = 0; + bufs[0].sem_op = 10; + bufs[0].sem_flg = 0; + + bufs[1].sem_num = 1; + bufs[1].sem_op = 2; + bufs[1].sem_flg = 0; + + bufs[2].sem_num = 2; + bufs[2].sem_op = 3; + bufs[2].sem_flg = 0; + + bufs[3].sem_num = 0; + bufs[3].sem_op = -5; + bufs[3].sem_flg = 0; + + bufs[4].sem_num = 2; + bufs[4].sem_op = 2; + bufs[4].sem_flg = 0; + + ASSERT_THAT(semop(sem.get(), bufs, ABSL_ARRAYSIZE(bufs)), SyscallSucceeds()); + + ASSERT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(5)); + ASSERT_THAT(semctl(sem.get(), 1, GETVAL), SyscallSucceedsWithValue(2)); + ASSERT_THAT(semctl(sem.get(), 2, GETVAL), SyscallSucceedsWithValue(5)); + ASSERT_THAT(semctl(sem.get(), 3, GETVAL), SyscallSucceedsWithValue(0)); + + for (auto& b : bufs) { + b.sem_op = -b.sem_op; + } + // 0 and 3 order must be reversed, otherwise it will block. + std::swap(bufs[0].sem_op, bufs[3].sem_op); + ASSERT_THAT(RetryEINTR(semop)(sem.get(), bufs, ABSL_ARRAYSIZE(bufs)), + SyscallSucceeds()); + + // All semaphores should be back to 0 now. + for (size_t i = 0; i < 4; ++i) { + ASSERT_THAT(semctl(sem.get(), i, GETVAL), SyscallSucceedsWithValue(0)); + } +} + +// Makes a best effort attempt to ensure that operation would block. +TEST(SemaphoreTest, SemOpBlock) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + std::atomic blocked = ATOMIC_VAR_INIT(1); + ScopedThread th([&sem, &blocked] { + absl::SleepFor(absl::Milliseconds(100)); + ASSERT_EQ(blocked.load(), 1); + + struct sembuf buf = {}; + buf.sem_op = 1; + ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds()); + }); + + struct sembuf buf = {}; + buf.sem_op = -1; + ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds()); + blocked.store(0); +} + +// Tests that IPC_NOWAIT returns with no wait. +TEST(SemaphoreTest, SemOpNoBlock) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + struct sembuf buf = {}; + buf.sem_flg = IPC_NOWAIT; + + buf.sem_op = -1; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EAGAIN)); + + buf.sem_op = 1; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds()); + + buf.sem_op = 0; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EAGAIN)); +} + +// Test runs 2 threads, one signals the other waits the same number of times. +TEST(SemaphoreTest, SemOpSimple) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + constexpr size_t kLoops = 100; + ScopedThread th([&sem] { + struct sembuf buf = {}; + buf.sem_op = 1; + for (size_t i = 0; i < kLoops; i++) { + // Sleep to prevent making all increments in one shot without letting + // the waiter wait. 
+      absl::SleepFor(absl::Milliseconds(1));
+      ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+    }
+  });
+
+  struct sembuf buf = {};
+  buf.sem_op = -1;
+  for (size_t i = 0; i < kLoops; i++) {
+    ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+  }
+}
+
+// Tests that semaphore can be removed while there are waiters.
+// NoRandomSave: Test relies on timing that random save throws off.
+TEST(SemaphoreTest, SemOpRemoveWithWaiter_NoRandomSave) {
+  AutoSem sem(semget(IPC_PRIVATE, 2, 0600 | IPC_CREAT));
+  ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+  ScopedThread th([&sem] {
+    absl::SleepFor(absl::Milliseconds(250));
+    ASSERT_THAT(semctl(sem.release(), 0, IPC_RMID), SyscallSucceeds());
+  });
+
+  // This must happen before IPC_RMID runs above. Otherwise it fails with EINVAL
+  // instead because the semaphore has already been removed.
+  struct sembuf buf = {};
+  buf.sem_op = -1;
+  ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1),
+              SyscallFailsWithErrno(EIDRM));
+}
+
+// Semaphore isn't fair. It will execute any waiter that can satisfy the
+// request even if it gets in front of other waiters.
+TEST(SemaphoreTest, SemOpBestFitExecution) {
+  AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+  ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+  ScopedThread th([&sem] {
+    struct sembuf buf = {};
+    buf.sem_op = -2;
+    ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallFails());
+    // Ensure that wait will only unblock when the semaphore is removed. On
+    // EINTR retry it may race with deletion and return EINVAL.
+    ASSERT_TRUE(errno == EIDRM || errno == EINVAL) << "errno=" << errno;
+  });
+
+  // Ensures that '-1' below will unblock even though '-2' above is waiting
+  // for the same semaphore.
+  for (size_t i = 0; i < 10; ++i) {
+    struct sembuf buf = {};
+    buf.sem_op = 1;
+    ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+
+    absl::SleepFor(absl::Milliseconds(10));
+
+    buf.sem_op = -1;
+    ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+  }
+
+  ASSERT_THAT(semctl(sem.release(), 0, IPC_RMID), SyscallSucceeds());
+}
+
+// Executes random operations in multiple threads and verifies correctness.
+TEST(SemaphoreTest, SemOpRandom) {
+  // Don't do cooperative S/R tests because there are too many syscalls in
+  // this test.
+  const DisableSave ds;
+
+  AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+  ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+  // Protects the seed below.
+  absl::Mutex mutex;
+  uint32_t seed = time(nullptr);
+
+  int count = 0;      // Tracks semaphore value.
+  bool done = false;  // Tells waiters to stop after signal threads are done.
+
+  // These threads will wait in a loop.
+  std::unique_ptr<ScopedThread> decs[5];
+  for (auto& dec : decs) {
+    dec = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed, &done] {
+      for (size_t i = 0; i < 500; ++i) {
+        int16_t val;
+        {
+          absl::MutexLock l(&mutex);
+          if (done) {
+            return;
+          }
+          val = (rand_r(&seed) % 10 + 1);  // Rand between 1 and 10.
+          count -= val;
+        }
+        struct sembuf buf = {};
+        buf.sem_op = -val;
+        ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+        absl::SleepFor(absl::Milliseconds(val * 2));
+      }
+    });
+  }
+
+  // These threads will wait for zero in a loop.
+  std::unique_ptr<ScopedThread> zeros[5];
+  for (auto& zero : zeros) {
+    zero = absl::make_unique<ScopedThread>([&sem, &mutex, &done] {
+      for (size_t i = 0; i < 500; ++i) {
+        {
+          absl::MutexLock l(&mutex);
+          if (done) {
+            return;
+          }
+        }
+        struct sembuf buf = {};
+        buf.sem_op = 0;
+        ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+        absl::SleepFor(absl::Milliseconds(10));
+      }
+    });
+  }
+
+  // These threads will signal in a loop.
+  std::unique_ptr<ScopedThread> incs[5];
+  for (auto& inc : incs) {
+    inc = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed] {
+      for (size_t i = 0; i < 500; ++i) {
+        int16_t val;
+        {
+          absl::MutexLock l(&mutex);
+          val = (rand_r(&seed) % 10 + 1);  // Rand between 1 and 10.
+          count += val;
+        }
+        struct sembuf buf = {};
+        buf.sem_op = val;
+        ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+        absl::SleepFor(absl::Milliseconds(val * 2));
+      }
+    });
+  }
+
+  // First wait for signal threads to be done.
+  for (auto& inc : incs) {
+    inc->Join();
+  }
+
+  // Now there could be waiters blocked (remember operations are random).
+  // Notify waiters that we're done and signal semaphore just the right amount.
+  {
+    absl::MutexLock l(&mutex);
+    done = true;
+    struct sembuf buf = {};
+    buf.sem_op = -count;
+    ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+  }
+
+  // Now all waiters should unblock and exit.
+  for (auto& dec : decs) {
+    dec->Join();
+  }
+  for (auto& zero : zeros) {
+    zero->Join();
+  }
+}
+
+TEST(SemaphoreTest, SemOpNamespace) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  AutoSem sem(semget(123, 1, 0600 | IPC_CREAT | IPC_EXCL));
+  ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+  ScopedThread([]() {
+    EXPECT_THAT(unshare(CLONE_NEWIPC), SyscallSucceeds());
+    AutoSem sem(semget(123, 1, 0600 | IPC_CREAT | IPC_EXCL));
+    ASSERT_THAT(sem.get(), SyscallSucceeds());
+  });
+}
+
+TEST(SemaphoreTest, SemCtlVal) {
+  AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+  ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+  // Semaphore must start with 0.
+  EXPECT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(0));
+
+  // Increase value and ensure waiters are woken up.
+  ScopedThread th([&sem] {
+    struct sembuf buf = {};
+    buf.sem_op = -10;
+    ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+  });
+
+  ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 9), SyscallSucceeds());
+  EXPECT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(9));
+
+  ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 20), SyscallSucceeds());
+  const int value = semctl(sem.get(), 0, GETVAL);
+  // 10 or 20 because it could have raced with waiter above.
+  EXPECT_TRUE(value == 10 || value == 20) << "value=" << value;
+  th.Join();
+
+  // Set it back to 0 and ensure that waiters are woken up.
+  ScopedThread thZero([&sem] {
+    struct sembuf buf = {};
+    buf.sem_op = 0;
+    ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+  });
+  ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 0), SyscallSucceeds());
+  EXPECT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(0));
+  thZero.Join();
+}
+
+TEST(SemaphoreTest, SemIpcSet) {
+  // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false));
+
+  AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+  ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+  struct semid_ds semid = {};
+  semid.sem_perm.uid = getuid();
+  semid.sem_perm.gid = getgid();
+
+  // Make semaphore readonly and check that signal fails.
+ semid.sem_perm.mode = 0400; + EXPECT_THAT(semctl(sem.get(), 0, IPC_SET, &semid), SyscallSucceeds()); + struct sembuf buf = {}; + buf.sem_op = 1; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EACCES)); + + // Make semaphore writeonly and check that wait for zero fails. + semid.sem_perm.mode = 0200; + EXPECT_THAT(semctl(sem.get(), 0, IPC_SET, &semid), SyscallSucceeds()); + buf.sem_op = 0; + ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EACCES)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc new file mode 100644 index 000000000..92b7b9478 --- /dev/null +++ b/test/syscalls/linux/sendfile.cc @@ -0,0 +1,409 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(SendFileTest, SendZeroBytes) { + // Create temp files. + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Send data and verify that sendfile returns the correct value. + EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, 0), + SyscallSucceedsWithValue(0)); +} + +TEST(SendFileTest, SendTrivially) { + // Create temp files. + constexpr char kData[] = "To be, or not to be, that is the question:"; + constexpr int kDataSize = sizeof(kData) - 1; + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + FileDescriptor outf; + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Send data and verify that sendfile returns the correct value. + int bytes_sent; + EXPECT_THAT(bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kDataSize), + SyscallSucceedsWithValue(kDataSize)); + + // Close outf to avoid leak. + outf.reset(); + + // Open the output file as read only. + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + + // Verify that the output file has the correct data. 
+ char actual[kDataSize]; + ASSERT_THAT(read(outf.get(), &actual, bytes_sent), + SyscallSucceedsWithValue(kDataSize)); + EXPECT_EQ(kData, absl::string_view(actual, bytes_sent)); +} + +TEST(SendFileTest, SendTriviallyWithBothFilesReadWrite) { + // Create temp files. + constexpr char kData[] = "Whether 'tis nobler in the mind to suffer"; + constexpr int kDataSize = sizeof(kData) - 1; + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as readwrite. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR)); + + // Open the output file as readwrite. + FileDescriptor outf; + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR)); + + // Send data and verify that sendfile returns the correct value. + int bytes_sent; + EXPECT_THAT(bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kDataSize), + SyscallSucceedsWithValue(kDataSize)); + + // Close outf to avoid leak. + outf.reset(); + + // Open the output file as read only. + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + + // Verify that the output file has the correct data. + char actual[kDataSize]; + ASSERT_THAT(read(outf.get(), &actual, bytes_sent), + SyscallSucceedsWithValue(kDataSize)); + EXPECT_EQ(kData, absl::string_view(actual, bytes_sent)); +} + +TEST(SendFileTest, SendAndUpdateFileOffset) { + // Create temp files. + // Test input std::string length must be > 2 AND even. + constexpr char kData[] = "The slings and arrows of outrageous fortune,"; + constexpr int kDataSize = sizeof(kData) - 1; + constexpr int kHalfDataSize = kDataSize / 2; + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + FileDescriptor outf; + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Send data and verify that sendfile returns the correct value. + int bytes_sent; + EXPECT_THAT( + bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kHalfDataSize), + SyscallSucceedsWithValue(kHalfDataSize)); + + // Close outf to avoid leak. + outf.reset(); + + // Open the output file as read only. + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + + // Verify that the output file has the correct data. + char actual[kHalfDataSize]; + ASSERT_THAT(read(outf.get(), &actual, bytes_sent), + SyscallSucceedsWithValue(kHalfDataSize)); + EXPECT_EQ(absl::string_view(kData, kHalfDataSize), + absl::string_view(actual, bytes_sent)); + + // Verify that the input file offset has been updated + ASSERT_THAT(read(inf.get(), &actual, kDataSize - bytes_sent), + SyscallSucceedsWithValue(kHalfDataSize)); + EXPECT_EQ( + absl::string_view(kData + kDataSize - bytes_sent, kDataSize - bytes_sent), + absl::string_view(actual, kHalfDataSize)); +} + +TEST(SendFileTest, SendAndUpdateFileOffsetFromNonzeroStartingPoint) { + // Create temp files. + // Test input std::string length must be > 2 AND divisible by 4. 
+ constexpr char kData[] = "The slings and arrows of outrageous fortune,"; + constexpr int kDataSize = sizeof(kData) - 1; + constexpr int kHalfDataSize = kDataSize / 2; + constexpr int kQuarterDataSize = kHalfDataSize / 2; + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + FileDescriptor outf; + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Read a quarter of the data from the infile which should update the file + // offset, we don't actually care about the data so it goes into the garbage. + char garbage[kQuarterDataSize]; + ASSERT_THAT(read(inf.get(), &garbage, kQuarterDataSize), + SyscallSucceedsWithValue(kQuarterDataSize)); + + // Send data and verify that sendfile returns the correct value. + int bytes_sent; + EXPECT_THAT( + bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kHalfDataSize), + SyscallSucceedsWithValue(kHalfDataSize)); + + // Close out_fd to avoid leak. + outf.reset(); + + // Open the output file as read only. + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + + // Verify that the output file has the correct data. + char actual[kHalfDataSize]; + ASSERT_THAT(read(outf.get(), &actual, bytes_sent), + SyscallSucceedsWithValue(kHalfDataSize)); + EXPECT_EQ(absl::string_view(kData + kQuarterDataSize, kHalfDataSize), + absl::string_view(actual, bytes_sent)); + + // Verify that the input file offset has been updated + ASSERT_THAT(read(inf.get(), &actual, kQuarterDataSize), + SyscallSucceedsWithValue(kQuarterDataSize)); + + EXPECT_EQ( + absl::string_view(kData + kDataSize - kQuarterDataSize, kQuarterDataSize), + absl::string_view(actual, kQuarterDataSize)); +} + +TEST(SendFileTest, SendAndUpdateGivenOffset) { + // Create temp files. + // Test input std::string length must be >= 4 AND divisible by 4. + constexpr char kData[] = "Or to take Arms against a Sea of troubles,"; + constexpr int kDataSize = sizeof(kData) + 1; + constexpr int kHalfDataSize = kDataSize / 2; + constexpr int kQuarterDataSize = kHalfDataSize / 2; + constexpr int kThreeFourthsDataSize = 3 * kDataSize / 4; + + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + FileDescriptor outf; + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Create offset for sending. + off_t offset = kQuarterDataSize; + + // Send data and verify that sendfile returns the correct value. + int bytes_sent; + EXPECT_THAT( + bytes_sent = sendfile(outf.get(), inf.get(), &offset, kHalfDataSize), + SyscallSucceedsWithValue(kHalfDataSize)); + + // Close out_fd to avoid leak. + outf.reset(); + + // Open the output file as read only. + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + + // Verify that the output file has the correct data. 
+ char actual[kHalfDataSize]; + ASSERT_THAT(read(outf.get(), &actual, bytes_sent), + SyscallSucceedsWithValue(kHalfDataSize)); + EXPECT_EQ(absl::string_view(kData + kQuarterDataSize, kHalfDataSize), + absl::string_view(actual, bytes_sent)); + + // Verify that the input file offset has NOT been updated. + ASSERT_THAT(read(inf.get(), &actual, kHalfDataSize), + SyscallSucceedsWithValue(kHalfDataSize)); + EXPECT_EQ(absl::string_view(kData, kHalfDataSize), + absl::string_view(actual, kHalfDataSize)); + + // Verify that the offset pointer has been updated. + EXPECT_EQ(offset, kThreeFourthsDataSize); +} + +TEST(SendFileTest, DoNotSendfileIfOutfileIsAppendOnly) { + // Create temp files. + constexpr char kData[] = "And by opposing end them: to die, to sleep"; + constexpr int kDataSize = sizeof(kData) - 1; + + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as append only. + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_APPEND)); + + // Send data and verify that sendfile returns the correct errno. + EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, kDataSize), + SyscallFailsWithErrno(EBADF)); +} + +TEST(SendFileTest, DoNotSendfileIfOutfileIsNotWritable) { + // Create temp files. + constexpr char kData[] = "No more; and by a sleep, to say we end"; + constexpr int kDataSize = sizeof(kData) - 1; + + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as read only. + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + + // Send data and verify that sendfile returns the correct errno. + EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, kDataSize), + SyscallFailsWithErrno(EBADF)); +} + +TEST(SendFileTest, DoNotSendfileIfInfileIsNotReadable) { + // Create temp files. + constexpr char kData[] = "the heart-ache, and the thousand natural shocks"; + constexpr int kDataSize = sizeof(kData) - 1; + + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as write only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_WRONLY)); + + // Open the output file as write only. + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Send data and verify that sendfile returns the correct errno. + EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, kDataSize), + SyscallFailsWithErrno(EBADF)); +} + +TEST(SendFileTest, DoNotSendANegativeNumberOfBytes) { + // Create temp files. + constexpr char kData[] = "that Flesh is heir to? 
'Tis a consummation"; + + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Send data and verify that sendfile returns the correct errno. + EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, -1), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SendFileTest, SendTheCorrectNumberOfBytesEvenIfWeTryToSendTooManyBytes) { + // Create temp files. + constexpr char kData[] = "devoutly to be wished. To die, to sleep,"; + constexpr int kDataSize = sizeof(kData) - 1; + + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + FileDescriptor outf; + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Send data and verify that sendfile returns the correct value. + int bytes_sent; + EXPECT_THAT( + bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kDataSize + 100), + SyscallSucceedsWithValue(kDataSize)); + + // Close outf to avoid leak. + outf.reset(); + + // Open the output file as read only. + outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY)); + + // Verify that the output file has the correct data. + char actual[kDataSize]; + ASSERT_THAT(read(outf.get(), &actual, bytes_sent), + SyscallSucceedsWithValue(kDataSize)); + EXPECT_EQ(kData, absl::string_view(actual, bytes_sent)); +} + +TEST(SendFileTest, SendToNotARegularFile) { + // Make temp input directory and open as read only. + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY)); + + // Make temp output file and open as write only. + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Receive an error since a directory is not a regular file. + EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, 0), + SyscallFailsWithErrno(EINVAL)); +} +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc new file mode 100644 index 000000000..7010dc211 --- /dev/null +++ b/test/syscalls/linux/sendfile_socket.cc @@ -0,0 +1,156 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
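// The test below drives sendfile(2) into a TCP socket, where the kernel may
// transfer fewer bytes than requested once the send buffer fills, so the
// caller has to loop until the input is drained. A minimal sketch of that
// caller-side loop (SendAll is a hypothetical helper; error handling is
// reduced to the EINTR/EAGAIN cases the test itself tolerates):

#include <sys/sendfile.h>

#include <cerrno>

ssize_t SendAll(int sock_fd, int file_fd, size_t count) {
  size_t sent = 0;
  while (sent < count) {
    // With a null offset pointer, the kernel advances the in-file offset.
    const ssize_t n = sendfile(sock_fd, file_fd, nullptr, count - sent);
    if (n > 0) {
      sent += n;
    } else if (n == 0) {
      break;  // EOF on the input file.
    } else if (errno != EINTR && errno != EAGAIN) {
      return -1;
    }
  }
  return sent;
}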
+ +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { +namespace { + +// Sends large file to exercise the path that read and writes data multiple +// times, esp. when more data is read than can be written. +TEST(SendFileTest, SendMultiple) { + std::vector data(5 * 1024 * 1024); + RandomizeBuffer(data.data(), data.size()); + + // Create temp files. + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::string_view(data.data(), data.size()), + TempPath::kDefaultFileMode)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Use a socket for target file to make the write window small. + const FileDescriptor server(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT(server.get(), SyscallSucceeds()); + + struct sockaddr_in server_addr = {}; + server_addr.sin_family = AF_INET; + server_addr.sin_addr.s_addr = INADDR_ANY; + ASSERT_THAT( + bind(server.get(), reinterpret_cast(&server_addr), + sizeof(server_addr)), + SyscallSucceeds()); + ASSERT_THAT(listen(server.get(), 1), SyscallSucceeds()); + + // Thread that reads data from socket and dumps to a file. + ScopedThread th([&server, &out_file, &server_addr] { + socklen_t addrlen = sizeof(server_addr); + const FileDescriptor fd(RetryEINTR(accept)( + server.get(), reinterpret_cast(&server_addr), + &addrlen)); + ASSERT_THAT(fd.get(), SyscallSucceeds()); + + FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Read until socket is closed. + char buf[10240]; + for (int cnt = 0;; cnt++) { + int r = RetryEINTR(read)(fd.get(), buf, sizeof(buf)); + // We cannot afford to save on every read() call. + if (cnt % 1000 == 0) { + ASSERT_THAT(r, SyscallSucceeds()); + } else { + const DisableSave ds; + ASSERT_THAT(r, SyscallSucceeds()); + } + if (r == 0) { + // EOF + break; + } + int w = RetryEINTR(write)(outf.get(), buf, r); + // We cannot afford to save on every write() call. + if (cnt % 1010 == 0) { + ASSERT_THAT(w, SyscallSucceedsWithValue(r)); + } else { + const DisableSave ds; + ASSERT_THAT(w, SyscallSucceedsWithValue(r)); + } + } + }); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + FileDescriptor outf(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT(outf.get(), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = sizeof(server_addr); + ASSERT_THAT(getsockname(server.get(), + reinterpret_cast(&server_addr), &addrlen), + SyscallSucceeds()); + + struct sockaddr_in addr = {}; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr.sin_port = server_addr.sin_port; + LOG(INFO) << "Connecting on port=" << server_addr.sin_port; + ASSERT_THAT( + RetryEINTR(connect)( + outf.get(), reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); + + int cnt = 0; + for (size_t sent = 0; sent < data.size(); cnt++) { + const size_t remain = data.size() - sent; + LOG(INFO) << "sendfile, size=" << data.size() << ", sent=" << sent + << ", remain=" << remain; + + // Send data and verify that sendfile returns the correct value. 
+    int res = sendfile(outf.get(), inf.get(), nullptr, remain);
+    // We cannot afford to save on every sendfile() call.
+    if (cnt % 120 == 0) {
+      MaybeSave();
+    }
+    if (res == 0) {
+      // EOF
+      break;
+    }
+    if (res > 0) {
+      sent += res;
+    } else {
+      ASSERT_TRUE(errno == EINTR || errno == EAGAIN) << "errno=" << errno;
+    }
+  }
+
+  // Close socket to stop thread.
+  outf.reset();
+  th.Join();
+
+  // Verify that the output file has the correct data.
+  outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+  std::vector<char> actual(data.size(), '\0');
+  ASSERT_THAT(RetryEINTR(read)(outf.get(), actual.data(), actual.size()),
+              SyscallSucceedsWithValue(actual.size()));
+  ASSERT_EQ(memcmp(data.data(), actual.data(), data.size()), 0);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
new file mode 100644
index 000000000..9f57476c9
--- /dev/null
+++ b/test/syscalls/linux/shm.cc
@@ -0,0 +1,445 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include "absl/time/clock.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using ::testing::_;
+
+const uint64_t kAllocSize = kPageSize * 128ULL;
+
+PosixErrorOr<int> Shmget(key_t key, size_t size, int shmflg) {
+  int id = shmget(key, size, shmflg);
+  if (id == -1) {
+    return PosixError(errno, "shmget() failed");
+  }
+  return id;
+}
+
+PosixErrorOr<char*> Shmat(int shmid, const void* shmaddr, int shmflg) {
+  const intptr_t addr =
+      reinterpret_cast<intptr_t>(shmat(shmid, shmaddr, shmflg));
+  if (addr == -1) {
+    return PosixError(errno, "shmat() failed");
+  }
+  return reinterpret_cast<char*>(addr);
+}
+
+PosixError Shmdt(const char* shmaddr) {
+  const int ret = shmdt(shmaddr);
+  if (ret == -1) {
+    return PosixError(errno, "shmdt() failed");
+  }
+  return NoError();
+}
+
+template <typename T>
+PosixErrorOr<int> Shmctl(int shmid, int cmd, T* buf) {
+  int ret = shmctl(shmid, cmd, reinterpret_cast<struct shmid_ds*>(buf));
+  if (ret == -1) {
+    return PosixError(errno, "shmctl() failed");
+  }
+  return ret;
+}
+
+TEST(ShmTest, AttachDetach) {
+  const int id = ASSERT_NO_ERRNO_AND_VALUE(
+      Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+  struct shmid_ds attr;
+  ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr));
+  EXPECT_EQ(attr.shm_segsz, kAllocSize);
+  EXPECT_EQ(attr.shm_nattch, 0);
+
+  const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0));
+  ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr));
+  EXPECT_EQ(attr.shm_nattch, 1);
+
+  const char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0));
+  ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr));
+  EXPECT_EQ(attr.shm_nattch, 2);
+
+  ASSERT_NO_ERRNO(Shmdt(addr));
+  ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr));
+  EXPECT_EQ(attr.shm_nattch, 1);
+
+  ASSERT_NO_ERRNO(Shmdt(addr2));
+  ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr));
+ 
EXPECT_EQ(attr.shm_nattch, 0); +} + +TEST(ShmTest, LookupByKey) { + const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const key_t key = ftok(keyfile.path().c_str(), 1); + const int id = + ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777)); + const int id2 = ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, 0777)); + EXPECT_EQ(id, id2); +} + +TEST(ShmTest, DetachedSegmentsPersist) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + addr[0] = 'x'; + ASSERT_NO_ERRNO(Shmdt(addr)); + + // We should be able to re-attach to the same segment and get our data back. + addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + EXPECT_EQ(addr[0], 'x'); + ASSERT_NO_ERRNO(Shmdt(addr)); +} + +TEST(ShmTest, MultipleDetachFails) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + ASSERT_NO_ERRNO(Shmdt(addr)); + EXPECT_THAT(Shmdt(addr), PosixErrorIs(EINVAL, _)); +} + +TEST(ShmTest, IpcStat) { + const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const key_t key = ftok(keyfile.path().c_str(), 1); + + const time_t start = time(nullptr); + + const int id = + ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777)); + + const uid_t uid = getuid(); + const gid_t gid = getgid(); + const pid_t pid = getpid(); + + struct shmid_ds attr; + ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr)); + + EXPECT_EQ(attr.shm_perm.__key, key); + EXPECT_EQ(attr.shm_perm.uid, uid); + EXPECT_EQ(attr.shm_perm.gid, gid); + EXPECT_EQ(attr.shm_perm.cuid, uid); + EXPECT_EQ(attr.shm_perm.cgid, gid); + EXPECT_EQ(attr.shm_perm.mode, 0777); + + EXPECT_EQ(attr.shm_segsz, kAllocSize); + + EXPECT_EQ(attr.shm_atime, 0); + EXPECT_EQ(attr.shm_dtime, 0); + + // Change time is set on creation. + EXPECT_GE(attr.shm_ctime, start); + + EXPECT_EQ(attr.shm_cpid, pid); + EXPECT_EQ(attr.shm_lpid, 0); + + EXPECT_EQ(attr.shm_nattch, 0); + + // The timestamps only have a resolution of seconds; slow down so we actually + // see the timestamps change. + absl::SleepFor(absl::Seconds(1)); + const time_t pre_attach = time(nullptr); + + const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr)); + + EXPECT_GE(attr.shm_atime, pre_attach); + EXPECT_EQ(attr.shm_dtime, 0); + EXPECT_LT(attr.shm_ctime, pre_attach); + EXPECT_EQ(attr.shm_lpid, pid); + EXPECT_EQ(attr.shm_nattch, 1); + + absl::SleepFor(absl::Seconds(1)); + const time_t pre_detach = time(nullptr); + + ASSERT_NO_ERRNO(Shmdt(addr)); + ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr)); + + EXPECT_LT(attr.shm_atime, pre_detach); + EXPECT_GE(attr.shm_dtime, pre_detach); + EXPECT_LT(attr.shm_ctime, pre_detach); + EXPECT_EQ(attr.shm_lpid, pid); + EXPECT_EQ(attr.shm_nattch, 0); +} + +TEST(ShmTest, ShmStat) { + // This test relies on the segment we create to be the first one on the + // system, causing it to occupy slot 1. We can't reasonably expect this on a + // general Linux host. + SKIP_IF(!IsRunningOnGvisor()); + + ASSERT_NO_ERRNO(Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + struct shmid_ds attr; + ASSERT_NO_ERRNO(Shmctl(1, SHM_STAT, &attr)); + // This does the same thing as IPC_STAT, so only test that the syscall + // succeeds here. 
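  //
  // Unlike IPC_STAT, which takes a segment id, SHM_STAT takes an index into
  // the kernel's internal segment table and returns the corresponding id on
  // success. A sketch of the typical enumeration pattern (illustrative only):
  //
  //   struct shm_info info;
  //   const int max_index =
  //       shmctl(0, SHM_INFO, reinterpret_cast<struct shmid_ds*>(&info));
  //   for (int i = 0; i <= max_index; ++i) {
  //     struct shmid_ds ds;
  //     const int id = shmctl(i, SHM_STAT, &ds);
  //     if (id >= 0) {
  //       // `id` is now usable with IPC_STAT, shmat(), IPC_RMID, etc.
  //     }
  //   }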
+} + +TEST(ShmTest, IpcInfo) { + struct shminfo info; + ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info)); + + EXPECT_EQ(info.shmmin, 1); // This is always 1, according to the man page. + EXPECT_GT(info.shmmax, info.shmmin); + EXPECT_GT(info.shmmni, 0); + EXPECT_GT(info.shmseg, 0); + EXPECT_GT(info.shmall, 0); +} + +TEST(ShmTest, ShmInfo) { + struct shm_info info; + + // We generally can't know what other processes on a linux machine + // does with shared memory segments, so we can't test specific + // numbers on Linux. When running under gvisor, we're guaranteed to + // be the only ones using shm, so we can easily verify machine-wide + // numbers. + if (IsRunningOnGvisor()) { + ASSERT_NO_ERRNO(Shmctl(0, SHM_INFO, &info)); + EXPECT_EQ(info.used_ids, 0); + EXPECT_EQ(info.shm_tot, 0); + EXPECT_EQ(info.shm_rss, 0); + EXPECT_EQ(info.shm_swp, 0); + } + + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + + ASSERT_NO_ERRNO(Shmctl(1, SHM_INFO, &info)); + + if (IsRunningOnGvisor()) { + ASSERT_NO_ERRNO(Shmctl(id, SHM_INFO, &info)); + EXPECT_EQ(info.used_ids, 1); + EXPECT_EQ(info.shm_tot, kAllocSize / kPageSize); + EXPECT_EQ(info.shm_rss, kAllocSize / kPageSize); + EXPECT_EQ(info.shm_swp, 0); // Gvisor currently never swaps. + } + + ASSERT_NO_ERRNO(Shmdt(addr)); +} + +TEST(ShmTest, ShmCtlSet) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + + struct shmid_ds attr; + ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr)); + ASSERT_EQ(attr.shm_perm.mode, 0777); + + attr.shm_perm.mode = 0766; + ASSERT_NO_ERRNO(Shmctl(id, IPC_SET, &attr)); + + ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr)); + ASSERT_EQ(attr.shm_perm.mode, 0766); + + ASSERT_NO_ERRNO(Shmdt(addr)); +} + +TEST(ShmTest, RemovedSegmentsAreMarkedDeleted) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + ASSERT_NO_ERRNO(Shmctl(id, IPC_RMID, nullptr)); + struct shmid_ds attr; + ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr)); + EXPECT_NE(attr.shm_perm.mode & SHM_DEST, 0); + ASSERT_NO_ERRNO(Shmdt(addr)); +} + +TEST(ShmTest, RemovedSegmentsAreDestroyed) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + + const uint64_t alloc_pages = kAllocSize / kPageSize; + + struct shm_info info; + ASSERT_NO_ERRNO(Shmctl(1, SHM_INFO, &info)); + const uint64_t before = info.shm_tot; + + ASSERT_NO_ERRNO(Shmctl(id, IPC_RMID, nullptr)); + ASSERT_NO_ERRNO(Shmdt(addr)); + + ASSERT_NO_ERRNO(Shmctl(1, SHM_INFO, &info)); + const uint64_t after = info.shm_tot; + EXPECT_EQ(after, before - alloc_pages); +} + +TEST(ShmTest, AllowsAttachToRemovedSegmentWithRefs) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + ASSERT_NO_ERRNO(Shmctl(id, IPC_RMID, nullptr)); + const char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + ASSERT_NO_ERRNO(Shmdt(addr)); + ASSERT_NO_ERRNO(Shmdt(addr2)); +} + +TEST(ShmTest, RemovedSegmentsAreNotDiscoverable) { + const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const key_t key = ftok(keyfile.path().c_str(), 1); + const int id = + 
ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777)); + ASSERT_NO_ERRNO(Shmctl(id, IPC_RMID, nullptr)); + EXPECT_THAT(Shmget(key, kAllocSize, 0777), PosixErrorIs(ENOENT, _)); +} + +TEST(ShmDeathTest, ReadonlySegment) { + SetupGvisorDeathTest(); + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, SHM_RDONLY)); + // Reading succeeds. + static_cast(addr[0]); + // Writing fails. + EXPECT_EXIT(addr[0] = 'x', ::testing::KilledBySignal(SIGSEGV), ""); +} + +TEST(ShmDeathTest, SegmentNotAccessibleAfterDetach) { + // This test is susceptible to races with concurrent mmaps running in parallel + // gtest threads since the test relies on the address freed during a shm + // segment destruction to remain unused. We run the test body in a forked + // child to guarantee a single-threaded context to avoid this. + + SetupGvisorDeathTest(); + + const auto rest = [&] { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + addr[0] = 'x'; + ASSERT_NO_ERRNO(Shmdt(addr)); + + // This access should cause a SIGSEGV. + addr[0] = 'x'; + }; + + EXPECT_THAT(InForkedProcess(rest), + IsPosixErrorOkAndHolds(W_EXITCODE(0, SIGSEGV))); +} + +TEST(ShmTest, RequestingSegmentSmallerThanSHMMINFails) { + struct shminfo info; + ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info)); + const uint64_t size = info.shmmin - 1; + EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777), + PosixErrorIs(EINVAL, _)); +} + +TEST(ShmTest, RequestingSegmentLargerThanSHMMAXFails) { + struct shminfo info; + ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info)); + const uint64_t size = info.shmmax + kPageSize; + EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777), + PosixErrorIs(EINVAL, _)); +} + +TEST(ShmTest, RequestingUnalignedSizeSucceeds) { + EXPECT_NO_ERRNO(Shmget(IPC_PRIVATE, 4097, IPC_CREAT | 0777)); +} + +TEST(ShmTest, RequestingDuplicateCreationFails) { + const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const key_t key = ftok(keyfile.path().c_str(), 1); + ASSERT_NO_ERRNO_AND_VALUE( + Shmget(key, kAllocSize, IPC_CREAT | IPC_EXCL | 0777)); + EXPECT_THAT(Shmget(key, kAllocSize, IPC_CREAT | IPC_EXCL | 0777), + PosixErrorIs(EEXIST, _)); +} + +TEST(ShmTest, SegmentsSizeFixedOnCreation) { + const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const key_t key = ftok(keyfile.path().c_str(), 1); + + // Base segment. + const int id = + ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777)); + + // Ask for the same segment at half size. This succeeds. + const int id2 = ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize / 2, 0777)); + + // Ask for the same segment at double size. + EXPECT_THAT(Shmget(key, kAllocSize * 2, 0777), PosixErrorIs(EINVAL, _)); + + char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id2, nullptr, 0)); + + // We have 2 different maps... + EXPECT_NE(addr, addr2); + + // ... And both maps are kAllocSize bytes; despite asking for a half-sized + // segment for the second map. 
+ addr[kAllocSize - 1] = 'x'; + addr2[kAllocSize - 1] = 'x'; + + ASSERT_NO_ERRNO(Shmdt(addr)); + ASSERT_NO_ERRNO(Shmdt(addr2)); +} + +TEST(ShmTest, PartialUnmap) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + EXPECT_THAT(munmap(addr + (kAllocSize / 4), kAllocSize / 2), + SyscallSucceeds()); + ASSERT_NO_ERRNO(Shmdt(addr)); +} + +// Check that sentry does not panic when asked for a zero-length private shm +// segment. +TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) { + EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _)); +} + +TEST(ShmTest, NoDestructionOfAttachedSegmentWithMultipleRmid) { + const int id = ASSERT_NO_ERRNO_AND_VALUE( + Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); + char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0)); + + // There should be 2 refs to the segment from the 2 attachments, and a single + // self-reference. Mark the segment as destroyed more than 3 times through + // shmctl(RMID). If there's a bug with the ref counting, this should cause the + // count to drop to zero. + for (int i = 0; i < 6; ++i) { + ASSERT_NO_ERRNO(Shmctl(id, IPC_RMID, nullptr)); + } + + // Segment should remain accessible. + addr[0] = 'x'; + ASSERT_NO_ERRNO(Shmdt(addr)); + + // Segment should remain accessible even after one of the two attachments are + // detached. + addr2[0] = 'x'; + ASSERT_NO_ERRNO(Shmdt(addr2)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sigaction.cc b/test/syscalls/linux/sigaction.cc new file mode 100644 index 000000000..cdd2dbf31 --- /dev/null +++ b/test/syscalls/linux/sigaction.cc @@ -0,0 +1,70 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
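// The sigaction tests that follow probe signal-number validity by passing a
// null `act`, which queries the current disposition without changing it. A
// minimal sketch of the same idiom (IsValidSignal is a hypothetical helper,
// not defined in this file):

#include <signal.h>

bool IsValidSignal(int sig) {
  struct sigaction old;
  // With act == nullptr, sigaction() only reads the disposition; EINVAL
  // indicates an invalid signal number.
  return sigaction(sig, nullptr, &old) == 0;
}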
+ +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(SigactionTest, GetLessThanOrEqualToZeroFails) { + struct sigaction act; + memset(&act, 0, sizeof(act)); + ASSERT_THAT(sigaction(-1, NULL, &act), SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(sigaction(0, NULL, &act), SyscallFailsWithErrno(EINVAL)); +} + +TEST(SigactionTest, SetLessThanOrEqualToZeroFails) { + struct sigaction act; + memset(&act, 0, sizeof(act)); + ASSERT_THAT(sigaction(0, &act, NULL), SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(sigaction(0, &act, NULL), SyscallFailsWithErrno(EINVAL)); +} + +TEST(SigactionTest, GetGreaterThanMaxFails) { + struct sigaction act; + memset(&act, 0, sizeof(act)); + ASSERT_THAT(sigaction(SIGRTMAX + 1, NULL, &act), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SigactionTest, SetGreaterThanMaxFails) { + struct sigaction act; + memset(&act, 0, sizeof(act)); + ASSERT_THAT(sigaction(SIGRTMAX + 1, &act, NULL), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SigactionTest, SetSigkillFails) { + struct sigaction act; + memset(&act, 0, sizeof(act)); + ASSERT_THAT(sigaction(SIGKILL, NULL, &act), SyscallSucceeds()); + ASSERT_THAT(sigaction(SIGKILL, &act, NULL), SyscallFailsWithErrno(EINVAL)); +} + +TEST(SigactionTest, SetSigstopFails) { + struct sigaction act; + memset(&act, 0, sizeof(act)); + ASSERT_THAT(sigaction(SIGSTOP, NULL, &act), SyscallSucceeds()); + ASSERT_THAT(sigaction(SIGSTOP, &act, NULL), SyscallFailsWithErrno(EINVAL)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc new file mode 100644 index 000000000..fa991545c --- /dev/null +++ b/test/syscalls/linux/sigaltstack.cc @@ -0,0 +1,274 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
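// The tests below exercise the usual pairing of sigaltstack() with SA_ONSTACK:
// without an alternate stack, a handler for a stack-overflow SIGSEGV has no
// stack left to run on. A minimal sketch of that setup (InstallOnAltStack and
// the fixed 64 KiB buffer are illustrative choices; the buffer only needs to
// be at least SIGSTKSZ bytes):

#include <signal.h>

static char g_altstack[64 * 1024];

void InstallOnAltStack(int sig, void (*handler)(int, siginfo_t*, void*)) {
  stack_t ss = {};
  ss.ss_sp = g_altstack;
  ss.ss_size = sizeof(g_altstack);
  sigaltstack(&ss, nullptr);  // Register the alternate stack for this thread.

  struct sigaction sa = {};
  sa.sa_sigaction = handler;
  sa.sa_flags = SA_SIGINFO | SA_ONSTACK;  // Deliver `sig` on that stack.
  sigaction(sig, &sa, nullptr);
}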
+ +#include +#include +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/util/cleanup.h" +#include "test/util/fs_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr ScopedSigaltstack(stack_t const& stack) { + stack_t old_stack; + int rc = sigaltstack(&stack, &old_stack); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "sigaltstack failed"); + } + return Cleanup([old_stack] { + EXPECT_THAT(sigaltstack(&old_stack, nullptr), SyscallSucceeds()); + }); +} + +volatile bool got_signal = false; +volatile int sigaltstack_errno = 0; +volatile int ss_flags = 0; + +void sigaltstack_handler(int sig, siginfo_t* siginfo, void* arg) { + got_signal = true; + + stack_t stack; + int ret = sigaltstack(nullptr, &stack); + MaybeSave(); + if (ret < 0) { + sigaltstack_errno = errno; + return; + } + ss_flags = stack.ss_flags; +} + +TEST(SigaltstackTest, Success) { + std::vector stack_mem(SIGSTKSZ); + stack_t stack = {}; + stack.ss_sp = stack_mem.data(); + stack.ss_size = stack_mem.size(); + auto const cleanup_sigstack = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack)); + + struct sigaction sa = {}; + sa.sa_sigaction = sigaltstack_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + auto const cleanup_sa = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGUSR1, sa)); + + // Send signal to this thread, as sigaltstack is per-thread. + EXPECT_THAT(tgkill(getpid(), gettid(), SIGUSR1), SyscallSucceeds()); + + EXPECT_TRUE(got_signal); + EXPECT_EQ(sigaltstack_errno, 0); + EXPECT_NE(0, ss_flags & SS_ONSTACK); +} + +TEST(SigaltstackTest, ResetByExecve) { + std::vector stack_mem(SIGSTKSZ); + stack_t stack = {}; + stack.ss_sp = stack_mem.data(); + stack.ss_size = stack_mem.size(); + auto const cleanup_sigstack = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack)); + + std::string full_path; + char* test_src = getenv("TEST_SRCDIR"); + if (test_src) { + full_path = JoinPath(test_src, "../../linux/sigaltstack_check"); + } + ASSERT_FALSE(full_path.empty()); + + pid_t child_pid = -1; + int execve_errno = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(full_path, {"sigaltstack_check"}, {}, nullptr, &child_pid, + &execve_errno)); + + ASSERT_GT(child_pid, 0); + ASSERT_EQ(execve_errno, 0); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +volatile bool badhandler_on_sigaltstack = true; // Set by the handler. +char* volatile badhandler_low_water_mark = nullptr; // Set by the handler. +volatile uint8_t badhandler_recursive_faults = 0; // Consumed by the handler. + +void badhandler(int sig, siginfo_t* siginfo, void* arg) { + char stack_var = 0; + char* current_ss = &stack_var; + + stack_t stack; + int ret = sigaltstack(nullptr, &stack); + if (ret < 0 || (stack.ss_flags & SS_ONSTACK) != SS_ONSTACK) { + // We should always be marked as being on the stack. Don't allow this to hit + // the bottom if this is ever not true (the main test will fail as a + // result, but we still need to unwind the recursive faults). + badhandler_on_sigaltstack = false; + } + if (current_ss < badhandler_low_water_mark) { + // Record the low point for the signal stack. 
We never expected this to be + // before stack bottom, but this is asserted in the actual test. + badhandler_low_water_mark = current_ss; + } + if (badhandler_recursive_faults > 0) { + badhandler_recursive_faults--; + Fault(); + } + FixupFault(reinterpret_cast(arg)); +} + +TEST(SigaltstackTest, WalksOffBottom) { + // This test marks the upper half of the stack_mem array as the signal stack. + // It asserts that when a fault occurs in the handler (already on the signal + // stack), we eventually continue to fault our way off the stack. We should + // not revert to the top of the signal stack when we fall off the bottom and + // the signal stack should remain "in use". When we fall off the signal stack, + // we should have an unconditional signal delivered and not start using the + // first part of the stack_mem array. + std::vector stack_mem(SIGSTKSZ * 2); + stack_t stack = {}; + stack.ss_sp = stack_mem.data() + SIGSTKSZ; // See above: upper half. + stack.ss_size = SIGSTKSZ; // Only one half the array. + auto const cleanup_sigstack = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack)); + + // Setup the handler: this must be for SIGSEGV, and it must allow proper + // nesting (no signal mask, no defer) so that we can trigger multiple times. + // + // When we walk off the bottom of the signal stack and force signal delivery + // of a SIGSEGV, the handler will revert to the default behavior (kill). + struct sigaction sa = {}; + sa.sa_sigaction = badhandler; + sa.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_NODEFER; + auto const cleanup_sa = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa)); + + // Trigger a single fault. + badhandler_low_water_mark = + reinterpret_cast(&stack.ss_sp) + SIGSTKSZ; // Expected top. + badhandler_recursive_faults = 0; // Disable refault. + Fault(); + EXPECT_TRUE(badhandler_on_sigaltstack); + EXPECT_THAT(sigaltstack(nullptr, &stack), SyscallSucceeds()); + EXPECT_EQ(stack.ss_flags & SS_ONSTACK, 0); + EXPECT_LT(badhandler_low_water_mark, + reinterpret_cast(stack.ss_sp) + 2 * SIGSTKSZ); + EXPECT_GT(badhandler_low_water_mark, reinterpret_cast(stack.ss_sp)); + + // Trigger two faults. + char* prev_low_water_mark = badhandler_low_water_mark; // Previous top. + badhandler_recursive_faults = 1; // One refault. + Fault(); + ASSERT_TRUE(badhandler_on_sigaltstack); + EXPECT_THAT(sigaltstack(nullptr, &stack), SyscallSucceeds()); + EXPECT_EQ(stack.ss_flags & SS_ONSTACK, 0); + EXPECT_LT(badhandler_low_water_mark, prev_low_water_mark); + EXPECT_GT(badhandler_low_water_mark, reinterpret_cast(stack.ss_sp)); + + // Calculate the stack growth for a fault, and set the recursive faults to + // ensure that the signal handler stack required exceeds our marked stack area + // by a minimal amount. It should remain in the valid stack_mem area so that + // we can test the signal is forced merely by going out of the signal stack + // bounds, not by a genuine fault. + uintptr_t frame_size = + static_cast(prev_low_water_mark - badhandler_low_water_mark); + badhandler_recursive_faults = (SIGSTKSZ + frame_size) / frame_size; + EXPECT_EXIT(Fault(), ::testing::KilledBySignal(SIGSEGV), ""); +} + +volatile int setonstack_retval = 0; // Set by the handler. +volatile int setonstack_errno = 0; // Set by the handler. 
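// While a handler is running on the alternate stack, sigaltstack() reports
// SS_ONSTACK, and attempts to replace or disable the stack fail with EPERM;
// that is what the handler below and the next two tests check. A sketch of
// the query, assuming <signal.h> is already available in this file
// (OnAltStack is a hypothetical helper):

bool OnAltStack() {
  stack_t ss = {};
  if (sigaltstack(nullptr, &ss) != 0) {
    return false;  // Query failed; assume the normal stack.
  }
  return (ss.ss_flags & SS_ONSTACK) != 0;
}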
+ +void setonstack(int sig, siginfo_t* siginfo, void* arg) { + char stack_mem[SIGSTKSZ]; + stack_t stack = {}; + stack.ss_sp = &stack_mem[0]; + stack.ss_size = SIGSTKSZ; + setonstack_retval = sigaltstack(&stack, nullptr); + setonstack_errno = errno; + FixupFault(reinterpret_cast(arg)); +} + +TEST(SigaltstackTest, SetWhileOnStack) { + // Reserve twice as much stack here, since the handler will allocate a vector + // of size SIGTKSZ and attempt to set the sigaltstack to that value. + std::vector stack_mem(2 * SIGSTKSZ); + stack_t stack = {}; + stack.ss_sp = stack_mem.data(); + stack.ss_size = stack_mem.size(); + auto const cleanup_sigstack = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack)); + + // See above. + struct sigaction sa = {}; + sa.sa_sigaction = setonstack; + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + auto const cleanup_sa = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa)); + + // Trigger a fault. + Fault(); + + // The set should have failed. + EXPECT_EQ(setonstack_retval, -1); + EXPECT_EQ(setonstack_errno, EPERM); +} + +TEST(SigaltstackTest, SetCurrentStack) { + // This is executed as an exit test because once the signal stack is set to + // the local stack, there's no good way to unwind. We don't want to taint the + // test of any other tests that might run within this process. + EXPECT_EXIT( + { + char stack_value = 0; + stack_t stack = {}; + stack.ss_sp = &stack_value - kPageSize; // Lower than current level. + stack.ss_size = 2 * kPageSize; // => &stack_value +/- kPageSize. + TEST_CHECK(sigaltstack(&stack, nullptr) == 0); + TEST_CHECK(sigaltstack(nullptr, &stack) == 0); + TEST_CHECK((stack.ss_flags & SS_ONSTACK) != 0); + + // Should not be able to change the stack (even no-op). + TEST_CHECK(sigaltstack(&stack, nullptr) == -1 && errno == EPERM); + + // Should not be able to disable the stack. + stack.ss_flags = SS_DISABLE; + TEST_CHECK(sigaltstack(&stack, nullptr) == -1 && errno == EPERM); + exit(0); + }, + ::testing::ExitedWithCode(0), ""); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sigaltstack_check.cc b/test/syscalls/linux/sigaltstack_check.cc new file mode 100644 index 000000000..b71f812a8 --- /dev/null +++ b/test/syscalls/linux/sigaltstack_check.cc @@ -0,0 +1,33 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Checks that there is no alternate signal stack by default. +// +// Used by a test in sigaltstack.cc. 
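// The helper below exists because the alternate-stack setting is per-thread
// state that is not preserved across execve(2); the post-exec default
// (SS_DISABLE, null base, zero size) can only be observed from a freshly
// exec'd image. A sketch of the equivalent stand-alone predicate
// (AltStackIsDefault is hypothetical):
//
//   bool AltStackIsDefault() {
//     stack_t ss = {};
//     return sigaltstack(nullptr, &ss) == 0 &&
//            (ss.ss_flags & SS_DISABLE) != 0 && ss.ss_sp == nullptr &&
//            ss.ss_size == 0;
//   }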
+#include +#include +#include +#include +#include + +#include "test/util/logging.h" + +int main(int /* argc */, char** /* argv */) { + stack_t stack; + TEST_CHECK(sigaltstack(nullptr, &stack) >= 0); + TEST_CHECK(stack.ss_flags == SS_DISABLE); + TEST_CHECK(stack.ss_sp == 0); + TEST_CHECK(stack.ss_size == 0); + return 0; +} diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc new file mode 100644 index 000000000..1b7cecccb --- /dev/null +++ b/test/syscalls/linux/sigiret.cc @@ -0,0 +1,137 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/logging.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr uint64_t kOrigRcx = 0xdeadbeeffacefeed; +constexpr uint64_t kOrigR11 = 0xfacefeedbaad1dea; + +volatile int gotvtalrm, ready; + +void sigvtalrm(int sig, siginfo_t* siginfo, void* _uc) { + ucontext_t* uc = reinterpret_cast(_uc); + + // Verify that: + // - test is in the busy-wait loop waiting for signal. + // - %rcx and %r11 values in mcontext_t match kOrigRcx and kOrigR11. + if (ready && + static_cast(uc->uc_mcontext.gregs[REG_RCX]) == kOrigRcx && + static_cast(uc->uc_mcontext.gregs[REG_R11]) == kOrigR11) { + // Modify the values %rcx and %r11 in the ucontext. These are the + // values seen by the application after the signal handler returns. + uc->uc_mcontext.gregs[REG_RCX] = ~kOrigRcx; + uc->uc_mcontext.gregs[REG_R11] = ~kOrigR11; + gotvtalrm = 1; + } +} + +TEST(SigIretTest, CheckRcxR11) { + // Setup signal handler for SIGVTALRM. + struct sigaction sa = {}; + sigfillset(&sa.sa_mask); + sa.sa_sigaction = sigvtalrm; + sa.sa_flags = SA_SIGINFO; + auto const action_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGVTALRM, sa)); + + auto const mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGVTALRM)); + + // Setup itimer to fire after 500 msecs. + struct itimerval itimer = {}; + itimer.it_value.tv_usec = 500 * 1000; // 500 msecs. + auto const timer_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_VIRTUAL, itimer)); + + // Initialize %rcx and %r11 and spin until the signal handler returns. + uint64_t rcx = kOrigRcx; + uint64_t r11 = kOrigR11; + asm volatile( + "movq %[rcx], %%rcx;" // %rcx = rcx + "movq %[r11], %%r11;" // %r11 = r11 + "movl $1, %[ready];" // ready = 1 + "1: pause; cmpl $0, %[gotvtalrm]; je 1b;" // while (!gotvtalrm); + "movq %%rcx, %[rcx];" // rcx = %rcx + "movq %%r11, %[r11];" // r11 = %r11 + : [ready] "=m"(ready), [rcx] "+m"(rcx), [r11] "+m"(r11) + : [gotvtalrm] "m"(gotvtalrm) + : "cc", "memory", "rcx", "r11"); + + // If sigreturn(2) returns via 'sysret' then %rcx and %r11 will be + // clobbered and set to 'ptregs->rip' and 'ptregs->rflags' respectively. 
+ // + // The following check verifies that %rcx and %r11 were not clobbered + // when returning from the signal handler (via sigreturn(2)). + EXPECT_EQ(rcx, ~kOrigRcx); + EXPECT_EQ(r11, ~kOrigR11); +} + +constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000; + +// Test that a non-canonical signal handler faults as expected. +TEST(SigIretTest, BadHandler) { + struct sigaction sa = {}; + sa.sa_sigaction = + reinterpret_cast(kNonCanonicalRip); + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGUSR1, sa)); + + pid_t pid = fork(); + if (pid == 0) { + // Child, wait for signal. + while (1) { + pause(); + } + } + ASSERT_THAT(pid, SyscallSucceeds()); + + EXPECT_THAT(kill(pid, SIGUSR1), SyscallSucceeds()); + + int status; + EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) + << "status = " << status; +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + // SigIretTest.CheckRcxR11 depends on delivering SIGVTALRM to the main thread. + // Block SIGVTALRM so that any other threads created by TestInit will also + // have SIGVTALRM blocked. + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGVTALRM); + TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0); + + gvisor::testing::TestInit(&argc, &argv); + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc new file mode 100644 index 000000000..d8b918446 --- /dev/null +++ b/test/syscalls/linux/sigprocmask.cc @@ -0,0 +1,272 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Signals numbers used for testing. +static constexpr int kTestSignal1 = SIGUSR1; +static constexpr int kTestSignal2 = SIGUSR2; + +static int raw_sigprocmask(int how, const sigset_t* set, sigset_t* oldset) { + return syscall(SYS_rt_sigprocmask, how, set, oldset, _NSIG / 8); +} + +// count of the number of signals received +int signal_count[kMaxSignal + 1]; + +// signal handler increments the signal counter +void SigHandler(int sig, siginfo_t* info, void* context) { + TEST_CHECK(sig > 0 && sig <= kMaxSignal); + signal_count[sig] += 1; +} + +// The test fixture saves and restores the signal mask and +// sets up handlers for kTestSignal1 and kTestSignal2. +class SigProcMaskTest : public ::testing::Test { + protected: + void SetUp() override { + // Save the current signal mask. + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &mask_), + SyscallSucceeds()); + + // Setup signal handlers for kTestSignal1 and kTestSignal2. 
+ struct sigaction sa; + sa.sa_sigaction = SigHandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + EXPECT_THAT(sigaction(kTestSignal1, &sa, &sa_test_sig_1_), + SyscallSucceeds()); + EXPECT_THAT(sigaction(kTestSignal2, &sa, &sa_test_sig_2_), + SyscallSucceeds()); + + // Clear the signal counters. + memset(signal_count, 0, sizeof(signal_count)); + } + + void TearDown() override { + // Restore the signal mask. + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &mask_, nullptr), + SyscallSucceeds()); + + // Restore the signal handlers for kTestSignal1 and kTestSignal2. + EXPECT_THAT(sigaction(kTestSignal1, &sa_test_sig_1_, nullptr), + SyscallSucceeds()); + EXPECT_THAT(sigaction(kTestSignal2, &sa_test_sig_2_, nullptr), + SyscallSucceeds()); + } + + private: + sigset_t mask_; + struct sigaction sa_test_sig_1_; + struct sigaction sa_test_sig_2_; +}; + +// Both sigsets nullptr should succeed and do nothing. +TEST_F(SigProcMaskTest, NullAddress) { + EXPECT_THAT(raw_sigprocmask(SIG_BLOCK, nullptr, NULL), SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_UNBLOCK, nullptr, NULL), SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, NULL), SyscallSucceeds()); +} + +// Bad address for either sigset should fail with EFAULT. +TEST_F(SigProcMaskTest, BadAddress) { + sigset_t* bad_addr = reinterpret_cast(-1); + + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, bad_addr, nullptr), + SyscallFailsWithErrno(EFAULT)); + + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, bad_addr), + SyscallFailsWithErrno(EFAULT)); +} + +// Bad value of the "how" parameter should fail with EINVAL. +TEST_F(SigProcMaskTest, BadParameter) { + int bad_param_1 = -1; + int bad_param_2 = 42; + + sigset_t set1; + sigemptyset(&set1); + + EXPECT_THAT(raw_sigprocmask(bad_param_1, &set1, nullptr), + SyscallFailsWithErrno(EINVAL)); + + EXPECT_THAT(raw_sigprocmask(bad_param_2, &set1, nullptr), + SyscallFailsWithErrno(EINVAL)); +} + +// Check that we can get the current signal mask. +TEST_F(SigProcMaskTest, GetMask) { + sigset_t set1; + sigset_t set2; + + sigemptyset(&set1); + sigfillset(&set2); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &set1), SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &set2), SyscallSucceeds()); + EXPECT_THAT(set1, EqualsSigset(set2)); +} + +// Check that we can set the signal mask. +TEST_F(SigProcMaskTest, SetMask) { + sigset_t actual; + sigset_t expected; + + // Try to mask all signals + sigfillset(&expected); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr), + SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual), + SyscallSucceeds()); + // sigprocmask() should have silently ignored SIGKILL and SIGSTOP. + sigdelset(&expected, SIGSTOP); + sigdelset(&expected, SIGKILL); + EXPECT_THAT(actual, EqualsSigset(expected)); + + // Try to clear the signal mask + sigemptyset(&expected); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr), + SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual), + SyscallSucceeds()); + EXPECT_THAT(actual, EqualsSigset(expected)); + + // Try to set a mask with one signal. + sigemptyset(&expected); + sigaddset(&expected, kTestSignal1); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr), + SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual), + SyscallSucceeds()); + EXPECT_THAT(actual, EqualsSigset(expected)); +} + +// Check that we can add and remove signals. 
+TEST_F(SigProcMaskTest, BlockUnblock) { + sigset_t actual; + sigset_t expected; + + // Try to set a mask with one signal. + sigemptyset(&expected); + sigaddset(&expected, kTestSignal1); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr), + SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual), + SyscallSucceeds()); + EXPECT_THAT(actual, EqualsSigset(expected)); + + // Try to add another signal. + sigset_t block; + sigemptyset(&block); + sigaddset(&block, kTestSignal2); + EXPECT_THAT(raw_sigprocmask(SIG_BLOCK, &block, nullptr), SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual), + SyscallSucceeds()); + sigaddset(&expected, kTestSignal2); + EXPECT_THAT(actual, EqualsSigset(expected)); + + // Try to remove a signal. + sigset_t unblock; + sigemptyset(&unblock); + sigaddset(&unblock, kTestSignal1); + EXPECT_THAT(raw_sigprocmask(SIG_UNBLOCK, &unblock, nullptr), + SyscallSucceeds()); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual), + SyscallSucceeds()); + sigdelset(&expected, kTestSignal1); + EXPECT_THAT(actual, EqualsSigset(expected)); +} + +// Test that the signal mask actually blocks signals. +TEST_F(SigProcMaskTest, SignalHandler) { + sigset_t mask; + + // clear the signal mask + sigemptyset(&mask); + EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &mask, nullptr), SyscallSucceeds()); + + // Check the initial signal counts. + EXPECT_EQ(0, signal_count[kTestSignal1]); + EXPECT_EQ(0, signal_count[kTestSignal2]); + + // Check that both kTestSignal1 and kTestSignal2 are not blocked. + raise(kTestSignal1); + raise(kTestSignal2); + EXPECT_EQ(1, signal_count[kTestSignal1]); + EXPECT_EQ(1, signal_count[kTestSignal2]); + + // Block kTestSignal1. + sigaddset(&mask, kTestSignal1); + EXPECT_THAT(raw_sigprocmask(SIG_BLOCK, &mask, nullptr), SyscallSucceeds()); + + // Check that kTestSignal1 is blocked. + raise(kTestSignal1); + raise(kTestSignal2); + EXPECT_EQ(1, signal_count[kTestSignal1]); + EXPECT_EQ(2, signal_count[kTestSignal2]); + + // Unblock kTestSignal1. + sigaddset(&mask, kTestSignal1); + EXPECT_THAT(raw_sigprocmask(SIG_UNBLOCK, &mask, nullptr), SyscallSucceeds()); + + // Check that the unblocked kTestSignal1 has been delivered. + // TODO: gvisor currently drops masked signals on the floor. + if (!IsRunningOnGvisor()) { + EXPECT_EQ(2, signal_count[kTestSignal1]); + } + EXPECT_EQ(2, signal_count[kTestSignal2]); +} + +// Check that sigprocmask correctly handles aliasing of the set and oldset +// pointers. +TEST_F(SigProcMaskTest, AliasedSets) { + sigset_t mask; + + // Set a mask in which only kTestSignal1 is blocked. + sigset_t mask1; + sigemptyset(&mask1); + sigaddset(&mask1, kTestSignal1); + mask = mask1; + ASSERT_THAT(raw_sigprocmask(SIG_SETMASK, &mask, nullptr), SyscallSucceeds()); + + // Exchange it with a mask in which only kTestSignal2 is blocked. + sigset_t mask2; + sigemptyset(&mask2); + sigaddset(&mask2, kTestSignal2); + mask = mask2; + ASSERT_THAT(raw_sigprocmask(SIG_SETMASK, &mask, &mask), SyscallSucceeds()); + + // Check that the exchange succeeeded: + // mask should now contain the previously-set mask blocking only kTestSignal1. + EXPECT_THAT(mask, EqualsSigset(mask1)); + // The current mask should block only kTestSignal2. 
+ ASSERT_THAT(raw_sigprocmask(0, nullptr, &mask), SyscallSucceeds()); + EXPECT_THAT(mask, EqualsSigset(mask2)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc new file mode 100644 index 000000000..e21d23d51 --- /dev/null +++ b/test/syscalls/linux/sigstop.cc @@ -0,0 +1,150 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_bool(sigstop_test_child, false, + "If true, run the SigstopTest child workload."); + +namespace gvisor { +namespace testing { + +namespace { + +constexpr absl::Duration kChildStartupDelay = absl::Seconds(5); +constexpr absl::Duration kChildMainThreadDelay = absl::Seconds(10); +constexpr absl::Duration kChildExtraThreadDelay = absl::Seconds(15); +constexpr absl::Duration kPostSIGSTOPDelay = absl::Seconds(20); + +// Comparisons on absl::Duration aren't yet constexpr (2017-07-14), so we +// can't just use static_assert. +TEST(SigstopTest, TimesAreRelativelyConsistent) { + EXPECT_LT(kChildStartupDelay, kChildMainThreadDelay) + << "Child process will exit before the parent process attempts to stop " + "it"; + EXPECT_LT(kChildMainThreadDelay, kChildExtraThreadDelay) + << "Secondary thread in child process will exit before main thread, " + "causing it to exit with the wrong code"; + EXPECT_LT(kChildExtraThreadDelay, kPostSIGSTOPDelay) + << "Parent process stops waiting before child process may exit if " + "improperly stopped, rendering the test ineffective"; +} + +// Exit codes communicated from the child workload to the parent test process. +constexpr int kChildMainThreadExitCode = 10; +constexpr int kChildExtraThreadExitCode = 11; + +TEST(SigstopTest, Correctness) { + pid_t child_pid = -1; + int execve_errno = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec("/proc/self/exe", {"/proc/self/exe", "--sigstop_test_child"}, + {}, nullptr, &child_pid, &execve_errno)); + + ASSERT_GT(child_pid, 0); + ASSERT_EQ(execve_errno, 0); + + // Wait for the child subprocess to start the second thread before stopping + // it. + absl::SleepFor(kChildStartupDelay); + ASSERT_THAT(kill(child_pid, SIGSTOP), SyscallSucceeds()); + int status; + EXPECT_THAT(RetryEINTR(waitpid)(child_pid, &status, WUNTRACED), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status)); + EXPECT_EQ(SIGSTOP, WSTOPSIG(status)); + + // Sleep for longer than either of the sleeps in the child subprocess, + // expecting the child to stay alive because it's stopped. + absl::SleepFor(kPostSIGSTOPDelay); + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Resume the child. 
+ ASSERT_THAT(kill(child_pid, SIGCONT), SyscallSucceeds()); + + EXPECT_THAT(RetryEINTR(waitpid)(child_pid, &status, WCONTINUED), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFCONTINUED(status)); + + // Expect it to die. + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), kChildMainThreadExitCode); +} + +// Like base:SleepFor, but tries to avoid counting time spent stopped due to a +// stop signal toward the sleep. +// +// This is required due to an inconsistency in how nanosleep(2) and stop signals +// interact on Linux. When nanosleep is interrupted, it writes the remaining +// time back to its second timespec argument, so that if nanosleep is +// interrupted by a signal handler then userspace can immediately call nanosleep +// again with that timespec. However, if nanosleep is automatically restarted +// (because it's interrupted by a signal that is not delivered to a handler, +// such as a stop signal), it's restarted based on the timer's former *absolute* +// expiration time (via ERESTART_RESTARTBLOCK => SYS_restart_syscall => +// hrtimer_nanosleep_restart). This means that time spent stopped is effectively +// counted as time spent sleeping, resulting in less time spent sleeping than +// expected. +// +// Dividing the sleep into multiple smaller sleeps limits the impact of this +// effect to the length of each sleep during which a stop occurs; for example, +// if a sleeping process is only stopped once, SleepIgnoreStopped can +// under-sleep by at most 100ms. +void SleepIgnoreStopped(absl::Duration d) { + absl::Duration const max_sleep = absl::Milliseconds(100); + while (d > absl::ZeroDuration()) { + absl::Duration to_sleep = std::min(d, max_sleep); + absl::SleepFor(to_sleep); + d -= to_sleep; + } +} + +void RunChild() { + // Start another thread that attempts to call exit_group with a different + // error code, in order to verify that SIGSTOP stops this thread as well. + ScopedThread t([] { + SleepIgnoreStopped(kChildExtraThreadDelay); + exit(kChildExtraThreadExitCode); + }); + SleepIgnoreStopped(kChildMainThreadDelay); + exit(kChildMainThreadExitCode); +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); + + if (FLAGS_sigstop_test_child) { + gvisor::testing::RunChild(); + return 1; + } + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc new file mode 100644 index 000000000..3a350fc28 --- /dev/null +++ b/test/syscalls/linux/sigtimedwait.cc @@ -0,0 +1,248 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
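
// The tests below exercise sigtimedwait(2): the caller blocks a signal and
// then waits for it synchronously; on timeout the call fails with EAGAIN. A
// minimal standalone sketch of that flow (illustrative, not from this patch):
//
//   #include <signal.h>
//   #include <time.h>
//
//   int WaitForUsr1(long timeout_sec) {
//     sigset_t set;
//     sigemptyset(&set);
//     sigaddset(&set, SIGUSR1);
//     sigprocmask(SIG_BLOCK, &set, nullptr);  // Must be blocked to be queued.
//     raise(SIGUSR1);                         // Make the signal pending.
//     struct timespec timeout = {};
//     timeout.tv_sec = timeout_sec;
//     siginfo_t info;
//     // Returns SIGUSR1 here; returns -1 with errno == EAGAIN on timeout.
//     return sigtimedwait(&set, &info, &timeout);
//   }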
+ +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/logging.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// N.B. main() blocks SIGALRM and SIGCHLD on all threads. + +constexpr int kAlarmSecs = 12; + +void NoopHandler(int sig, siginfo_t* info, void* context) {} + +TEST(SigtimedwaitTest, InvalidTimeout) { + sigset_t mask; + sigemptyset(&mask); + struct timespec timeout = {0, 1000000001}; + EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout), + SyscallFailsWithErrno(EINVAL)); + timeout = {-1, 0}; + EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout), + SyscallFailsWithErrno(EINVAL)); + timeout = {0, -1}; + EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout), + SyscallFailsWithErrno(EINVAL)); +} + +// No random save as the test relies on alarm timing. Cooperative save tests +// already cover the save between alarm and wait. +TEST(SigtimedwaitTest, AlarmReturnsAlarm_NoRandomSave) { + struct itimerval itv = {}; + itv.it_value.tv_sec = kAlarmSecs; + const auto itimer_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, itv)); + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGALRM); + siginfo_t info = {}; + EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, &info, nullptr), + SyscallSucceedsWithValue(SIGALRM)); + EXPECT_EQ(SIGALRM, info.si_signo); +} + +// No random save as the test relies on alarm timing. Cooperative save tests +// already cover the save between alarm and wait. +TEST(SigtimedwaitTest, NullTimeoutReturnsEINTR_NoRandomSave) { + struct sigaction sa; + sa.sa_sigaction = NoopHandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + const auto action_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa)); + + const auto mask_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM)); + + struct itimerval itv = {}; + itv.it_value.tv_sec = kAlarmSecs; + const auto itimer_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, itv)); + + sigset_t mask; + sigemptyset(&mask); + EXPECT_THAT(sigtimedwait(&mask, nullptr, nullptr), + SyscallFailsWithErrno(EINTR)); +} + +TEST(SigtimedwaitTest, LegitTimeoutReturnsEAGAIN) { + sigset_t mask; + sigemptyset(&mask); + struct timespec timeout = {1, 0}; // 1 second + EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &timeout), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(SigtimedwaitTest, ZeroTimeoutReturnsEAGAIN) { + sigset_t mask; + sigemptyset(&mask); + struct timespec timeout = {0, 0}; // 0 second + EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(SigtimedwaitTest, KillGeneratedSIGCHLD) { + EXPECT_THAT(kill(getpid(), SIGCHLD), SyscallSucceeds()); + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGCHLD); + struct timespec ts = {5, 0}; + EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &ts), + SyscallSucceedsWithValue(SIGCHLD)); +} + +TEST(SigtimedwaitTest, ChildExitGeneratedSIGCHLD) { + pid_t pid = fork(); + if (pid == 0) { + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + + int status; + EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status; + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGCHLD); + struct timespec ts = {5, 0}; + EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, 
&ts), + SyscallSucceedsWithValue(SIGCHLD)); +} + +TEST(SigtimedwaitTest, ChildExitGeneratedSIGCHLDWithHandler) { + // Setup handler for SIGCHLD, but don't unblock it. + struct sigaction sa; + sa.sa_sigaction = NoopHandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + const auto action_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGCHLD, sa)); + + pid_t pid = fork(); + if (pid == 0) { + _exit(0); + } + ASSERT_THAT(pid, SyscallSucceeds()); + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGCHLD); + struct timespec ts = {5, 0}; + EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &ts), + SyscallSucceedsWithValue(SIGCHLD)); + + int status; + EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status; +} + +TEST(SigtimedwaitTest, IgnoredUnmaskedSignal) { + constexpr int kSigno = SIGUSR1; + constexpr auto kSigtimedwaitSetupTime = absl::Seconds(2); + constexpr auto kSigtimedwaitTimeout = absl::Seconds(5); + ASSERT_GT(kSigtimedwaitTimeout, kSigtimedwaitSetupTime); + + // Ensure that kSigno is ignored, and unmasked on this thread. + struct sigaction sa = {}; + sa.sa_handler = SIG_IGN; + const auto scoped_sigaction = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa)); + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, kSigno); + auto scoped_sigmask = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, mask)); + + // Create a thread which will send us kSigno while we are blocked in + // sigtimedwait. + pid_t tid = gettid(); + ScopedThread sigthread([&] { + absl::SleepFor(kSigtimedwaitSetupTime); + EXPECT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds()); + }); + + // sigtimedwait should not observe kSigno since it is ignored and already + // unmasked, causing it to be dropped before it is enqueued. + struct timespec timeout_ts = absl::ToTimespec(kSigtimedwaitTimeout); + EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &timeout_ts), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(SigtimedwaitTest, IgnoredMaskedSignal) { + constexpr int kSigno = SIGUSR1; + constexpr auto kSigtimedwaitSetupTime = absl::Seconds(2); + constexpr auto kSigtimedwaitTimeout = absl::Seconds(5); + ASSERT_GT(kSigtimedwaitTimeout, kSigtimedwaitSetupTime); + + // Ensure that kSigno is ignored, and masked on this thread. + struct sigaction sa = {}; + sa.sa_handler = SIG_IGN; + const auto scoped_sigaction = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa)); + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, kSigno); + auto scoped_sigmask = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask)); + + // Create a thread which will send us kSigno while we are blocked in + // sigtimedwait. + pid_t tid = gettid(); + ScopedThread sigthread([&] { + absl::SleepFor(kSigtimedwaitSetupTime); + EXPECT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds()); + }); + + // sigtimedwait should observe kSigno since it is normally masked, causing it + // to be enqueued despite being ignored. + struct timespec timeout_ts = absl::ToTimespec(kSigtimedwaitTimeout); + EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &timeout_ts), + SyscallSucceedsWithValue(kSigno)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + // These tests depend on delivering SIGALRM/SIGCHLD to the main thread or in + // sigtimedwait. Block them so that any other threads created by TestInit will + // also have them blocked. 
+ sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGALRM); + sigaddset(&set, SIGCHLD); + TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0); + + gvisor::testing::TestInit(&argc, &argv); + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc new file mode 100644 index 000000000..7b111a2dd --- /dev/null +++ b/test/syscalls/linux/socket_abstract.cc @@ -0,0 +1,43 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_generic.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_unix.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc new file mode 100644 index 000000000..eea6f2810 --- /dev/null +++ b/test/syscalls/linux/socket_filesystem.cc @@ -0,0 +1,43 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
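
// socket_abstract.cc above and this file instantiate the same generic suites;
// they differ only in how the AF_UNIX sockets are bound. A hedged sketch of
// the two bind styles in plain POSIX (names here are illustrative):
//
//   #include <string.h>
//   #include <sys/socket.h>
//   #include <sys/un.h>
//
//   int BindUnix(int fd, const char* name, bool abstract) {
//     struct sockaddr_un addr = {};
//     addr.sun_family = AF_UNIX;
//     if (abstract) {
//       // Abstract namespace: sun_path starts with a NUL byte and the name
//       // never appears on the filesystem.
//       addr.sun_path[0] = '\0';
//       strncpy(addr.sun_path + 1, name, sizeof(addr.sun_path) - 2);
//     } else {
//       // Filesystem-bound: the name is a path that bind(2) creates.
//       strncpy(addr.sun_path, name, sizeof(addr.sun_path) - 1);
//     }
//     return bind(fd, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr));
//   }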
+ +#include + +#include "test/syscalls/linux/socket_generic.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_unix.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc new file mode 100644 index 000000000..fbc3bebed --- /dev/null +++ b/test/syscalls/linux/socket_generic.cc @@ -0,0 +1,403 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_generic.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +// This file is a generic socket test file. It must be built with another file +// that provides the test types. 
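
// The fixtures in this file receive already-connected socket pairs from the
// instantiating file, so each test reduces to patterns like the following
// minimal sketch over a plain AF_UNIX socketpair (illustrative only, not from
// this patch):
//
//   #include <string.h>
//   #include <sys/socket.h>
//   #include <unistd.h>
//
//   bool RoundTrip() {
//     int fds[2];
//     if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) != 0) return false;
//     const char msg[] = "abc";
//     char buf[sizeof(msg)] = {};
//     bool ok =
//         write(fds[0], msg, sizeof(msg)) ==
//             static_cast<ssize_t>(sizeof(msg)) &&          // Send one chunk.
//         read(fds[1], buf, sizeof(buf)) ==
//             static_cast<ssize_t>(sizeof(msg)) &&          // Receive it back.
//         memcmp(msg, buf, sizeof(msg)) == 0;               // Data unchanged.
//     close(fds[0]);
//     close(fds[1]);
//     return ok;
//   }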
+ +namespace gvisor { +namespace testing { + +TEST_P(AllSocketPairTest, BasicReadWrite) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char buf[20]; + const std::string data = "abc"; + ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), 3), + SyscallSucceedsWithValue(3)); + ASSERT_THAT(ReadFd(sockets->second_fd(), buf, 3), + SyscallSucceedsWithValue(3)); + EXPECT_EQ(data, absl::string_view(buf, 3)); +} + +TEST_P(AllSocketPairTest, BasicSendRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(AllSocketPairTest, BasicSendmmsg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[200]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + std::vector msgs(10); + std::vector iovs(msgs.size()); + const int chunk_size = sizeof(sent_data) / msgs.size(); + for (size_t i = 0; i < msgs.size(); i++) { + iovs[i].iov_len = chunk_size; + iovs[i].iov_base = &sent_data[i * chunk_size]; + msgs[i].msg_hdr.msg_iov = &iovs[i]; + msgs[i].msg_hdr.msg_iovlen = 1; + } + + ASSERT_THAT( + RetryEINTR(sendmmsg)(sockets->first_fd(), &msgs[0], msgs.size(), 0), + SyscallSucceedsWithValue(msgs.size())); + + for (const struct mmsghdr& msg : msgs) { + EXPECT_EQ(chunk_size, msg.msg_len); + } + + char received_data[sizeof(sent_data)]; + for (size_t i = 0; i < msgs.size(); i++) { + ASSERT_THAT(ReadFd(sockets->second_fd(), &received_data[i * chunk_size], + chunk_size), + SyscallSucceedsWithValue(chunk_size)); + } + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(AllSocketPairTest, BasicRecvmmsg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[200]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + char received_data[sizeof(sent_data)]; + std::vector msgs(10); + std::vector iovs(msgs.size()); + const int chunk_size = sizeof(sent_data) / msgs.size(); + for (size_t i = 0; i < msgs.size(); i++) { + iovs[i].iov_len = chunk_size; + iovs[i].iov_base = &received_data[i * chunk_size]; + msgs[i].msg_hdr.msg_iov = &iovs[i]; + msgs[i].msg_hdr.msg_iovlen = 1; + } + + for (size_t i = 0; i < msgs.size(); i++) { + ASSERT_THAT( + WriteFd(sockets->first_fd(), &sent_data[i * chunk_size], chunk_size), + SyscallSucceedsWithValue(chunk_size)); + } + + ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->second_fd(), &msgs[0], msgs.size(), + 0, nullptr), + SyscallSucceedsWithValue(msgs.size())); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + for (const struct mmsghdr& msg : msgs) { + EXPECT_EQ(chunk_size, msg.msg_len); + } +} + +TEST_P(AllSocketPairTest, SendmsgRecvmsg10KB) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + std::vector sent_data(10 * 1024); + RandomizeBuffer(sent_data.data(), sent_data.size()); + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data.data(), sent_data.size())); + + std::vector received_data(sent_data.size()); + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sockets->second_fd(), received_data.data(), + received_data.size())); + + EXPECT_EQ(0, + memcmp(sent_data.data(), 
received_data.data(), sent_data.size())); +} + +// This test validates that a sendmsg/recvmsg w/ MSG_CTRUNC is a no-op on +// input flags. +TEST_P(AllSocketPairTest, SendmsgRecvmsgMsgCtruncNoop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + std::vector sent_data(10 * 1024); + RandomizeBuffer(sent_data.data(), sent_data.size()); + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data.data(), sent_data.size())); + + std::vector received_data(sent_data.size()); + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(int)) + CMSG_SPACE(sizeof(struct ucred))]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct iovec iov; + iov.iov_base = &received_data[0]; + iov.iov_len = received_data.size(); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + // MSG_CTRUNC should be a no-op. + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC), + SyscallSucceedsWithValue(received_data.size())); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + EXPECT_EQ(cmsg, nullptr); + EXPECT_EQ(msg.msg_controllen, 0); + EXPECT_EQ(0, + memcmp(sent_data.data(), received_data.data(), sent_data.size())); +} + +TEST_P(AllSocketPairTest, SendmsgRecvmsg16KB) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + std::vector sent_data(16 * 1024); + RandomizeBuffer(sent_data.data(), sent_data.size()); + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data.data(), sent_data.size())); + + std::vector received_data(sent_data.size()); + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sockets->second_fd(), received_data.data(), + received_data.size())); + + EXPECT_EQ(0, + memcmp(sent_data.data(), received_data.data(), sent_data.size())); +} + +TEST_P(AllSocketPairTest, RecvmmsgInvalidTimeout) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char buf[10]; + struct mmsghdr msg = {}; + struct iovec iov = {}; + iov.iov_len = sizeof(buf); + iov.iov_base = buf; + msg.msg_hdr.msg_iov = &iov; + msg.msg_hdr.msg_iovlen = 1; + struct timespec timeout = {-1, -1}; + ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->first_fd(), &msg, 1, 0, &timeout), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(AllSocketPairTest, RecvmmsgTimeoutBeforeRecv) { + // There is a known bug in the Linux recvmmsg(2) causing it to block forever + // if the timeout expires while blocking for the first message. 
+ SKIP_IF(!IsRunningOnGvisor()); + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char buf[10]; + struct mmsghdr msg = {}; + struct iovec iov = {}; + iov.iov_len = sizeof(buf); + iov.iov_base = buf; + msg.msg_hdr.msg_iov = &iov; + msg.msg_hdr.msg_iovlen = 1; + struct timespec timeout = {}; + ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->first_fd(), &msg, 1, 0, &timeout), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, MsgPeek) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[50]; + memset(&sent_data, 0, sizeof(sent_data)); + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data)]; + for (int i = 0; i < 3; i++) { + memset(received_data, 0, sizeof(received_data)); + EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_PEEK), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data))); + } + + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data))); +} + +TEST_P(AllSocketPairTest, LingerSocketOption) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + struct linger got_linger = {-1, -1}; + socklen_t length = sizeof(struct linger); + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, + &got_linger, &length), + SyscallSucceedsWithValue(0)); + struct linger want_linger = {}; + EXPECT_EQ(0, memcmp(&want_linger, &got_linger, sizeof(struct linger))); + EXPECT_EQ(sizeof(struct linger), length); +} + +TEST_P(AllSocketPairTest, KeepAliveSocketOption) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + int keepalive = -1; + socklen_t length = sizeof(int); + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, + &keepalive, &length), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(0, keepalive); + EXPECT_EQ(sizeof(int), length); +} + +TEST_P(AllSocketPairTest, RcvBufSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + int size = 0; + socklen_t size_size = sizeof(size); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &size, &size_size), + SyscallSucceeds()); + EXPECT_GT(size, 0); +} + +TEST_P(AllSocketPairTest, SndBufSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + int size = 0; + socklen_t size_size = sizeof(size); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &size, &size_size), + SyscallSucceeds()); + EXPECT_GT(size, 0); +} + +TEST_P(AllSocketPairTest, RecvTimeoutSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, RecvTimeoutOneSecondSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 1, .tv_usec = 0 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0), + 
SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, RecvmsgTimeoutSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + struct msghdr msg = {}; + char buf[20] = {}; + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + EXPECT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, SoRcvTimeoIsSet) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 35 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval_with_extra { + struct timeval tv; + int64_t extra_data; + } ABSL_ATTRIBUTE_PACKED; + + timeval_with_extra tv_extra; + tv_extra.tv.tv_sec = 0; + tv_extra.tv.tv_usec = 25; + + EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, + &tv_extra, sizeof(tv_extra)), + SyscallSucceeds()); +} + +TEST_P(AllSocketPairTest, RecvmsgTimeoutOneSecondSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 1, .tv_usec = 0 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + struct msghdr msg = {}; + char buf[20] = {}; + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + EXPECT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, RecvWaitAll) { + SKIP_IF(IsRunningOnGvisor()); // FIXME: Support MSG_WAITALL. + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[100]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data)] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_WAITALL), + SyscallSucceedsWithValue(sizeof(sent_data))); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_generic.h b/test/syscalls/linux/socket_generic.h new file mode 100644 index 000000000..cd826abcf --- /dev/null +++ b/test/syscalls/linux/socket_generic.h @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
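
// The timeout tests in socket_generic.cc boil down to the SO_RCVTIMEO
// contract: once a receive timeout is set, a blocking recv(2) on an empty
// socket returns -1 with EAGAIN/EWOULDBLOCK after roughly that interval
// instead of blocking indefinitely. Minimal sketch (illustrative, not from
// this patch):
//
//   #include <sys/socket.h>
//   #include <sys/time.h>
//
//   int SetRecvTimeout(int fd, long sec, long usec) {
//     struct timeval tv = {};
//     tv.tv_sec = sec;    // Whole seconds...
//     tv.tv_usec = usec;  // ...plus microseconds.
//     return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
//   }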
+ +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of blocking and non-blocking +// connected stream sockets. +using AllSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_ diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc new file mode 100644 index 000000000..7bdbd7797 --- /dev/null +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -0,0 +1,812 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr AddrPort(int family, sockaddr_storage const& addr) { + switch (family) { + case AF_INET: + return static_cast( + reinterpret_cast(&addr)->sin_port); + case AF_INET6: + return static_cast( + reinterpret_cast(&addr)->sin6_port); + default: + return PosixError(EINVAL, + absl::StrCat("unknown socket family: ", family)); + } +} + +PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) { + switch (family) { + case AF_INET: + reinterpret_cast(addr)->sin_port = port; + return NoError(); + case AF_INET6: + reinterpret_cast(addr)->sin6_port = port; + return NoError(); + default: + return PosixError(EINVAL, + absl::StrCat("unknown socket family: ", family)); + } +} + +struct TestAddress { + std::string description; + sockaddr_storage addr; + socklen_t addr_len; + + int family() const { return addr.ss_family; } + explicit TestAddress(std::string description = "") + : description(std::move(description)), addr(), addr_len() {} +}; + +TestAddress V4Any() { + TestAddress t("V4Any"); + t.addr.ss_family = AF_INET; + t.addr_len = sizeof(sockaddr_in); + reinterpret_cast(&t.addr)->sin_addr.s_addr = htonl(INADDR_ANY); + return t; +} + +TestAddress V4Loopback() { + TestAddress t("V4Loopback"); + t.addr.ss_family = AF_INET; + t.addr_len = sizeof(sockaddr_in); + reinterpret_cast(&t.addr)->sin_addr.s_addr = + htonl(INADDR_LOOPBACK); + return t; +} + +TestAddress V4MappedAny() { + TestAddress t("V4MappedAny"); + t.addr.ss_family = AF_INET6; + t.addr_len = sizeof(sockaddr_in6); + inet_pton(AF_INET6, "::ffff:0.0.0.0", + reinterpret_cast(&t.addr)->sin6_addr.s6_addr); + return t; +} + +TestAddress V4MappedLoopback() { + TestAddress t("V4MappedLoopback"); + t.addr.ss_family = AF_INET6; + t.addr_len = sizeof(sockaddr_in6); + inet_pton(AF_INET6, "::ffff:127.0.0.1", + reinterpret_cast(&t.addr)->sin6_addr.s6_addr); + return t; +} + +TestAddress V6Any() { + 
TestAddress t("V6Any"); + t.addr.ss_family = AF_INET6; + t.addr_len = sizeof(sockaddr_in6); + reinterpret_cast(&t.addr)->sin6_addr = in6addr_any; + return t; +} + +TestAddress V6Loopback() { + TestAddress t("V6Loopback"); + t.addr.ss_family = AF_INET6; + t.addr_len = sizeof(sockaddr_in6); + reinterpret_cast(&t.addr)->sin6_addr = in6addr_loopback; + return t; +} + +struct TestParam { + TestAddress listener; + TestAddress connector; +}; + +std::string DescribeTestParam(::testing::TestParamInfo const& info) { + return absl::StrCat("Listen", info.param.listener.description, "_Connect", + info.param.connector.description); +} + +using SocketInetLoopbackTest = ::testing::TestWithParam; + +TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) { + int fd[2] = {}; + + // Valid AF but invalid for socketpair(2) return ESOCKTNOSUPPORT. + ASSERT_THAT(socketpair(AF_INET, 0, 0, fd), + SyscallFailsWithErrno(ESOCKTNOSUPPORT)); + ASSERT_THAT(socketpair(AF_INET6, 0, 0, fd), + SyscallFailsWithErrno(ESOCKTNOSUPPORT)); + + // Invalid AF will return ENOAFSUPPORT. + ASSERT_THAT(socketpair(AF_MAX, 0, 0, fd), + SyscallFailsWithErrno(EAFNOSUPPORT)); + ASSERT_THAT(socketpair(8675309, 0, 0, fd), + SyscallFailsWithErrno(EAFNOSUPPORT)); +} + +TEST_P(SocketInetLoopbackTest, TCP) { + auto const& param = GetParam(); + + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast(&listen_addr), + listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), + reinterpret_cast(&listen_addr), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + + // Connect to the listening socket. + const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), + reinterpret_cast(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + // Accept the connection. + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + + ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RDWR), SyscallSucceeds()); + + ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds()); +} + +INSTANTIATE_TEST_CASE_P( + All, SocketInetLoopbackTest, + ::testing::Values( + // Listeners bound to IPv4 addresses refuse connections using IPv6 + // addresses. + TestParam{V4Any(), V4Any()}, TestParam{V4Any(), V4Loopback()}, + TestParam{V4Any(), V4MappedAny()}, + TestParam{V4Any(), V4MappedLoopback()}, + TestParam{V4Loopback(), V4Any()}, TestParam{V4Loopback(), V4Loopback()}, + TestParam{V4Loopback(), V4MappedLoopback()}, + TestParam{V4MappedAny(), V4Any()}, + TestParam{V4MappedAny(), V4Loopback()}, + TestParam{V4MappedAny(), V4MappedAny()}, + TestParam{V4MappedAny(), V4MappedLoopback()}, + TestParam{V4MappedLoopback(), V4Any()}, + TestParam{V4MappedLoopback(), V4Loopback()}, + TestParam{V4MappedLoopback(), V4MappedLoopback()}, + + // Listeners bound to IN6ADDR_ANY accept all connections. 
+ TestParam{V6Any(), V4Any()}, TestParam{V6Any(), V4Loopback()}, + TestParam{V6Any(), V4MappedAny()}, + TestParam{V6Any(), V4MappedLoopback()}, TestParam{V6Any(), V6Any()}, + TestParam{V6Any(), V6Loopback()}, + + // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4 + // addresses. + TestParam{V6Loopback(), V6Any()}, + TestParam{V6Loopback(), V6Loopback()}), + DescribeTestParam); + +struct ProtocolTestParam { + std::string description; + int type; +}; + +std::string DescribeProtocolTestParam( + ::testing::TestParamInfo const& info) { + return info.param.description; +} + +using SocketMultiProtocolInetLoopbackTest = + ::testing::TestWithParam; + +TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedLoopbackOnlyReservesV4) { + auto const& param = GetParam(); + + for (int i = 0; true; i++) { + // Bind the v4 loopback on a dual stack socket. + TestAddress const& test_addr_dual = V4MappedLoopback(); + sockaddr_storage addr_dual = test_addr_dual.addr; + const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_dual.family(), param.type, 0)); + ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast(&addr_dual), + test_addr_dual.addr_len), + SyscallSucceeds()); + + // Get the port that we bound. + socklen_t addrlen = test_addr_dual.addr_len; + ASSERT_THAT(getsockname(fd_dual.get(), + reinterpret_cast(&addr_dual), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual)); + + // Verify that we can still bind the v6 loopback on the same port. + TestAddress const& test_addr_v6 = V6Loopback(); + sockaddr_storage addr_v6 = test_addr_v6.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port)); + const FileDescriptor fd_v6 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0)); + int ret = bind(fd_v6.get(), reinterpret_cast(&addr_v6), + test_addr_v6.addr_len); + if (ret == -1 && errno == EADDRINUSE) { + // Port may have been in use. + ASSERT_LT(i, 100); // Give up after 100 tries. + continue; + } + ASSERT_THAT(ret, SyscallSucceeds()); + + // Verify that binding the v4 loopback with the same port on a v4 socket + // fails. + TestAddress const& test_addr_v4 = V4Loopback(); + sockaddr_storage addr_v4 = test_addr_v4.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4.family(), &addr_v4, port)); + const FileDescriptor fd_v4 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v4.get(), reinterpret_cast(&addr_v4), + test_addr_v4.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // No need to try again. + break; + } +} + +TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedAnyOnlyReservesV4) { + auto const& param = GetParam(); + + for (int i = 0; true; i++) { + // Bind the v4 any on a dual stack socket. + TestAddress const& test_addr_dual = V4MappedAny(); + sockaddr_storage addr_dual = test_addr_dual.addr; + const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_dual.family(), param.type, 0)); + ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast(&addr_dual), + test_addr_dual.addr_len), + SyscallSucceeds()); + + // Get the port that we bound. + socklen_t addrlen = test_addr_dual.addr_len; + ASSERT_THAT(getsockname(fd_dual.get(), + reinterpret_cast(&addr_dual), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual)); + + // Verify that we can still bind the v6 loopback on the same port. 
+ TestAddress const& test_addr_v6 = V6Loopback(); + sockaddr_storage addr_v6 = test_addr_v6.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port)); + const FileDescriptor fd_v6 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0)); + int ret = bind(fd_v6.get(), reinterpret_cast(&addr_v6), + test_addr_v6.addr_len); + if (ret == -1 && errno == EADDRINUSE) { + // Port may have been in use. + ASSERT_LT(i, 100); // Give up after 100 tries. + continue; + } + ASSERT_THAT(ret, SyscallSucceeds()); + + // Verify that binding the v4 loopback with the same port on a v4 socket + // fails. + TestAddress const& test_addr_v4 = V4Loopback(); + sockaddr_storage addr_v4 = test_addr_v4.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4.family(), &addr_v4, port)); + const FileDescriptor fd_v4 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v4.get(), reinterpret_cast(&addr_v4), + test_addr_v4.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // No need to try again. + break; + } +} + +TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReservesEverything) { + auto const& param = GetParam(); + + // Bind the v6 any on a dual stack socket. + TestAddress const& test_addr_dual = V6Any(); + sockaddr_storage addr_dual = test_addr_dual.addr; + const FileDescriptor fd_dual = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_dual.family(), param.type, 0)); + ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast(&addr_dual), + test_addr_dual.addr_len), + SyscallSucceeds()); + + // Get the port that we bound. + socklen_t addrlen = test_addr_dual.addr_len; + ASSERT_THAT(getsockname(fd_dual.get(), + reinterpret_cast(&addr_dual), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual)); + + // Verify that binding the v6 loopback with the same port fails. + TestAddress const& test_addr_v6 = V6Loopback(); + sockaddr_storage addr_v6 = test_addr_v6.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port)); + const FileDescriptor fd_v6 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v6.get(), reinterpret_cast(&addr_v6), + test_addr_v6.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v4 loopback on the same port with a v6 socket + // fails. + TestAddress const& test_addr_v4_mapped = V4MappedLoopback(); + sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped, port)); + const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v4_mapped.family(), param.type, 0)); + ASSERT_THAT( + bind(fd_v4_mapped.get(), reinterpret_cast(&addr_v4_mapped), + test_addr_v4_mapped.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v4 loopback on the same port with a v4 socket + // fails. + TestAddress const& test_addr_v4 = V4Loopback(); + sockaddr_storage addr_v4 = test_addr_v4.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4.family(), &addr_v4, port)); + const FileDescriptor fd_v4 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v4.get(), reinterpret_cast(&addr_v4), + test_addr_v4.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); +} + +TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) { + auto const& param = GetParam(); + + for (int i = 0; true; i++) { + // Bind the v6 any on a v6-only socket. 
+ TestAddress const& test_addr_dual = V6Any(); + sockaddr_storage addr_dual = test_addr_dual.addr; + const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_dual.family(), param.type, 0)); + int one = 1; + EXPECT_THAT( + setsockopt(fd_dual.get(), IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)), + SyscallSucceeds()); + ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast(&addr_dual), + test_addr_dual.addr_len), + SyscallSucceeds()); + + // Get the port that we bound. + socklen_t addrlen = test_addr_dual.addr_len; + ASSERT_THAT(getsockname(fd_dual.get(), + reinterpret_cast(&addr_dual), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual)); + + // Verify that binding the v6 loopback with the same port fails. + TestAddress const& test_addr_v6 = V6Loopback(); + sockaddr_storage addr_v6 = test_addr_v6.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port)); + const FileDescriptor fd_v6 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v6.get(), reinterpret_cast(&addr_v6), + test_addr_v6.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that we can still bind the v4 loopback on the same port. + TestAddress const& test_addr_v4_mapped = V4MappedLoopback(); + sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped, port)); + const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v4_mapped.family(), param.type, 0)); + int ret = + bind(fd_v4_mapped.get(), reinterpret_cast(&addr_v4_mapped), + test_addr_v4_mapped.addr_len); + if (ret == -1 && errno == EADDRINUSE) { + // Port may have been in use. + ASSERT_LT(i, 100); // Give up after 100 tries. + continue; + } + ASSERT_THAT(ret, SyscallSucceeds()); + + // No need to try again. + break; + } +} + +TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) { + auto const& param = GetParam(); + + // FIXME + SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); + + for (int i = 0; true; i++) { + // Bind the v6 loopback on a dual stack socket. + TestAddress const& test_addr = V6Loopback(); + sockaddr_storage bound_addr = test_addr.addr; + const FileDescriptor bound_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast(&bound_addr), + test_addr.addr_len), + SyscallSucceeds()); + + // Listen iff TCP. + if (param.type == SOCK_STREAM) { + ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); + } + + // Get the port that we bound. + socklen_t bound_addr_len = test_addr.addr_len; + ASSERT_THAT( + getsockname(bound_fd.get(), reinterpret_cast(&bound_addr), + &bound_addr_len), + SyscallSucceeds()); + + // Connect to bind an ephemeral port. + const FileDescriptor connected_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT( + connect(connected_fd.get(), reinterpret_cast(&bound_addr), + bound_addr_len), + SyscallSucceeds()); + + // Get the ephemeral port. + sockaddr_storage connected_addr = {}; + socklen_t connected_addr_len = sizeof(connected_addr); + ASSERT_THAT(getsockname(connected_fd.get(), + reinterpret_cast(&connected_addr), + &connected_addr_len), + SyscallSucceeds()); + uint16_t const ephemeral_port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); + + // Verify that we actually got an ephemeral port. 
+ ASSERT_NE(ephemeral_port, 0); + + // Verify that the ephemeral port is reserved. + const FileDescriptor checking_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + EXPECT_THAT( + bind(checking_fd.get(), reinterpret_cast(&connected_addr), + connected_addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v6 loopback with the same port fails. + TestAddress const& test_addr_v6 = V6Loopback(); + sockaddr_storage addr_v6 = test_addr_v6.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v6.family(), &addr_v6, ephemeral_port)); + const FileDescriptor fd_v6 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v6.get(), reinterpret_cast(&addr_v6), + test_addr_v6.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v4 any with the same port fails. + TestAddress const& test_addr_v4_any = V4Any(); + sockaddr_storage addr_v4_any = test_addr_v4_any.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v4_any.family(), &addr_v4_any, ephemeral_port)); + const FileDescriptor fd_v4_any = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v4_any.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v4_any.get(), reinterpret_cast(&addr_v4_any), + test_addr_v4_any.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that we can still bind the v4 loopback on the same port. + TestAddress const& test_addr_v4_mapped = V4MappedLoopback(); + sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped, + ephemeral_port)); + const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v4_mapped.family(), param.type, 0)); + int ret = + bind(fd_v4_mapped.get(), reinterpret_cast(&addr_v4_mapped), + test_addr_v4_mapped.addr_len); + if (ret == -1 && errno == EADDRINUSE) { + // Port may have been in use. + ASSERT_LT(i, 100); // Give up after 100 tries. + continue; + } + EXPECT_THAT(ret, SyscallSucceeds()); + + // No need to try again. + break; + } +} + +TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) { + auto const& param = GetParam(); + + // FIXME + SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); + + for (int i = 0; true; i++) { + // Bind the v4 loopback on a dual stack socket. + TestAddress const& test_addr = V4MappedLoopback(); + sockaddr_storage bound_addr = test_addr.addr; + const FileDescriptor bound_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast(&bound_addr), + test_addr.addr_len), + SyscallSucceeds()); + + // Listen iff TCP. + if (param.type == SOCK_STREAM) { + ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); + } + + // Get the port that we bound. + socklen_t bound_addr_len = test_addr.addr_len; + ASSERT_THAT( + getsockname(bound_fd.get(), reinterpret_cast(&bound_addr), + &bound_addr_len), + SyscallSucceeds()); + + // Connect to bind an ephemeral port. + const FileDescriptor connected_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT( + connect(connected_fd.get(), reinterpret_cast(&bound_addr), + bound_addr_len), + SyscallSucceeds()); + + // Get the ephemeral port. 
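The bind-to-port-0 / connect / getsockname() sequence above is how these tests learn the kernel-assigned ephemeral port. A minimal IPv4-only sketch of the same idea outside the test harness, with error handling mostly elided:

#include <arpa/inet.h>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  // Listener on 127.0.0.1 with a kernel-chosen port.
  int listener = socket(AF_INET, SOCK_STREAM, 0);
  sockaddr_in addr{};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
  bind(listener, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
  listen(listener, 1);
  socklen_t len = sizeof(addr);
  getsockname(listener, reinterpret_cast<sockaddr*>(&addr), &len);

  // Connecting implicitly binds the client to an ephemeral port.
  int client = socket(AF_INET, SOCK_STREAM, 0);
  connect(client, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
  sockaddr_in local{};
  len = sizeof(local);
  getsockname(client, reinterpret_cast<sockaddr*>(&local), &len);
  printf("ephemeral port: %u\n", ntohs(local.sin_port));

  // That port is now reserved: an explicit bind to it fails.
  int other = socket(AF_INET, SOCK_STREAM, 0);
  if (bind(other, reinterpret_cast<sockaddr*>(&local), sizeof(local)) == -1) {
    printf("rebind failed as expected: %s\n", strerror(errno));  // EADDRINUSE
  }

  close(other);
  close(client);
  close(listener);
  return 0;
}

The reservation holds for as long as the connected socket owns the port, which is exactly what the EphemeralPortReserved tests assert across the v4, v4-mapped, and v6 address spaces.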
+ sockaddr_storage connected_addr = {}; + socklen_t connected_addr_len = sizeof(connected_addr); + ASSERT_THAT(getsockname(connected_fd.get(), + reinterpret_cast(&connected_addr), + &connected_addr_len), + SyscallSucceeds()); + uint16_t const ephemeral_port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); + + // Verify that we actually got an ephemeral port. + ASSERT_NE(ephemeral_port, 0); + + // Verify that the ephemeral port is reserved. + const FileDescriptor checking_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + EXPECT_THAT( + bind(checking_fd.get(), reinterpret_cast(&connected_addr), + connected_addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v4 loopback on the same port with a v4 socket + // fails. + TestAddress const& test_addr_v4 = V4Loopback(); + sockaddr_storage addr_v4 = test_addr_v4.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v4.family(), &addr_v4, ephemeral_port)); + const FileDescriptor fd_v4 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0)); + EXPECT_THAT(bind(fd_v4.get(), reinterpret_cast(&addr_v4), + test_addr_v4.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v6 any on the same port with a dual-stack socket + // fails. + TestAddress const& test_addr_v6_any = V6Any(); + sockaddr_storage addr_v6_any = test_addr_v6_any.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v6_any.family(), &addr_v6_any, ephemeral_port)); + const FileDescriptor fd_v6_any = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v6_any.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v6_any.get(), reinterpret_cast(&addr_v6_any), + test_addr_v6_any.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // For some reason, binding the TCP v6-only any is flaky on Linux. Maybe we + // tend to run out of ephemeral ports? Regardless, binding the v6 loopback + // seems pretty reliable. Only try to bind the v6-only any on UDP and + // gVisor. + + int ret = -1; + + if (!IsRunningOnGvisor() && param.type == SOCK_STREAM) { + // Verify that we can still bind the v6 loopback on the same port. + TestAddress const& test_addr_v6 = V6Loopback(); + sockaddr_storage addr_v6 = test_addr_v6.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v6.family(), &addr_v6, ephemeral_port)); + const FileDescriptor fd_v6 = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v6.family(), param.type, 0)); + ret = bind(fd_v6.get(), reinterpret_cast(&addr_v6), + test_addr_v6.addr_len); + } else { + // Verify that we can still bind the v6 any on the same port with a + // v6-only socket. + const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v6_any.family(), param.type, 0)); + int one = 1; + EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY, + &one, sizeof(one)), + SyscallSucceeds()); + ret = + bind(fd_v6_only_any.get(), reinterpret_cast(&addr_v6_any), + test_addr_v6_any.addr_len); + } + + if (ret == -1 && errno == EADDRINUSE) { + // Port may have been in use. + ASSERT_LT(i, 100); // Give up after 100 tries. + continue; + } + EXPECT_THAT(ret, SyscallSucceeds()); + + // No need to try again. + break; + } +} + +TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) { + auto const& param = GetParam(); + + // FIXME + SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); + + for (int i = 0; true; i++) { + // Bind the v4 loopback on a v4 socket. 
+ TestAddress const& test_addr = V4Loopback(); + sockaddr_storage bound_addr = test_addr.addr; + const FileDescriptor bound_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast(&bound_addr), + test_addr.addr_len), + SyscallSucceeds()); + + // Listen iff TCP. + if (param.type == SOCK_STREAM) { + ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); + } + + // Get the port that we bound. + socklen_t bound_addr_len = test_addr.addr_len; + ASSERT_THAT( + getsockname(bound_fd.get(), reinterpret_cast(&bound_addr), + &bound_addr_len), + SyscallSucceeds()); + + // Connect to bind an ephemeral port. + const FileDescriptor connected_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT( + connect(connected_fd.get(), reinterpret_cast(&bound_addr), + bound_addr_len), + SyscallSucceeds()); + + // Get the ephemeral port. + sockaddr_storage connected_addr = {}; + socklen_t connected_addr_len = sizeof(connected_addr); + ASSERT_THAT(getsockname(connected_fd.get(), + reinterpret_cast(&connected_addr), + &connected_addr_len), + SyscallSucceeds()); + uint16_t const ephemeral_port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); + + // Verify that we actually got an ephemeral port. + ASSERT_NE(ephemeral_port, 0); + + // Verify that the ephemeral port is reserved. + const FileDescriptor checking_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + EXPECT_THAT( + bind(checking_fd.get(), reinterpret_cast(&connected_addr), + connected_addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v4 loopback on the same port with a v6 socket + // fails. + TestAddress const& test_addr_v4_mapped = V4MappedLoopback(); + sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr; + ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped, + ephemeral_port)); + const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v4_mapped.family(), param.type, 0)); + EXPECT_THAT( + bind(fd_v4_mapped.get(), reinterpret_cast(&addr_v4_mapped), + test_addr_v4_mapped.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // Verify that binding the v6 any on the same port with a dual-stack socket + // fails. + TestAddress const& test_addr_v6_any = V6Any(); + sockaddr_storage addr_v6_any = test_addr_v6_any.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v6_any.family(), &addr_v6_any, ephemeral_port)); + const FileDescriptor fd_v6_any = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v6_any.family(), param.type, 0)); + ASSERT_THAT(bind(fd_v6_any.get(), reinterpret_cast(&addr_v6_any), + test_addr_v6_any.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); + + // For some reason, binding the TCP v6-only any is flaky on Linux. Maybe we + // tend to run out of ephemeral ports? Regardless, binding the v6 loopback + // seems pretty reliable. Only try to bind the v6-only any on UDP and + // gVisor. + + int ret = -1; + + if (!IsRunningOnGvisor() && param.type == SOCK_STREAM) { + // Verify that we can still bind the v6 loopback on the same port. 
+ TestAddress const& test_addr_v6 = V6Loopback(); + sockaddr_storage addr_v6 = test_addr_v6.addr; + ASSERT_NO_ERRNO( + SetAddrPort(test_addr_v6.family(), &addr_v6, ephemeral_port)); + const FileDescriptor fd_v6 = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v6.family(), param.type, 0)); + ret = bind(fd_v6.get(), reinterpret_cast(&addr_v6), + test_addr_v6.addr_len); + } else { + // Verify that we can still bind the v6 any on the same port with a + // v6-only socket. + const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE( + Socket(test_addr_v6_any.family(), param.type, 0)); + int one = 1; + EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY, + &one, sizeof(one)), + SyscallSucceeds()); + ret = + bind(fd_v6_only_any.get(), reinterpret_cast(&addr_v6_any), + test_addr_v6_any.addr_len); + } + + if (ret == -1 && errno == EADDRINUSE) { + // Port may have been in use. + ASSERT_LT(i, 100); // Give up after 100 tries. + continue; + } + EXPECT_THAT(ret, SyscallSucceeds()); + + // No need to try again. + break; + } +} + +INSTANTIATE_TEST_CASE_P(AllFamlies, SocketMultiProtocolInetLoopbackTest, + ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM}, + ProtocolTestParam{"UDP", SOCK_DGRAM}), + DescribeProtocolTestParam); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc new file mode 100644 index 000000000..bb5a83c9a --- /dev/null +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -0,0 +1,392 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_ip_tcp_generic.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(TCPSocketPairTest, TcpInfoSucceedes) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct tcp_info opt = {}; + socklen_t optLen = sizeof(opt); + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen), + SyscallSucceeds()); +} + +TEST_P(TCPSocketPairTest, ShortTcpInfoSucceedes) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct tcp_info opt = {}; + socklen_t optLen = 1; + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen), + SyscallSucceeds()); +} + +TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceedes) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct tcp_info opt = {}; + socklen_t optLen = 0; + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen), + SyscallSucceeds()); +} + +// This test validates that an RST is sent instead of a FIN when data is +// unread on calls to close(2). 
+TEST_P(TCPSocketPairTest, RSTSentOnCloseWithUnreadData) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until t_ sees the data on its side but don't read it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0}; + constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Now close the connected without reading the data. + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); + + // Wait for the other end to receive the RST (up to 20 seconds). + struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0}; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // A shutdown with unread data will cause a RST to be sent instead + // of a FIN, per RFC 2525 section 2.17; this is also what Linux does. + ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)), + SyscallFailsWithErrno(ECONNRESET)); +} + +// This test will validate that a RST will cause POLLHUP to trigger. +TEST_P(TCPSocketPairTest, RSTCausesPollHUP) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until second sees the data on its side but don't read it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0}; + constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN); + + // Confirm we at least have one unread byte. + int bytes_available = 0; + ASSERT_THAT( + RetryEINTR(ioctl)(sockets->second_fd(), FIONREAD, &bytes_available), + SyscallSucceeds()); + EXPECT_GT(bytes_available, 0); + + // Now close the connected socket without reading the data from the second, + // this will cause a RST and we should see that with POLLHUP. + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); + + // Wait for the other end to receive the RST (up to 20 seconds). + struct pollfd poll_fd3 = {sockets->first_fd(), POLLHUP, 0}; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd3, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + ASSERT_NE(poll_fd.revents & (POLLHUP | POLLIN), 0); +} + +// This test validates that even if a RST is sent the other end will not +// get an ECONNRESET until it's read all data. +TEST_P(TCPSocketPairTest, RSTSentOnCloseWithUnreadDataAllowsReadBuffered) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + ASSERT_THAT(RetryEINTR(write)(sockets->second_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until second sees the data on its side but don't read it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0}; + constexpr int kPollTimeoutMs = 30000; // Wait up to 30 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Wait until first sees the data on its side but don't read it. 
+ struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Now close the connected socket without reading the data from the second. + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); + + // Wait for the other end to receive the RST (up to 30 seconds). + struct pollfd poll_fd3 = {sockets->first_fd(), POLLHUP, 0}; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd3, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Since we also have data buffered we should be able to read it before + // the syscall will fail with ECONNRESET. + ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // A shutdown with unread data will cause a RST to be sent instead + // of a FIN, per RFC 2525 section 2.17; this is also what Linux does. + ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)), + SyscallFailsWithErrno(ECONNRESET)); +} + +// This test will verify that a clean shutdown (FIN) is preformed when there +// is unread data but only the write side is closed. +TEST_P(TCPSocketPairTest, FINSentOnShutdownWrWithUnreadData) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until t_ sees the data on its side but don't read it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0}; + constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Now shutdown the write end leaving the read end open. + ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_WR), SyscallSucceeds()); + + // Wait for the other end to receive the FIN (up to 20 seconds). + struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0}; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Since we didn't shutdown the read end this will be a clean close. + ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(0)); +} + +// This test will verify that when data is received by a socket, even if it's +// not read SHUT_RD will not cause any packets to be generated and data will +// remain in the buffer and can be read later. +TEST_P(TCPSocketPairTest, ShutdownRdShouldCauseNoPacketsWithUnreadData) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until t_ sees the data on its side but don't read it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0}; + constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Now shutdown the read end, this will generate no packets to the other end. + ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RD), SyscallSucceeds()); + + // We should not receive any events on the other side of the socket. + struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0}; + constexpr int kPollNoResponseTimeoutMs = 3000; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollNoResponseTimeoutMs), + SyscallSucceedsWithValue(0)); // Timeout. 
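The close-versus-shutdown distinction exercised in the tests above can be reproduced with plain sockets. A rough loopback-only sketch, with error handling elided and structure illustrative only: closing a socket that still holds unread receive data elicits an RST, so the peer's pending read fails with ECONNRESET.

#include <arpa/inet.h>
#include <cerrno>
#include <cstdio>
#include <netinet/in.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  // Build a connected TCP pair over loopback.
  int listener = socket(AF_INET, SOCK_STREAM, 0);
  sockaddr_in addr{};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
  bind(listener, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
  listen(listener, 1);
  socklen_t len = sizeof(addr);
  getsockname(listener, reinterpret_cast<sockaddr*>(&addr), &len);

  int client = socket(AF_INET, SOCK_STREAM, 0);
  connect(client, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
  int server = accept(listener, nullptr, nullptr);

  // Queue data that the server never reads.
  char buf[10] = {};
  (void)write(client, buf, sizeof(buf));

  // Wait until the data is visible on the server side, as the tests do, then
  // close without reading it: unread data turns the usual FIN into an RST
  // (RFC 2525 section 2.17).
  pollfd pfd = {server, POLLIN, 0};
  poll(&pfd, 1, 5000);
  close(server);

  // The peer's blocking read fails once the RST arrives.
  if (read(client, buf, sizeof(buf)) == -1 && errno == ECONNRESET) {
    printf("close with unread data produced ECONNRESET\n");
  }

  close(client);
  close(listener);
  return 0;
}

Replacing close(server) with shutdown(server, SHUT_WR) sends a clean FIN instead, so the same read returns 0, mirroring the FINSentOnShutdownWrWithUnreadData test.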
+ + // Even though we did a SHUT_RD on the read end we can still read the data. + ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); +} + +TEST_P(TCPSocketPairTest, ClosedReadNonBlockingSocket) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the read end to O_NONBLOCK. + int opts = 0; + ASSERT_THAT(opts = fcntl(sockets->second_fd(), F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(sockets->second_fd(), F_SETFL, opts | O_NONBLOCK), + SyscallSucceeds()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until second_fd sees the data and then recv it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0}; + constexpr int kPollTimeoutMs = 2000; // Wait up to 2 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); + + // Now shutdown the write end leaving the read end open. + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + + // Wait for close notification and recv again. + struct pollfd poll_fd2 = {sockets->second_fd(), POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0), + SyscallSucceedsWithValue(0)); +} + +TEST_P(TCPSocketPairTest, + ShutdownRdUnreadDataShouldCauseNoPacketsUnlessClosed) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until t_ sees the data on its side but don't read it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0}; + constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + // Now shutdown the read end, this will generate no packets to the other end. + ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RD), SyscallSucceeds()); + + // We should not receive any events on the other side of the socket. + struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0}; + constexpr int kPollNoResponseTimeoutMs = 3000; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollNoResponseTimeoutMs), + SyscallSucceedsWithValue(0)); // Timeout. + + // Now since we've fully closed the connection it will generate a RST. + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); // The other end has closed. + + // A shutdown with unread data will cause a RST to be sent instead + // of a FIN, per RFC 2525 section 2.17; this is also what Linux does. 
+ ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)), + SyscallFailsWithErrno(ECONNRESET)); +} + +TEST_P(TCPSocketPairTest, TCPCorkDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 0); +} + +TEST_P(TCPSocketPairTest, SetTCPCork) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + EXPECT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); +} + +TEST_P(TCPSocketPairTest, TCPCork) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + constexpr char kData[] = "abc"; + ASSERT_THAT(WriteFd(sockets->first_fd(), kData, sizeof(kData)), + SyscallSucceedsWithValue(sizeof(kData))); + + ASSERT_NO_FATAL_FAILURE(RecvNoData(sockets->second_fd())); + + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Create a receive buffer larger than kData. 
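For context on the TCP_CORK tests, a small hypothetical helper shows the usual corking pattern: queue several small writes while corked, then clear the option so the kernel flushes them as few segments as possible. The helper names are illustrative and the descriptor is assumed to be a connected TCP socket:

#include <cstddef>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

// Toggle TCP_CORK on a connected TCP socket.
int SetCork(int fd, bool enabled) {
  int val = enabled ? 1 : 0;
  return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &val, sizeof(val));
}

// Example use: send a header and body together where possible. While the
// socket is corked the peer sees no data, which is what RecvNoData checks
// above; clearing TCP_CORK flushes the queued bytes.
void SendCoalesced(int fd, const void* hdr, size_t hlen,
                   const void* body, size_t blen) {
  SetCork(fd, true);
  send(fd, hdr, hlen, 0);
  send(fd, body, blen, 0);
  SetCork(fd, false);
}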
+ char buf[(sizeof(kData) + 1) * 2] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(kData))); + EXPECT_EQ(absl::string_view(kData, sizeof(kData)), + absl::string_view(buf, sizeof(kData))); +} + +TEST_P(TCPSocketPairTest, TCPQuickAckDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, &get, + &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); +} + +TEST_P(TCPSocketPairTest, SetTCPQuickAck) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, &get, + &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, &get, + &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_generic.h b/test/syscalls/linux/socket_ip_tcp_generic.h new file mode 100644 index 000000000..f38500d14 --- /dev/null +++ b/test/syscalls/linux/socket_ip_tcp_generic.h @@ -0,0 +1,29 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected TCP sockets. +using TCPSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_ diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc new file mode 100644 index 000000000..9e10dea30 --- /dev/null +++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc @@ -0,0 +1,47 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_ip_tcp_generic.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return ApplyVecToVec( + std::vector{ + NoOp, SetSockOpt(IPPROTO_TCP, TCP_NODELAY, &kSockOptOn)}, + VecCat( + ApplyVec( + IPv6TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + IPv4TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + DualStackTCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, TCPSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc new file mode 100644 index 000000000..f95061506 --- /dev/null +++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc @@ -0,0 +1,43 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_generic.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + IPv6TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + IPv4TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + DualStackTCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc new file mode 100644 index 000000000..bb419e3a8 --- /dev/null +++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc @@ -0,0 +1,44 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
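Each of these loopback files follows the same value-parameterized pattern: a GetSocketPairs() factory feeds INSTANTIATE_TEST_CASE_P, so every socket-pair variant runs every TEST_P in the fixture. A tiny self-contained sketch of that gtest mechanism, using a hypothetical fixture and plain int parameters rather than socket pairs:

#include "gtest/gtest.h"

// Hypothetical fixture, only to illustrate the TEST_P / INSTANTIATE pattern
// these files rely on: each parameter value becomes one test instantiation.
class SmallParamTest : public ::testing::TestWithParam<int> {};

TEST_P(SmallParamTest, IsNonNegative) { EXPECT_GE(GetParam(), 0); }

INSTANTIATE_TEST_CASE_P(ExampleValues, SmallParamTest,
                        ::testing::Values(0, 1, 2));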
+ +#include +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_stream_blocking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return ApplyVecToVec( + std::vector{ + NoOp, SetSockOpt(IPPROTO_TCP, TCP_NODELAY, &kSockOptOn)}, + VecCat( + ApplyVec( + IPv6TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + IPv4TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, BlockingStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc new file mode 100644 index 000000000..af6fd635e --- /dev/null +++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc @@ -0,0 +1,46 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_non_blocking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return ApplyVecToVec( + std::vector{ + NoOp, SetSockOpt(IPPROTO_TCP, TCP_NODELAY, &kSockOptOn)}, + VecCat( + ApplyVec( + IPv6TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + IPv4TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonBlockingSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc new file mode 100644 index 000000000..91d029985 --- /dev/null +++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc @@ -0,0 +1,78 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of TCP and UDP sockets. +using TcpUdpSocketPairTest = SocketPairTest; + +TEST_P(TcpUdpSocketPairTest, ShutdownWrFollowedBySendIsError) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Now shutdown the write end of the first. + ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_WR), SyscallSucceeds()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0), + SyscallFailsWithErrno(EPIPE)); +} + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + IPv6UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})), + ApplyVec( + IPv4UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})), + ApplyVec( + DualStackUDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})), + ApplyVec( + IPv6TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})), + ApplyVec( + IPv4TCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})), + ApplyVec( + DualStackTCPAcceptBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK}))); +} + +INSTANTIATE_TEST_CASE_P( + AllTCPSockets, TcpUdpSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc new file mode 100644 index 000000000..8a98fa8df --- /dev/null +++ b/test/syscalls/linux/socket_ip_udp_loopback.cc @@ -0,0 +1,48 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
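The ShutdownWrFollowedBySendIsError test above boils down to: once the write side is shut down, further sends fail with EPIPE. A short sketch of that behavior; an AF_UNIX stream pair is used purely to keep the example self-contained, and MSG_NOSIGNAL makes the error observable instead of a fatal SIGPIPE:

#include <cerrno>
#include <cstdio>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  // A connected stream pair; the SHUT_WR-then-send behavior is the same for
  // the connected TCP and UDP sockets the test parameterizes over.
  int fds[2];
  socketpair(AF_UNIX, SOCK_STREAM, 0, fds);

  shutdown(fds[0], SHUT_WR);

  char buf[10] = {};
  ssize_t n = send(fds[0], buf, sizeof(buf), MSG_NOSIGNAL);
  if (n == -1 && errno == EPIPE) {
    printf("send after SHUT_WR failed with EPIPE\n");
  }

  close(fds[0]);
  close(fds[1]);
  return 0;
}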
+ +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_generic.h" +#include "test/syscalls/linux/socket_non_stream.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + IPv6UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + IPv4UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + DualStackUDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc new file mode 100644 index 000000000..08ff3e656 --- /dev/null +++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc @@ -0,0 +1,40 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_non_stream_blocking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + IPv6UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC})), + ApplyVec( + IPv4UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, BlockingNonStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc new file mode 100644 index 000000000..256bcfccf --- /dev/null +++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc @@ -0,0 +1,42 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_non_blocking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + IPv6UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + IPv4UDPBidirectionalBindSocketPair, + AllBitwiseCombinations(List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonBlockingSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc new file mode 100644 index 000000000..7bfb62a6f --- /dev/null +++ b/test/syscalls/linux/socket_netdevice.cc @@ -0,0 +1,182 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/base/internal/endian.h" +#include "test/syscalls/linux/socket_netlink_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +// Tests for netdevice queries. + +namespace gvisor { +namespace testing { + +namespace { + +using ::testing::AnyOf; +using ::testing::Eq; + +TEST(NetdeviceTest, Loopback) { + FileDescriptor sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + + // Prepare the request. + struct ifreq ifr; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + + // Check for a non-zero interface index. + ASSERT_THAT(ioctl(sock.get(), SIOCGIFINDEX, &ifr), SyscallSucceeds()); + EXPECT_NE(ifr.ifr_ifindex, 0); + + // Check that the loopback is zero hardware address. + ASSERT_THAT(ioctl(sock.get(), SIOCGIFHWADDR, &ifr), SyscallSucceeds()); + EXPECT_EQ(ifr.ifr_hwaddr.sa_data[0], 0); + EXPECT_EQ(ifr.ifr_hwaddr.sa_data[1], 0); + EXPECT_EQ(ifr.ifr_hwaddr.sa_data[2], 0); + EXPECT_EQ(ifr.ifr_hwaddr.sa_data[3], 0); + EXPECT_EQ(ifr.ifr_hwaddr.sa_data[4], 0); + EXPECT_EQ(ifr.ifr_hwaddr.sa_data[5], 0); +} + +TEST(NetdeviceTest, Netmask) { + // We need an interface index to identify the loopback device. + FileDescriptor sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + struct ifreq ifr; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + ASSERT_THAT(ioctl(sock.get(), SIOCGIFINDEX, &ifr), SyscallSucceeds()); + EXPECT_NE(ifr.ifr_ifindex, 0); + + // Use a netlink socket to get the netmask, which we'll then compare to the + // netmask obtained via ioctl. 
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket()); + uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get())); + + struct request { + struct nlmsghdr hdr; + struct rtgenmsg rgm; + }; + + constexpr uint32_t kSeq = 12345; + + struct request req; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETADDR; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = kSeq; + req.rgm.rtgen_family = AF_UNSPEC; + + // Iterate through messages until we find the one containing the prefix length + // (i.e. netmask) for the loopback device. + int prefixlen = -1; + ASSERT_NO_ERRNO(NetlinkRequestResponse( + fd, &req, sizeof(req), [&](const struct nlmsghdr *hdr) { + EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWADDR), Eq(NLMSG_DONE))); + + EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI) + << std::hex << hdr->nlmsg_flags; + + EXPECT_EQ(hdr->nlmsg_seq, kSeq); + EXPECT_EQ(hdr->nlmsg_pid, port); + + if (hdr->nlmsg_type != RTM_NEWADDR) { + return; + } + + // RTM_NEWADDR contains at least the header and ifaddrmsg. + EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg)); + + struct ifaddrmsg *ifaddrmsg = + reinterpret_cast(NLMSG_DATA(hdr)); + if (ifaddrmsg->ifa_index == static_cast(ifr.ifr_ifindex) && + ifaddrmsg->ifa_family == AF_INET) { + prefixlen = ifaddrmsg->ifa_prefixlen; + } + })); + + ASSERT_GE(prefixlen, 0); + + // Netmask is stored big endian in struct sockaddr_in, so we do the same for + // comparison. + uint32_t mask = 0xffffffff << (32 - prefixlen); + mask = absl::gbswap_32(mask); + + // Check that the loopback interface has the correct subnet mask. + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + ASSERT_THAT(ioctl(sock.get(), SIOCGIFNETMASK, &ifr), SyscallSucceeds()); + EXPECT_EQ(ifr.ifr_netmask.sa_family, AF_INET); + struct sockaddr_in *sin = + reinterpret_cast(&ifr.ifr_netmask); + EXPECT_EQ(sin->sin_addr.s_addr, mask); +} + +TEST(NetdeviceTest, InterfaceName) { + FileDescriptor sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + + // Prepare the request. + struct ifreq ifr; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + + // Check for a non-zero interface index. + ASSERT_THAT(ioctl(sock.get(), SIOCGIFINDEX, &ifr), SyscallSucceeds()); + EXPECT_NE(ifr.ifr_ifindex, 0); + + // Check that SIOCGIFNAME finds the loopback interface. + snprintf(ifr.ifr_name, IFNAMSIZ, "foo"); + ASSERT_THAT(ioctl(sock.get(), SIOCGIFNAME, &ifr), SyscallSucceeds()); + EXPECT_STREQ(ifr.ifr_name, "lo"); +} + +TEST(NetdeviceTest, InterfaceFlags) { + FileDescriptor sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + + // Prepare the request. + struct ifreq ifr; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + + // Check that SIOCGIFFLAGS marks the interface with IFF_LOOPBACK, IFF_UP, and + // IFF_RUNNING. + ASSERT_THAT(ioctl(sock.get(), SIOCGIFFLAGS, &ifr), SyscallSucceeds()); + EXPECT_EQ(ifr.ifr_flags & IFF_UP, IFF_UP); + EXPECT_EQ(ifr.ifr_flags & IFF_RUNNING, IFF_RUNNING); +} + +TEST(NetdeviceTest, InterfaceMTU) { + FileDescriptor sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + + // Prepare the request. + struct ifreq ifr = {}; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + + // Check that SIOCGIFMTU returns a nonzero MTU. 
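The SIOCGIF* queries above all follow the same shape: fill in ifr_name on a throwaway AF_INET datagram socket and let each ioctl fill in one member of the ifreq union. A compact standalone sketch, illustrative only and with minimal error handling:

#include <cstdio>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  // Any AF_INET datagram socket works as a handle for interface ioctls.
  int sock = socket(AF_INET, SOCK_DGRAM, 0);

  ifreq ifr{};
  snprintf(ifr.ifr_name, IFNAMSIZ, "lo");

  if (ioctl(sock, SIOCGIFINDEX, &ifr) == 0)
    printf("lo index: %d\n", ifr.ifr_ifindex);

  if (ioctl(sock, SIOCGIFMTU, &ifr) == 0)
    printf("lo mtu: %d\n", ifr.ifr_mtu);

  if (ioctl(sock, SIOCGIFFLAGS, &ifr) == 0)
    printf("lo is %s\n", (ifr.ifr_flags & IFF_UP) ? "up" : "down");

  close(sock);
  return 0;
}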
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFMTU, &ifr), SyscallSucceeds()); + EXPECT_GT(ifr.ifr_mtu, 0); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc new file mode 100644 index 000000000..9fc695460 --- /dev/null +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -0,0 +1,314 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_netlink_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +// Tests for NETLINK_ROUTE sockets. + +namespace gvisor { +namespace testing { + +namespace { + +using ::testing::AnyOf; +using ::testing::Eq; + +// Netlink sockets must be SOCK_DGRAM or SOCK_RAW. +TEST(NetlinkRouteTest, Types) { + EXPECT_THAT(socket(AF_NETLINK, SOCK_STREAM, NETLINK_ROUTE), + SyscallFailsWithErrno(ESOCKTNOSUPPORT)); + EXPECT_THAT(socket(AF_NETLINK, SOCK_SEQPACKET, NETLINK_ROUTE), + SyscallFailsWithErrno(ESOCKTNOSUPPORT)); + EXPECT_THAT(socket(AF_NETLINK, SOCK_RDM, NETLINK_ROUTE), + SyscallFailsWithErrno(ESOCKTNOSUPPORT)); + EXPECT_THAT(socket(AF_NETLINK, SOCK_DCCP, NETLINK_ROUTE), + SyscallFailsWithErrno(ESOCKTNOSUPPORT)); + EXPECT_THAT(socket(AF_NETLINK, SOCK_PACKET, NETLINK_ROUTE), + SyscallFailsWithErrno(ESOCKTNOSUPPORT)); + + int fd; + EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(NetlinkRouteTest, AutomaticPort) { + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)); + + struct sockaddr_nl addr = {}; + addr.nl_family = AF_NETLINK; + + EXPECT_THAT( + bind(fd.get(), reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); + + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(fd.get(), reinterpret_cast(&addr), + &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, sizeof(addr)); + // This is the only netlink socket in the process, so it should get the PID as + // the port id. + // + // N.B. Another process could theoretically have explicitly reserved our pid + // as a port ID, but that is very unlikely. + EXPECT_EQ(addr.nl_pid, getpid()); +} + +// Calling connect automatically binds to an automatic port. 
+TEST(NetlinkRouteTest, ConnectBinds) { + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)); + + struct sockaddr_nl addr = {}; + addr.nl_family = AF_NETLINK; + + EXPECT_THAT(connect(fd.get(), reinterpret_cast(&addr), + sizeof(addr)), + SyscallSucceeds()); + + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(fd.get(), reinterpret_cast(&addr), + &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, sizeof(addr)); + // This is the only netlink socket in the process, so it should get the PID as + // the port id. + // + // N.B. Another process could theoretically have explicitly reserved our pid + // as a port ID, but that is very unlikely. + EXPECT_EQ(addr.nl_pid, getpid()); + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + + // Connecting again is allowed, but keeps the same port. + EXPECT_THAT(connect(fd.get(), reinterpret_cast(&addr), + sizeof(addr)), + SyscallSucceeds()); + + addrlen = sizeof(addr); + EXPECT_THAT(getsockname(fd.get(), reinterpret_cast(&addr), + &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, sizeof(addr)); + EXPECT_EQ(addr.nl_pid, getpid()); +} + +TEST(NetlinkRouteTest, GetPeerName) { + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)); + + struct sockaddr_nl addr = {}; + socklen_t addrlen = sizeof(addr); + + EXPECT_THAT(getpeername(fd.get(), reinterpret_cast(&addr), + &addrlen), + SyscallSucceeds()); + + EXPECT_EQ(addrlen, sizeof(addr)); + EXPECT_EQ(addr.nl_family, AF_NETLINK); + // Peer is the kernel if we didn't connect elsewhere. + EXPECT_EQ(addr.nl_pid, 0); +} + +using IntSockOptTest = ::testing::TestWithParam; + +TEST_P(IntSockOptTest, GetSockOpt) { + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)); + + int res; + socklen_t len = sizeof(res); + + EXPECT_THAT(getsockopt(fd.get(), SOL_SOCKET, GetParam(), &res, &len), + SyscallSucceeds()); + + EXPECT_EQ(len, sizeof(res)); + EXPECT_GT(res, 0); +} + +INSTANTIATE_TEST_CASE_P(NetlinkRouteTest, IntSockOptTest, + ::testing::Values(SO_SNDBUF, SO_RCVBUF)); + +// Validates the reponses to RTM_GETLINK + NLM_F_DUMP. +void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) { + EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWLINK), Eq(NLMSG_DONE))); + + EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI) + << std::hex << hdr->nlmsg_flags; + + EXPECT_EQ(hdr->nlmsg_seq, seq); + EXPECT_EQ(hdr->nlmsg_pid, port); + + if (hdr->nlmsg_type != RTM_NEWLINK) { + return; + } + + // RTM_NEWLINK contains at least the header and ifinfomsg. + EXPECT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg))); + + // TODO: Check ifinfomsg contents and following attrs. +} + +TEST(NetlinkRouteTest, GetLinkDump) { + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket()); + uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get())); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; + + constexpr uint32_t kSeq = 12345; + + struct request req = {}; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + + // Loopback is common among all tests, check that it's found. 
+ bool loopbackFound = false; + ASSERT_NO_ERRNO(NetlinkRequestResponse( + fd, &req, sizeof(req), [&](const struct nlmsghdr* hdr) { + CheckGetLinkResponse(hdr, kSeq, port); + if (hdr->nlmsg_type != RTM_NEWLINK) { + return; + } + ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg))); + const struct ifinfomsg* msg = + reinterpret_cast(NLMSG_DATA(hdr)); + LOG(INFO) << "Found interface idx=" << msg->ifi_index + << ", type=" << std::hex << msg->ifi_type; + if (msg->ifi_type == ARPHRD_LOOPBACK) { + loopbackFound = true; + EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0); + } + })); + EXPECT_TRUE(loopbackFound); +} + +TEST(NetlinkRouteTest, ControlMessageIgnored) { + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket()); + uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get())); + + struct request { + struct nlmsghdr control_hdr; + struct nlmsghdr message_hdr; + struct ifinfomsg ifm; + }; + + constexpr uint32_t kSeq = 12345; + + struct request req = {}; + + // This control message is ignored. We still receive a response for the + // following RTM_GETLINK. + req.control_hdr.nlmsg_len = sizeof(req.control_hdr); + req.control_hdr.nlmsg_type = NLMSG_DONE; + req.control_hdr.nlmsg_seq = kSeq; + + req.message_hdr.nlmsg_len = sizeof(req.message_hdr) + sizeof(req.ifm); + req.message_hdr.nlmsg_type = RTM_GETLINK; + req.message_hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.message_hdr.nlmsg_seq = kSeq; + + req.ifm.ifi_family = AF_UNSPEC; + + ASSERT_NO_ERRNO(NetlinkRequestResponse( + fd, &req, sizeof(req), [&](const struct nlmsghdr* hdr) { + CheckGetLinkResponse(hdr, kSeq, port); + })); +} + +TEST(NetlinkRouteTest, GetAddrDump) { + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket()); + uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get())); + + struct request { + struct nlmsghdr hdr; + struct rtgenmsg rgm; + }; + + constexpr uint32_t kSeq = 12345; + + struct request req; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETADDR; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = kSeq; + req.rgm.rtgen_family = AF_UNSPEC; + + ASSERT_NO_ERRNO(NetlinkRequestResponse( + fd, &req, sizeof(req), [&](const struct nlmsghdr* hdr) { + EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWADDR), Eq(NLMSG_DONE))); + + EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI) + << std::hex << hdr->nlmsg_flags; + + EXPECT_EQ(hdr->nlmsg_seq, kSeq); + EXPECT_EQ(hdr->nlmsg_pid, port); + + if (hdr->nlmsg_type != RTM_NEWADDR) { + return; + } + + // RTM_NEWADDR contains at least the header and ifaddrmsg. + EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg)); + + // TODO: Check ifaddrmsg contents and following attrs. + })); +} + +TEST(NetlinkRouteTest, LookupAll) { + struct ifaddrs* if_addr_list = nullptr; + auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); }); + + // Not a syscall but we can use the syscall matcher as glibc sets errno. 
+ ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds()); + + int count = 0; + for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) { + if (!i->ifa_addr || (i->ifa_addr->sa_family != AF_INET && + i->ifa_addr->sa_family != AF_INET6)) { + continue; + } + count++; + } + ASSERT_GT(count, 0); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc new file mode 100644 index 000000000..ee0e03966 --- /dev/null +++ b/test/syscalls/linux/socket_netlink_util.cc @@ -0,0 +1,100 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include + +#include "absl/strings/str_cat.h" +#include "test/syscalls/linux/socket_netlink_util.h" +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +PosixErrorOr NetlinkBoundSocket() { + FileDescriptor fd; + ASSIGN_OR_RETURN_ERRNO(fd, Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)); + + struct sockaddr_nl addr = {}; + addr.nl_family = AF_NETLINK; + + RETURN_ERROR_IF_SYSCALL_FAIL( + bind(fd.get(), reinterpret_cast(&addr), sizeof(addr))); + MaybeSave(); + + return std::move(fd); +} + +PosixErrorOr NetlinkPortID(int fd) { + struct sockaddr_nl addr; + socklen_t addrlen = sizeof(addr); + + RETURN_ERROR_IF_SYSCALL_FAIL( + getsockname(fd, reinterpret_cast(&addr), &addrlen)); + MaybeSave(); + + return static_cast(addr.nl_pid); +} + +PosixError NetlinkRequestResponse( + const FileDescriptor& fd, void* request, size_t len, + const std::function& fn) { + struct iovec iov = {}; + iov.iov_base = request; + iov.iov_len = len; + + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + // No destination required; it defaults to pid 0, the kernel. + + RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(sendmsg)(fd.get(), &msg, 0)); + + constexpr size_t kBufferSize = 4096; + std::vector buf(kBufferSize); + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + // Response is a series of NLM_F_MULTI messages, ending with a NLMSG_DONE + // message. + int type = -1; + do { + int len; + RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0)); + + // We don't bother with the complexity of dealing with truncated messages. + // We must allocate a large enough buffer up front. 
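// Editor's note (descriptive comment, not part of the patch): after recvmsg()
// returns, the kernel sets MSG_TRUNC in msg.msg_flags if the datagram did not
// fit into the supplied iovec, so the check below catches any response larger
// than the 4096-byte buffer allocated above and fails loudly instead of
// parsing a partial message.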
+ if ((msg.msg_flags & MSG_TRUNC) == MSG_TRUNC) { + return PosixError(EIO, + absl::StrCat("Received truncated message with flags: ", + msg.msg_flags)); + } + + for (struct nlmsghdr* hdr = reinterpret_cast(buf.data()); + NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) { + fn(hdr); + type = hdr->nlmsg_type; + } + } while (type != NLMSG_DONE && type != NLMSG_ERROR); + + EXPECT_EQ(type, NLMSG_DONE); + return NoError(); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h new file mode 100644 index 000000000..44b1f148c --- /dev/null +++ b/test/syscalls/linux/socket_netlink_util.h @@ -0,0 +1,42 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_ +#define GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_ + +#include +#include +#include + +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +// Returns a bound NETLINK_ROUTE socket. +PosixErrorOr NetlinkBoundSocket(); + +// Returns the port ID of the passed socket. +PosixErrorOr NetlinkPortID(int fd); + +// Send the passed request and call fn will all response netlink messages. +PosixError NetlinkRequestResponse( + const FileDescriptor& fd, void* request, size_t len, + const std::function& fn); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_ diff --git a/test/syscalls/linux/socket_non_blocking.cc b/test/syscalls/linux/socket_non_blocking.cc new file mode 100644 index 000000000..1bcc6fb7f --- /dev/null +++ b/test/syscalls/linux/socket_non_blocking.cc @@ -0,0 +1,63 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
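// Editor's note: illustrative sketch for this review, not part of the patch.
// The tests in this file exercise the non-blocking receive path, where reading
// from an empty socket must fail immediately with EAGAIN/EWOULDBLOCK rather
// than block. The same behavior with plain syscalls, assuming a connected pair
// created with SOCK_NONBLOCK:
//
//   int fds[2];
//   socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, fds);
//   char c;
//   ssize_t n = read(fds[0], &c, 1);  // Nothing has been written yet.
//   // n == -1 and errno is EAGAIN (a.k.a. EWOULDBLOCK) instead of blocking.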
+ +#include "test/syscalls/linux/socket_non_blocking.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +TEST_P(NonBlockingSocketPairTest, ReadNothingAvailable) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char buf[20] = {}; + ASSERT_THAT(ReadFd(sockets->first_fd(), buf, sizeof(buf)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(NonBlockingSocketPairTest, RecvNothingAvailable) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char buf[20] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(NonBlockingSocketPairTest, RecvMsgNothingAvailable) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct iovec iov; + char buf[20] = {}; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0), + SyscallFailsWithErrno(EAGAIN)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_non_blocking.h b/test/syscalls/linux/socket_non_blocking.h new file mode 100644 index 000000000..287e096bb --- /dev/null +++ b/test/syscalls/linux/socket_non_blocking.h @@ -0,0 +1,29 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected non-blocking sockets. +using NonBlockingSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_ diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc new file mode 100644 index 000000000..d49aab363 --- /dev/null +++ b/test/syscalls/linux/socket_non_stream.cc @@ -0,0 +1,174 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
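// Editor's note: illustrative sketch for this review, not part of the patch.
// The fixtures in this file cover datagram-style sockets, which preserve
// message boundaries: each send() produces exactly one message, a single
// recv() never returns bytes from more than one of them, and any portion that
// does not fit in the receive buffer is discarded (absent MSG_PEEK/MSG_TRUNC).
// In plain syscalls, for an already connected datagram pair:
//
//   char a[20], b[20], buf[40];
//   send(fd1, a, sizeof(a), 0);
//   send(fd1, b, sizeof(b), 0);
//   ssize_t n = recv(fd2, buf, sizeof(buf), 0);
//   // n == 20: only the first message is returned even though both would fit.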
+ +#include "test/syscalls/linux/socket_non_stream.h" + +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(NonStreamSocketPairTest, SendMsgTooLarge) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int sndbuf; + socklen_t length = sizeof(sndbuf); + ASSERT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &sndbuf, &length), + SyscallSucceeds()); + + // Make the call too large to fit in the send buffer. + const int buffer_size = 3 * sndbuf; + + EXPECT_THAT(SendLargeSendMsg(sockets, buffer_size, false /* reader */), + SyscallFailsWithErrno(EMSGSIZE)); +} + +// Stream sockets allow data sent with a single (e.g. write, sendmsg) syscall +// to be read in pieces with multiple (e.g. read, recvmsg) syscalls. +// +// SplitRecv checks that control messages can only be read on the first (e.g. +// read, recvmsg) syscall, even if it doesn't provide space for the control +// message. +TEST_P(NonStreamSocketPairTest, SplitRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data) / 2]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data))); + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +// Stream sockets allow data sent with multiple sends to be read in a single +// recv. Datagram sockets do not. +// +// SingleRecv checks that only a single message is readable in a single recv. +TEST_P(NonStreamSocketPairTest, SingleRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data1, sizeof(sent_data1), 0), + SyscallSucceedsWithValue(sizeof(sent_data1))); + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data2, sizeof(sent_data2), 0), + SyscallSucceedsWithValue(sizeof(sent_data2))); + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); +} + +// Stream sockets allow data sent with multiple sends to be peeked at in a +// single recv. Datagram sockets (except for unix sockets) do not. +// +// SinglePeek checks that only a single message is peekable in a single recv. 
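// Editor's note: hypothetical helper added for this review as an illustration,
// not part of the patch. MSG_PEEK copies data out of the receive queue without
// consuming it, so repeated peeks on a datagram socket keep observing the same
// first message until an ordinary recv() dequeues it, which is the behavior
// SinglePeek below verifies.
static void PeekSameDatagramTwice(int fd) {
  char buf[20] = {};
  ssize_t first = recv(fd, buf, sizeof(buf), MSG_PEEK);   // Still queued.
  ssize_t second = recv(fd, buf, sizeof(buf), MSG_PEEK);  // Same bytes again.
  ssize_t consumed = recv(fd, buf, sizeof(buf), 0);       // Now dequeued.
  (void)first;
  (void)second;
  (void)consumed;
}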
+TEST_P(NonStreamSocketPairTest, SinglePeek) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data1, sizeof(sent_data1), 0), + SyscallSucceedsWithValue(sizeof(sent_data1))); + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data2, sizeof(sent_data2), 0), + SyscallSucceedsWithValue(sizeof(sent_data2))); + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + for (int i = 0; i < 3; i++) { + memset(received_data, 0, sizeof(received_data)); + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_PEEK), + SyscallSucceedsWithValue(sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + } + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(sent_data1), 0), + SyscallSucceedsWithValue(sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(sent_data2), 0), + SyscallSucceedsWithValue(sizeof(sent_data2))); + EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(sent_data2))); +} + +TEST_P(NonStreamSocketPairTest, MsgTruncTruncation) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data) / 2, MSG_TRUNC), + SyscallSucceedsWithValue(sizeof(sent_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2)); + + // Check that we didn't get any extra data. 
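// Editor's note (descriptive comment, not part of the patch): the MSG_TRUNC
// recv above reports the full 512-byte datagram length even though only the
// first half was copied into received_data. The second half of received_data
// was zero-initialized and never written, so it should still differ from the
// corresponding randomized half of sent_data, which is what the comparison
// below asserts.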
+ EXPECT_NE(0, memcmp(sent_data + sizeof(sent_data) / 2, + received_data + sizeof(received_data) / 2, + sizeof(sent_data) / 2)); +} + +TEST_P(NonStreamSocketPairTest, MsgTruncSameSize) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_TRUNC), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(NonStreamSocketPairTest, MsgTruncNotFull) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[2 * sizeof(sent_data)]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_TRUNC), + SyscallSucceedsWithValue(sizeof(sent_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_non_stream.h b/test/syscalls/linux/socket_non_stream.h new file mode 100644 index 000000000..02dd2a958 --- /dev/null +++ b/test/syscalls/linux/socket_non_stream.h @@ -0,0 +1,29 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected non-stream sockets. +using NonStreamSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_ diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc new file mode 100644 index 000000000..d64b181c9 --- /dev/null +++ b/test/syscalls/linux/socket_non_stream_blocking.cc @@ -0,0 +1,51 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
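// Editor's note: illustrative sketch for this review, not part of the patch.
// MSG_WAITALL asks a blocking recv() to wait until the whole buffer is filled,
// but on datagram-style sockets a single message still completes the call, so
// a receive into a buffer twice the datagram size returns after the first
// datagram instead of blocking for a second one:
//
//   char sent[100], rcvd[200];
//   send(fd1, sent, sizeof(sent), 0);
//   ssize_t n = recv(fd2, rcvd, sizeof(rcvd), MSG_WAITALL);
//   // n == 100, not 200, despite MSG_WAITALL.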
+ +#include "test/syscalls/linux/socket_non_stream_blocking.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(BlockingNonStreamSocketPairTest, RecvLessThanBufferWaitAll) { + SKIP_IF(IsRunningOnGvisor()); // FIXME: Support MSG_WAITALL. + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[100]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data) * 2] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_WAITALL), + SyscallSucceedsWithValue(sizeof(sent_data))); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_non_stream_blocking.h b/test/syscalls/linux/socket_non_stream_blocking.h new file mode 100644 index 000000000..bde355452 --- /dev/null +++ b/test/syscalls/linux/socket_non_stream_blocking.h @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of blocking connected non-stream +// sockets. +using BlockingNonStreamSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_ diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc new file mode 100644 index 000000000..32e9d958b --- /dev/null +++ b/test/syscalls/linux/socket_stream.cc @@ -0,0 +1,99 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
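// Editor's note: illustrative sketch for this review, not part of the patch.
// WriteOneSideClosed below expects a plain write() to a peer-closed stream
// socket to fail with EPIPE. On Linux such a write also raises SIGPIPE, whose
// default action terminates the process, so the test environment is assumed to
// ignore SIGPIPE; a standalone program would either install SIG_IGN or use
// send() with MSG_NOSIGNAL:
//
//   signal(SIGPIPE, SIG_IGN);
//   // or, per call:
//   send(fd, "abc", 3, MSG_NOSIGNAL);  // Fails with EPIPE, no signal raised.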
+ +#include "test/syscalls/linux/socket_stream.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(StreamSocketPairTest, SplitRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data) / 2]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data))); + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data + sizeof(received_data), received_data, + sizeof(received_data))); +} + +// Stream sockets allow data sent with multiple sends to be read in a single +// recv. +// +// CoalescedRecv checks that multiple messages are readable in a single recv. +TEST_P(StreamSocketPairTest, CoalescedRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data1, sizeof(sent_data1), 0), + SyscallSucceedsWithValue(sizeof(sent_data1))); + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data2, sizeof(sent_data2), 0), + SyscallSucceedsWithValue(sizeof(sent_data2))); + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1), + sizeof(sent_data2))); +} + +TEST_P(StreamSocketPairTest, WriteOneSideClosed) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + const char str[] = "abc"; + ASSERT_THAT(write(sockets->second_fd(), str, 3), + SyscallFailsWithErrno(EPIPE)); +} + +TEST_P(StreamSocketPairTest, MsgTrunc) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)]; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data) / 2, MSG_TRUNC), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_stream.h b/test/syscalls/linux/socket_stream.h new file mode 100644 index 000000000..35e591e17 --- /dev/null +++ b/test/syscalls/linux/socket_stream.h @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of blocking and non-blocking +// connected stream sockets. +using StreamSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_ diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc new file mode 100644 index 000000000..dd209c67c --- /dev/null +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -0,0 +1,131 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_stream_blocking.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(BlockingStreamSocketPairTest, BlockPartialWriteClosed) { + // FIXME: gVisor doesn't support SO_SNDBUF on UDS, nor does it + // enforce any limit; it will write arbitrary amounts of data without + // blocking. + SKIP_IF(IsRunningOnGvisor()); + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int buffer_size; + socklen_t length = sizeof(buffer_size); + ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, + &buffer_size, &length), + SyscallSucceeds()); + + int wfd = sockets->first_fd(); + ScopedThread t([wfd, buffer_size]() { + std::vector buf(2 * buffer_size); + // Write more than fits in the buffer. Blocks then returns partial write + // when the other end is closed. The next call returns EPIPE. + // + // N.B. writes occur in chunks, so we may see less than buffer_size from + // the first call. + ASSERT_THAT(write(wfd, buf.data(), buf.size()), + SyscallSucceedsWithValue(::testing::Gt(0))); + ASSERT_THAT(write(wfd, buf.data(), buf.size()), + ::testing::AnyOf(SyscallFailsWithErrno(EPIPE), + SyscallFailsWithErrno(ECONNRESET))); + }); + + // Leave time for write to become blocked. 
+ absl::SleepFor(absl::Seconds(1.0)); + + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); +} + +TEST_P(BlockingStreamSocketPairTest, SendMsgTooLarge) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int sndbuf; + socklen_t length = sizeof(sndbuf); + ASSERT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &sndbuf, &length), + SyscallSucceeds()); + + // Make the call too large to fit in the send buffer. + const int buffer_size = 3 * sndbuf; + + EXPECT_THAT(SendLargeSendMsg(sockets, buffer_size, true /* reader */), + SyscallSucceedsWithValue(buffer_size)); +} + +TEST_P(BlockingStreamSocketPairTest, RecvLessThanBuffer) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[100]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[200] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); +} + +TEST_P(BlockingStreamSocketPairTest, RecvLessThanBufferWaitAll) { + SKIP_IF(IsRunningOnGvisor()); // FIXME: Support MSG_WAITALL. + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[100]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + constexpr auto kDuration = absl::Milliseconds(200); + auto before = Now(CLOCK_MONOTONIC); + + const ScopedThread t([&]() { + absl::SleepFor(kDuration); + ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + }); + + char received_data[sizeof(sent_data) * 2] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_WAITALL), + SyscallSucceedsWithValue(sizeof(received_data))); + + auto after = Now(CLOCK_MONOTONIC); + EXPECT_GE(after - before, kDuration); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_stream_blocking.h b/test/syscalls/linux/socket_stream_blocking.h new file mode 100644 index 000000000..06113ad03 --- /dev/null +++ b/test/syscalls/linux/socket_stream_blocking.h @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of blocking connected stream +// sockets. 
+using BlockingStreamSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_ diff --git a/test/syscalls/linux/socket_stream_nonblock.cc b/test/syscalls/linux/socket_stream_nonblock.cc new file mode 100644 index 000000000..a3202ffe4 --- /dev/null +++ b/test/syscalls/linux/socket_stream_nonblock.cc @@ -0,0 +1,50 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_stream_nonblock.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +using ::testing::Le; + +TEST_P(NonBlockingStreamSocketPairTest, SendMsgTooLarge) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int sndbuf; + socklen_t length = sizeof(sndbuf); + ASSERT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &sndbuf, &length), + SyscallSucceeds()); + + // Make the call too large to fit in the send buffer. + const int buffer_size = 3 * sndbuf; + + EXPECT_THAT(SendLargeSendMsg(sockets, buffer_size, false /* reader */), + SyscallSucceedsWithValue(Le(buffer_size))); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_stream_nonblock.h b/test/syscalls/linux/socket_stream_nonblock.h new file mode 100644 index 000000000..491f53848 --- /dev/null +++ b/test/syscalls/linux/socket_stream_nonblock.h @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of non-blocking connected stream +// sockets. 
+using NonBlockingStreamSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_ diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc new file mode 100644 index 000000000..80a59df7e --- /dev/null +++ b/test/syscalls/linux/socket_test_util.cc @@ -0,0 +1,660 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_test_util.h" + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/time/clock.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +Creator SyscallSocketPairCreator(int domain, int type, + int protocol) { + return [=]() -> PosixErrorOr> { + int pair[2]; + RETURN_ERROR_IF_SYSCALL_FAIL(socketpair(domain, type, protocol, pair)); + MaybeSave(); // Save on successful creation. + return absl::make_unique(pair[0], pair[1]); + }; +} + +Creator SyscallSocketCreator(int domain, int type, + int protocol) { + return [=]() -> PosixErrorOr> { + int fd = 0; + RETURN_ERROR_IF_SYSCALL_FAIL(fd = socket(domain, type, protocol)); + MaybeSave(); // Save on successful creation. + return absl::make_unique(fd); + }; +} + +PosixErrorOr UniqueUnixAddr(bool abstract, int domain) { + struct sockaddr_un addr = {}; + std::string path = NewTempAbsPathInDir("/tmp"); + if (path.size() >= sizeof(addr.sun_path)) { + return PosixError(EINVAL, + "Unable to generate a temp path of appropriate length"); + } + + if (abstract) { + // Indicate that the path is in the abstract namespace. + path[0] = 0; + } + memcpy(addr.sun_path, path.c_str(), path.length()); + addr.sun_family = domain; + return addr; +} + +Creator AcceptBindSocketPairCreator(bool abstract, int domain, + int type, int protocol) { + return [=]() -> PosixErrorOr> { + ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un bind_addr, + UniqueUnixAddr(abstract, domain)); + ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un extra_addr, + UniqueUnixAddr(abstract, domain)); + + int bound; + RETURN_ERROR_IF_SYSCALL_FAIL(bound = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + RETURN_ERROR_IF_SYSCALL_FAIL( + bind(bound, reinterpret_cast(&bind_addr), + sizeof(bind_addr))); + MaybeSave(); // Successful bind. + RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5)); + MaybeSave(); // Successful listen. + + int connected; + RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + RETURN_ERROR_IF_SYSCALL_FAIL( + connect(connected, reinterpret_cast(&bind_addr), + sizeof(bind_addr))); + MaybeSave(); // Successful connect. + + int accepted; + RETURN_ERROR_IF_SYSCALL_FAIL( + accepted = accept4(bound, nullptr, nullptr, + type & (SOCK_NONBLOCK | SOCK_CLOEXEC))); + MaybeSave(); // Successful connect. 
+ + // Cleanup no longer needed resources. + RETURN_ERROR_IF_SYSCALL_FAIL(close(bound)); + MaybeSave(); // Dropped original socket. + + // Only unlink if path is not in abstract namespace. + if (bind_addr.sun_path[0] != 0) { + RETURN_ERROR_IF_SYSCALL_FAIL(unlink(bind_addr.sun_path)); + MaybeSave(); // Unlinked path. + } + + return absl::make_unique(connected, accepted, bind_addr, + extra_addr); + }; +} + +Creator FilesystemAcceptBindSocketPairCreator(int domain, int type, + int protocol) { + return AcceptBindSocketPairCreator(/* abstract= */ false, domain, type, + protocol); +} + +Creator AbstractAcceptBindSocketPairCreator(int domain, int type, + int protocol) { + return AcceptBindSocketPairCreator(/* abstract= */ true, domain, type, + protocol); +} + +Creator BidirectionalBindSocketPairCreator(bool abstract, + int domain, int type, + int protocol) { + return [=]() -> PosixErrorOr> { + ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr1, + UniqueUnixAddr(abstract, domain)); + ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr2, + UniqueUnixAddr(abstract, domain)); + + int sock1; + RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + RETURN_ERROR_IF_SYSCALL_FAIL( + bind(sock1, reinterpret_cast(&addr1), sizeof(addr1))); + MaybeSave(); // Successful bind. + + int sock2; + RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + RETURN_ERROR_IF_SYSCALL_FAIL( + bind(sock2, reinterpret_cast(&addr2), sizeof(addr2))); + MaybeSave(); // Successful bind. + + RETURN_ERROR_IF_SYSCALL_FAIL(connect( + sock1, reinterpret_cast(&addr2), sizeof(addr2))); + MaybeSave(); // Successful connect. + + RETURN_ERROR_IF_SYSCALL_FAIL(connect( + sock2, reinterpret_cast(&addr1), sizeof(addr1))); + MaybeSave(); // Successful connect. + + // Cleanup no longer needed resources. + + // Only unlink if path is not in abstract namespace. + if (addr1.sun_path[0] != 0) { + RETURN_ERROR_IF_SYSCALL_FAIL(unlink(addr1.sun_path)); + MaybeSave(); // Successful unlink. + } + + // Only unlink if path is not in abstract namespace. + if (addr2.sun_path[0] != 0) { + RETURN_ERROR_IF_SYSCALL_FAIL(unlink(addr2.sun_path)); + MaybeSave(); // Successful unlink. + } + + return absl::make_unique(sock1, sock2); + }; +} + +Creator FilesystemBidirectionalBindSocketPairCreator(int domain, + int type, + int protocol) { + return BidirectionalBindSocketPairCreator(/* abstract= */ false, domain, type, + protocol); +} + +Creator AbstractBidirectionalBindSocketPairCreator(int domain, + int type, + int protocol) { + return BidirectionalBindSocketPairCreator(/* abstract= */ true, domain, type, + protocol); +} + +Creator SocketpairGoferSocketPairCreator(int domain, int type, + int protocol) { + return [=]() -> PosixErrorOr> { + struct sockaddr_un addr = {}; + constexpr char kSocketGoferPath[] = "/socket"; + memcpy(addr.sun_path, kSocketGoferPath, sizeof(kSocketGoferPath)); + addr.sun_family = domain; + + int sock1; + RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + RETURN_ERROR_IF_SYSCALL_FAIL(connect( + sock1, reinterpret_cast(&addr), sizeof(addr))); + MaybeSave(); // Successful connect. + + int sock2; + RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + RETURN_ERROR_IF_SYSCALL_FAIL(connect( + sock2, reinterpret_cast(&addr), sizeof(addr))); + MaybeSave(); // Successful connect. 
+ + // Make and close another socketpair to ensure that the duped ends of the + // first socketpair get closed. + // + // The problem is that there is no way to atomically send and close an FD. + // The closest that we can do is send and then immediately close the FD, + // which is what we do in the gofer. The gofer won't respond to another + // request until the reply is sent and the FD is closed, so forcing the + // gofer to handle another request will ensure that this has happened. + for (int i = 0; i < 2; i++) { + int sock; + RETURN_ERROR_IF_SYSCALL_FAIL(sock = socket(domain, type, protocol)); + RETURN_ERROR_IF_SYSCALL_FAIL(connect( + sock, reinterpret_cast(&addr), sizeof(addr))); + RETURN_ERROR_IF_SYSCALL_FAIL(close(sock)); + } + + return absl::make_unique(sock1, sock2); + }; +} + +Creator SocketpairGoferFileSocketPairCreator(int flags) { + return [=]() -> PosixErrorOr> { + constexpr char kSocketGoferPath[] = "/socket"; + + int sock1; + RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = + open(kSocketGoferPath, O_RDWR | flags)); + MaybeSave(); // Successful socket creation. + + int sock2; + RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = + open(kSocketGoferPath, O_RDWR | flags)); + MaybeSave(); // Successful socket creation. + + return absl::make_unique(sock1, sock2); + }; +} + +Creator UnboundSocketPairCreator(bool abstract, int domain, + int type, int protocol) { + return [=]() -> PosixErrorOr> { + ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr1, + UniqueUnixAddr(abstract, domain)); + ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr2, + UniqueUnixAddr(abstract, domain)); + + int sock1; + RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + int sock2; + RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. 
+ return absl::make_unique(sock1, sock2, addr1, addr2); + }; +} + +Creator FilesystemUnboundSocketPairCreator(int domain, int type, + int protocol) { + return UnboundSocketPairCreator(/* abstract= */ false, domain, type, + protocol); +} + +Creator AbstractUnboundSocketPairCreator(int domain, int type, + int protocol) { + return UnboundSocketPairCreator(/* abstract= */ true, domain, type, protocol); +} + +void LocalhostAddr(struct sockaddr_in* addr, bool dual_stack) { + addr->sin_family = AF_INET; + addr->sin_port = htons(0); + inet_pton(AF_INET, "127.0.0.1", + reinterpret_cast(&addr->sin_addr.s_addr)); +} + +void LocalhostAddr(struct sockaddr_in6* addr, bool dual_stack) { + addr->sin6_family = AF_INET6; + addr->sin6_port = htons(0); + if (dual_stack) { + inet_pton(AF_INET6, "::ffff:127.0.0.1", + reinterpret_cast(&addr->sin6_addr.s6_addr)); + } else { + inet_pton(AF_INET6, "::1", + reinterpret_cast(&addr->sin6_addr.s6_addr)); + } + addr->sin6_scope_id = 0; +} + +template +PosixErrorOr BindIP(int fd, bool dual_stack) { + T addr = {}; + LocalhostAddr(&addr, dual_stack); + RETURN_ERROR_IF_SYSCALL_FAIL( + bind(fd, reinterpret_cast(&addr), sizeof(addr))); + socklen_t addrlen = sizeof(addr); + RETURN_ERROR_IF_SYSCALL_FAIL( + getsockname(fd, reinterpret_cast(&addr), &addrlen)); + return addr; +} + +template +PosixErrorOr> CreateTCPAcceptBindSocketPair( + int bound, int connected, int type, bool dual_stack) { + ASSIGN_OR_RETURN_ERRNO(T bind_addr, BindIP(bound, dual_stack)); + RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5)); + + int connect_result = 0; + RETURN_ERROR_IF_SYSCALL_FAIL( + (connect_result = RetryEINTR(connect)( + connected, reinterpret_cast(&bind_addr), + sizeof(bind_addr))) == -1 && + errno == EINPROGRESS + ? 0 + : connect_result); + MaybeSave(); // Successful connect. + + if (connect_result == -1) { + struct pollfd connect_poll = {connected, POLLOUT | POLLERR | POLLHUP, 0}; + RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(poll)(&connect_poll, 1, 0)); + int error = 0; + socklen_t errorlen = sizeof(error); + RETURN_ERROR_IF_SYSCALL_FAIL( + getsockopt(connected, SOL_SOCKET, SO_ERROR, &error, &errorlen)); + errno = error; + RETURN_ERROR_IF_SYSCALL_FAIL( + /* connect */ error == 0 ? 0 : -1); + } + + int accepted = -1; + struct pollfd accept_poll = {bound, POLLIN, 0}; + while (accepted == -1) { + RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(poll)(&accept_poll, 1, 0)); + + RETURN_ERROR_IF_SYSCALL_FAIL( + (accepted = RetryEINTR(accept4)( + bound, nullptr, nullptr, type & (SOCK_NONBLOCK | SOCK_CLOEXEC))) == + -1 && + errno == EAGAIN + ? 0 + : accepted); + } + MaybeSave(); // Successful accept. + + // FIXME + if (connect_result == -1) { + absl::SleepFor(absl::Seconds(1)); + } + + // Cleanup no longer needed resources. + RETURN_ERROR_IF_SYSCALL_FAIL(close(bound)); + MaybeSave(); // Successful close. + + T extra_addr = {}; + LocalhostAddr(&extra_addr, dual_stack); + return absl::make_unique(connected, accepted, bind_addr, + extra_addr); +} + +Creator TCPAcceptBindSocketPairCreator(int domain, int type, + int protocol, + bool dual_stack) { + return [=]() -> PosixErrorOr> { + int bound; + RETURN_ERROR_IF_SYSCALL_FAIL(bound = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + + int connected; + RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. 
+ + if (domain == AF_INET) { + return CreateTCPAcceptBindSocketPair(bound, connected, type, + dual_stack); + } + return CreateTCPAcceptBindSocketPair(bound, connected, type, + dual_stack); + }; +} + +template +PosixErrorOr> +CreateUDPBidirectionalBindSocketPair(int sock1, int sock2, int type, + bool dual_stack) { + ASSIGN_OR_RETURN_ERRNO(T addr1, BindIP(sock1, dual_stack)); + ASSIGN_OR_RETURN_ERRNO(T addr2, BindIP(sock2, dual_stack)); + + // Connect sock1 to sock2. + RETURN_ERROR_IF_SYSCALL_FAIL(connect( + sock1, reinterpret_cast(&addr2), sizeof(addr2))); + MaybeSave(); // Successful connection. + + // Connect sock2 to sock1. + RETURN_ERROR_IF_SYSCALL_FAIL(connect( + sock2, reinterpret_cast(&addr1), sizeof(addr1))); + MaybeSave(); // Successful connection. + + return absl::make_unique(sock1, sock2, addr1, addr2); +} + +Creator UDPBidirectionalBindSocketPairCreator(int domain, int type, + int protocol, + bool dual_stack) { + return [=]() -> PosixErrorOr> { + int sock1; + RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + + int sock2; + RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + + if (domain == AF_INET) { + return CreateUDPBidirectionalBindSocketPair( + sock1, sock2, type, dual_stack); + } + return CreateUDPBidirectionalBindSocketPair(sock1, sock2, + type, dual_stack); + }; +} + +SocketPairKind Reversed(SocketPairKind const& base) { + auto const& creator = base.creator; + return SocketPairKind{ + absl::StrCat("reversed ", base.description), + [creator]() -> PosixErrorOr> { + ASSIGN_OR_RETURN_ERRNO(auto creator_value, creator()); + return absl::make_unique(std::move(creator_value)); + }}; +} + +std::vector IncludeReversals(std::vector vec) { + return ApplyVecToVec(std::vector{NoOp, Reversed}, + vec); +} + +SocketPairKind NoOp(SocketPairKind const& base) { return base; } + +void TransferTest(int fd1, int fd2) { + char buf1[20]; + RandomizeBuffer(buf1, sizeof(buf1)); + ASSERT_THAT(WriteFd(fd1, buf1, sizeof(buf1)), + SyscallSucceedsWithValue(sizeof(buf1))); + + char buf2[20]; + ASSERT_THAT(ReadFd(fd2, buf2, sizeof(buf2)), + SyscallSucceedsWithValue(sizeof(buf2))); + + EXPECT_EQ(0, memcmp(buf1, buf2, sizeof(buf1))); + + RandomizeBuffer(buf1, sizeof(buf1)); + ASSERT_THAT(WriteFd(fd2, buf1, sizeof(buf1)), + SyscallSucceedsWithValue(sizeof(buf1))); + + ASSERT_THAT(ReadFd(fd1, buf2, sizeof(buf2)), + SyscallSucceedsWithValue(sizeof(buf2))); + + EXPECT_EQ(0, memcmp(buf1, buf2, sizeof(buf1))); +} + +// Initializes the given buffer with random data. +void RandomizeBuffer(char* ptr, size_t len) { + uint32_t seed = time(nullptr); + for (size_t i = 0; i < len; ++i) { + ptr[i] = static_cast(rand_r(&seed)); + } +} + +size_t CalculateUnixSockAddrLen(const char* sun_path) { + // Abstract addresses always return the full length. + if (sun_path[0] == 0) { + return sizeof(sockaddr_un); + } + // Filesystem addresses use the address length plus the 2 byte sun_family and + // null terminator. 
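// Editor's note (descriptive comment, not part of the patch): for example, a
// filesystem path "/tmp/sock" (9 bytes) yields 2 (sun_family) + 9 (path) +
// 1 (trailing NUL) = 12, which is what the strlen-based expression below
// computes.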
+ return strlen(sun_path) + 3; +} + +struct sockaddr_storage AddrFDSocketPair::to_storage(const sockaddr_un& addr) { + struct sockaddr_storage addr_storage = {}; + memcpy(&addr_storage, &addr, sizeof(addr)); + return addr_storage; +} + +struct sockaddr_storage AddrFDSocketPair::to_storage(const sockaddr_in& addr) { + struct sockaddr_storage addr_storage = {}; + memcpy(&addr_storage, &addr, sizeof(addr)); + return addr_storage; +} + +struct sockaddr_storage AddrFDSocketPair::to_storage(const sockaddr_in6& addr) { + struct sockaddr_storage addr_storage = {}; + memcpy(&addr_storage, &addr, sizeof(addr)); + return addr_storage; +} + +SocketKind SimpleSocket(int fam, int type, int proto) { + return SocketKind{ + absl::StrCat("Family ", fam, ", type ", type, ", proto ", proto), + SyscallSocketCreator(fam, type, proto)}; +} + +ssize_t SendLargeSendMsg(const std::unique_ptr& sockets, + size_t size, bool reader) { + const int rfd = sockets->second_fd(); + ScopedThread t([rfd, size, reader] { + if (!reader) { + return; + } + + // Potentially too many syscalls in the loop. + const DisableSave ds; + + std::vector buf(size); + size_t total = 0; + + while (total < size) { + int ret = read(rfd, buf.data(), buf.size()); + if (ret == -1 && errno == EAGAIN) { + continue; + } + if (ret > 0) { + total += ret; + } + + // Assert to return on first failure. + ASSERT_THAT(ret, SyscallSucceeds()); + } + }); + + std::vector buf(size); + + struct iovec iov = {}; + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + return RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0); +} + +PosixErrorOr PortAvailable(int port, AddressFamily family, SocketType type, + bool reuse_addr) { + if (port < 0) { + return PosixError(EINVAL, "Invalid port"); + } + + // Both Ipv6 and Dualstack are AF_INET6. + int sock_fam = (family == AddressFamily::kIpv4 ? AF_INET : AF_INET6); + int sock_type = (type == SocketType::kTcp ? SOCK_STREAM : SOCK_DGRAM); + ASSIGN_OR_RETURN_ERRNO(auto fd, Socket(sock_fam, sock_type, 0)); + + if (reuse_addr) { + int one = 1; + RETURN_ERROR_IF_SYSCALL_FAIL( + setsockopt(fd.get(), SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))); + } + + // Try to bind. + sockaddr_storage storage = {}; + int storage_size = 0; + if (family == AddressFamily::kIpv4) { + sockaddr_in* addr = reinterpret_cast(&storage); + storage_size = sizeof(*addr); + addr->sin_family = AF_INET; + addr->sin_port = htons(port); + addr->sin_addr.s_addr = htonl(INADDR_ANY); + } else { + sockaddr_in6* addr = reinterpret_cast(&storage); + storage_size = sizeof(*addr); + addr->sin6_family = AF_INET6; + addr->sin6_port = htons(port); + if (family == AddressFamily::kDualStack) { + inet_pton(AF_INET6, "::ffff:0.0.0.0", + reinterpret_cast(&addr->sin6_addr.s6_addr)); + } else { + addr->sin6_addr = in6addr_any; + } + } + + RETURN_ERROR_IF_SYSCALL_FAIL( + bind(fd.get(), reinterpret_cast(&storage), storage_size)); + + // If the user specified 0 as the port, we will return the port that the + // kernel gave us, otherwise we will validate that this socket bound to the + // requested port. 
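// Editor's note (descriptive comment, not part of the patch): binding with
// port 0 is the conventional way to ask the kernel for an ephemeral port, and
// getsockname() on the bound descriptor, as done below, is how the actually
// assigned port is discovered.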
+ sockaddr_storage bound_storage = {}; + socklen_t bound_storage_size = sizeof(bound_storage); + RETURN_ERROR_IF_SYSCALL_FAIL( + getsockname(fd.get(), reinterpret_cast(&bound_storage), + &bound_storage_size)); + + int available_port = -1; + if (bound_storage.ss_family == AF_INET) { + sockaddr_in* addr = reinterpret_cast(&bound_storage); + available_port = ntohs(addr->sin_port); + } else if (bound_storage.ss_family == AF_INET6) { + sockaddr_in6* addr = reinterpret_cast(&bound_storage); + available_port = ntohs(addr->sin6_port); + } else { + return PosixError(EPROTOTYPE, "Getsockname returned invalid family"); + } + + // If we requested a specific port make sure our bound port is that port. + if (port != 0 && available_port != port) { + return PosixError(EINVAL, + absl::StrCat("Bound port ", available_port, + " was not equal to requested port ", port)); + } + + // If we're trying to do a TCP socket, let's also try to listen. + if (type == SocketType::kTcp) { + RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd.get(), 1)); + } + + return available_port; +} + +PosixError FreeAvailablePort(int port) { + return NoError(); +} + +PosixErrorOr SendMsg(int sock, msghdr* msg, char buf[], int buf_size) { + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = buf_size; + msg->msg_iov = &iov; + msg->msg_iovlen = 1; + + int ret; + RETURN_ERROR_IF_SYSCALL_FAIL(ret = RetryEINTR(sendmsg)(sock, msg, 0)); + return ret; +} + +void RecvNoData(int sock) { + char data = 0; + struct iovec iov; + iov.iov_base = &data; + iov.iov_len = 1; + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h new file mode 100644 index 000000000..e3e741478 --- /dev/null +++ b/test/syscalls/linux/socket_test_util.h @@ -0,0 +1,449 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_SOCKET_TEST_UTIL_H_ +#define GVISOR_TEST_SYSCALLS_SOCKET_TEST_UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/str_format.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Wrapper for socket(2) that returns a FileDescriptor. +inline PosixErrorOr Socket(int family, int type, int protocol) { + int fd = socket(family, type, protocol); + MaybeSave(); + if (fd < 0) { + return PosixError( + errno, absl::StrFormat("socket(%d, %d, %d)", family, type, protocol)); + } + return FileDescriptor(fd); +} + +// Wrapper for accept(2) that returns a FileDescriptor. 
+// Wrapper for accept(2) that returns a FileDescriptor.
+inline PosixErrorOr<FileDescriptor> Accept(int sockfd, sockaddr* addr,
+                                           socklen_t* addrlen) {
+  int fd = RetryEINTR(accept)(sockfd, addr, addrlen);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(
+        errno, absl::StrFormat("accept(%d, %p, %p)", sockfd, addr, addrlen));
+  }
+  return FileDescriptor(fd);
+}
+
+// Wrapper for accept4(2) that returns a FileDescriptor.
+inline PosixErrorOr<FileDescriptor> Accept4(int sockfd, sockaddr* addr,
+                                            socklen_t* addrlen, int flags) {
+  int fd = RetryEINTR(accept4)(sockfd, addr, addrlen, flags);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(errno, absl::StrFormat("accept4(%d, %p, %p, %#x)", sockfd,
+                                             addr, addrlen, flags));
+  }
+  return FileDescriptor(fd);
+}
+
+inline ssize_t SendFd(int fd, void* buf, size_t count, int flags) {
+  return internal::ApplyFileIoSyscall(
+      [&](size_t completed) {
+        return sendto(fd, static_cast<char*>(buf) + completed,
+                      count - completed, flags, nullptr, 0);
+      },
+      count);
+}
+
+// A Creator<T> is a function that attempts to create and return a new T. (This
+// is copy/pasted from cloud/gvisor/api/sandbox_util.h and is just duplicated
+// here for clarity.)
+template <typename T>
+using Creator = std::function<PosixErrorOr<std::unique_ptr<T>>()>;
+
+// A SocketPair represents a pair of socket file descriptors owned by the
+// SocketPair.
+class SocketPair {
+ public:
+  virtual ~SocketPair() = default;
+
+  virtual int first_fd() const = 0;
+  virtual int second_fd() const = 0;
+  virtual int release_first_fd() = 0;
+  virtual int release_second_fd() = 0;
+  virtual const struct sockaddr* first_addr() const = 0;
+  virtual const struct sockaddr* second_addr() const = 0;
+  virtual size_t first_addr_size() const = 0;
+  virtual size_t second_addr_size() const = 0;
+  virtual size_t first_addr_len() const = 0;
+  virtual size_t second_addr_len() const = 0;
+};
+
+// An FDSocketPair is a SocketPair that consists of only a pair of file
+// descriptors.
+class FDSocketPair : public SocketPair {
+ public:
+  FDSocketPair(int first_fd, int second_fd)
+      : first_(first_fd), second_(second_fd) {}
+
+  int first_fd() const override { return first_.get(); }
+  int second_fd() const override { return second_.get(); }
+  int release_first_fd() override { return first_.release(); }
+  int release_second_fd() override { return second_.release(); }
+  const struct sockaddr* first_addr() const override { return nullptr; }
+  const struct sockaddr* second_addr() const override { return nullptr; }
+  size_t first_addr_size() const override { return 0; }
+  size_t second_addr_size() const override { return 0; }
+  size_t first_addr_len() const override { return 0; }
+  size_t second_addr_len() const override { return 0; }
+
+ private:
+  FileDescriptor first_;
+  FileDescriptor second_;
+};
+
+// CalculateUnixSockAddrLen calculates the length returned by recvfrom(2) and
+// recvmsg(2) for Unix sockets.
+size_t CalculateUnixSockAddrLen(const char* sun_path);
+
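The SocketPair interface is what lets the parameterized tests later in this change stay agnostic to how a pair was produced, whether by socketpair(2), bind()/connect() over filesystem or abstract paths, or TCP/UDP loopback. A small illustrative helper, not part of the patch, written purely against the interface (ReadFd and WriteFd are the EINTR-retrying I/O helpers these tests already use):

void EchoOnce(SocketPair* sockets) {
  // Write one byte into the first end and expect it out of the second end.
  char sent = 'x';
  ASSERT_THAT(WriteFd(sockets->first_fd(), &sent, 1),
              SyscallSucceedsWithValue(1));
  char received = 0;
  ASSERT_THAT(ReadFd(sockets->second_fd(), &received, 1),
              SyscallSucceedsWithValue(1));
  EXPECT_EQ(sent, received);
}

Like the TransferTest() helper declared further down, a function of this shape would normally be wrapped in ASSERT_NO_FATAL_FAILURE() at the call site.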
+// An AddrFDSocketPair is a SocketPair that consists of a pair of file
+// descriptors in addition to a pair of socket addresses.
+class AddrFDSocketPair : public SocketPair {
+ public:
+  AddrFDSocketPair(int first_fd, int second_fd,
+                   const struct sockaddr_un& first_address,
+                   const struct sockaddr_un& second_address)
+      : first_(first_fd),
+        second_(second_fd),
+        first_addr_(to_storage(first_address)),
+        second_addr_(to_storage(second_address)),
+        first_len_(CalculateUnixSockAddrLen(first_address.sun_path)),
+        second_len_(CalculateUnixSockAddrLen(second_address.sun_path)),
+        first_size_(sizeof(first_address)),
+        second_size_(sizeof(second_address)) {}
+
+  AddrFDSocketPair(int first_fd, int second_fd,
+                   const struct sockaddr_in& first_address,
+                   const struct sockaddr_in& second_address)
+      : first_(first_fd),
+        second_(second_fd),
+        first_addr_(to_storage(first_address)),
+        second_addr_(to_storage(second_address)),
+        first_len_(sizeof(first_address)),
+        second_len_(sizeof(second_address)),
+        first_size_(sizeof(first_address)),
+        second_size_(sizeof(second_address)) {}
+
+  AddrFDSocketPair(int first_fd, int second_fd,
+                   const struct sockaddr_in6& first_address,
+                   const struct sockaddr_in6& second_address)
+      : first_(first_fd),
+        second_(second_fd),
+        first_addr_(to_storage(first_address)),
+        second_addr_(to_storage(second_address)),
+        first_len_(sizeof(first_address)),
+        second_len_(sizeof(second_address)),
+        first_size_(sizeof(first_address)),
+        second_size_(sizeof(second_address)) {}
+
+  int first_fd() const override { return first_.get(); }
+  int second_fd() const override { return second_.get(); }
+  int release_first_fd() override { return first_.release(); }
+  int release_second_fd() override { return second_.release(); }
+  const struct sockaddr* first_addr() const override {
+    return reinterpret_cast<const sockaddr*>(&first_addr_);
+  }
+  const struct sockaddr* second_addr() const override {
+    return reinterpret_cast<const sockaddr*>(&second_addr_);
+  }
+  size_t first_addr_size() const override { return first_size_; }
+  size_t second_addr_size() const override { return second_size_; }
+  size_t first_addr_len() const override { return first_len_; }
+  size_t second_addr_len() const override { return second_len_; }
+
+ private:
+  // to_storage converts a sockaddr_* to a sockaddr_storage.
+  static struct sockaddr_storage to_storage(const sockaddr_un& addr);
+  static struct sockaddr_storage to_storage(const sockaddr_in& addr);
+  static struct sockaddr_storage to_storage(const sockaddr_in6& addr);
+
+  FileDescriptor first_;
+  FileDescriptor second_;
+  const struct sockaddr_storage first_addr_;
+  const struct sockaddr_storage second_addr_;
+  const size_t first_len_;
+  const size_t second_len_;
+  const size_t first_size_;
+  const size_t second_size_;
+};
+
+// SyscallSocketPairCreator returns a Creator<SocketPair> that obtains file
+// descriptors by invoking the socketpair() syscall.
+Creator<SocketPair> SyscallSocketPairCreator(int domain, int type,
+                                             int protocol);
+
+// SyscallSocketCreator returns a Creator<FileDescriptor> that obtains a file
+// descriptor by invoking the socket() syscall.
+Creator<FileDescriptor> SyscallSocketCreator(int domain, int type,
+                                             int protocol);
+
+// FilesystemBidirectionalBindSocketPairCreator returns a Creator<SocketPair>
+// that obtains file descriptors by invoking the bind() and connect() syscalls
+// on filesystem paths. Only works for DGRAM sockets.
+Creator<SocketPair> FilesystemBidirectionalBindSocketPairCreator(int domain,
+                                                                 int type,
+                                                                 int protocol);
+
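Note that these factories return deferred Creator<SocketPair> callables rather than live sockets, so the kind lists fed to INSTANTIATE_TEST_CASE_P can be assembled up front without opening any descriptors until a test body runs. A rough sketch of what a socketpair(2)-backed creator plausibly looks like; the real definitions live in socket_test_util.cc and this example, including its name, is an assumption for illustration:

Creator<SocketPair> ExampleSocketPairCreator(int domain, int type,
                                             int protocol) {
  return [=]() -> PosixErrorOr<std::unique_ptr<SocketPair>> {
    // Defer the syscall until the test actually asks for a pair.
    int pair[2];
    RETURN_ERROR_IF_SYSCALL_FAIL(socketpair(domain, type, protocol, pair));
    MaybeSave();  // Save/restore checkpoint hook, as in the wrappers above.
    return std::unique_ptr<SocketPair>(new FDSocketPair(pair[0], pair[1]));
  };
}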
+// AbstractBidirectionalBindSocketPairCreator returns a Creator<SocketPair>
+// that obtains file descriptors by invoking the bind() and connect() syscalls
+// on abstract namespace paths. Only works for DGRAM sockets.
+Creator<SocketPair> AbstractBidirectionalBindSocketPairCreator(int domain,
+                                                               int type,
+                                                               int protocol);
+
+// SocketpairGoferSocketPairCreator returns a Creator<SocketPair> that obtains
+// file descriptors by invoking connect() on two sockets bound to socketpair
+// gofer paths.
+Creator<SocketPair> SocketpairGoferSocketPairCreator(int domain, int type,
+                                                     int protocol);
+
+// SocketpairGoferFileSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by open() syscalls on socketpair gofer paths.
+Creator<SocketPair> SocketpairGoferFileSocketPairCreator(int flags);
+
+// FilesystemAcceptBindSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by invoking the accept() and bind() syscalls on
+// a filesystem path. Only works for STREAM and SEQPACKET sockets.
+Creator<SocketPair> FilesystemAcceptBindSocketPairCreator(int domain, int type,
+                                                          int protocol);
+
+// AbstractAcceptBindSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by invoking the accept() and bind() syscalls on an
+// abstract namespace path. Only works for STREAM and SEQPACKET sockets.
+Creator<SocketPair> AbstractAcceptBindSocketPairCreator(int domain, int type,
+                                                        int protocol);
+
+// FilesystemUnboundSocketPairCreator returns a Creator<SocketPair> that obtains
+// file descriptors by invoking the socket() syscall and generates a filesystem
+// path for binding.
+Creator<SocketPair> FilesystemUnboundSocketPairCreator(int domain, int type,
+                                                       int protocol);
+
+// AbstractUnboundSocketPairCreator returns a Creator<SocketPair> that obtains
+// file descriptors by invoking the socket() syscall and generates an abstract
+// path for binding.
+Creator<SocketPair> AbstractUnboundSocketPairCreator(int domain, int type,
+                                                     int protocol);
+
+// TCPAcceptBindSocketPairCreator returns a Creator<SocketPair> that obtains
+// file descriptors by invoking the accept() and bind() syscalls on TCP sockets.
+Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
+                                                   int protocol,
+                                                   bool dual_stack);
+
+// UDPBidirectionalBindSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by invoking the bind() and connect() syscalls on
+// UDP sockets.
+Creator<SocketPair> UDPBidirectionalBindSocketPairCreator(int domain, int type,
+                                                          int protocol,
+                                                          bool dual_stack);
+
+// A SocketPairKind couples a human-readable description of a socket pair with
+// a function that creates such a socket pair.
+struct SocketPairKind {
+  std::string description;
+  Creator<SocketPair> creator;
+
+  // Create creates a socket pair of this kind.
+  PosixErrorOr<std::unique_ptr<SocketPair>> Create() const { return creator(); }
+};
+
+// A SocketKind couples a human-readable description of a socket with
+// a function that creates such a socket.
+struct SocketKind {
+  std::string description;
+  Creator<FileDescriptor> creator;
+
+  // Create creates a socket of this kind.
+  PosixErrorOr<std::unique_ptr<FileDescriptor>> Create() const {
+    return creator();
+  }
+};
+
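Pairing a human-readable description with a creator is what makes failures in the parameterized suites below traceable to a concrete socket configuration, since the fixtures print the description before running each case. An illustrative construction (the test name is hypothetical; the UnixDomainSocketPair helper used by the tests in this change is assumed to build a similar kind):

TEST(SocketPairKindExample, CreatesSeqpacketPair) {
  SocketPairKind kind = {
      "AF_UNIX SOCK_SEQPACKET socketpair",
      SyscallSocketPairCreator(AF_UNIX, SOCK_SEQPACKET, 0)};
  // Creating the pair only happens here, when Create() invokes the creator.
  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(kind.Create());
  EXPECT_GE(sockets->first_fd(), 0);
  EXPECT_GE(sockets->second_fd(), 0);
}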
+// A ReversedSocketPair wraps another SocketPair but flips the first and second
+// file descriptors. ReversedSocketPair is used to test socket pairs that
+// should be symmetric.
+class ReversedSocketPair : public SocketPair {
+ public:
+  explicit ReversedSocketPair(std::unique_ptr<SocketPair> base)
+      : base_(std::move(base)) {}
+
+  int first_fd() const override { return base_->second_fd(); }
+  int second_fd() const override { return base_->first_fd(); }
+  int release_first_fd() override { return base_->release_second_fd(); }
+  int release_second_fd() override { return base_->release_first_fd(); }
+  const struct sockaddr* first_addr() const override {
+    return base_->second_addr();
+  }
+  const struct sockaddr* second_addr() const override {
+    return base_->first_addr();
+  }
+  size_t first_addr_size() const override { return base_->second_addr_size(); }
+  size_t second_addr_size() const override { return base_->first_addr_size(); }
+  size_t first_addr_len() const override { return base_->second_addr_len(); }
+  size_t second_addr_len() const override { return base_->first_addr_len(); }
+
+ private:
+  std::unique_ptr<SocketPair> base_;
+};
+
+// Reversed returns a SocketPairKind that represents SocketPairs created by
+// flipping the file descriptors provided by another SocketPair.
+SocketPairKind Reversed(SocketPairKind const& base);
+
+// IncludeReversals returns a vector that contains all SocketPairKinds in `vec`
+// as well as all SocketPairKinds obtained by flipping the file descriptors
+// provided by the kinds in `vec`.
+std::vector<SocketPairKind> IncludeReversals(std::vector<SocketPairKind> vec);
+
+// A Middleware is a function that wraps a SocketPairKind.
+using Middleware = std::function<SocketPairKind(SocketPairKind)>;
+
+// SetSockOpt returns a Middleware that creates SocketPairs the same way as the
+// wrapped SocketPairKind, then calls setsockopt(2) with the given level,
+// option, and value on both file descriptors of each created pair.
+template <typename T>
+Middleware SetSockOpt(int level, int optname, T* value) {
+  return [=](SocketPairKind const& base) {
+    auto const& creator = base.creator;
+    return SocketPairKind{
+        absl::StrCat("setsockopt(", level, ", ", optname, ", ", *value, ") ",
+                     base.description),
+        [creator, level, optname,
+         value]() -> PosixErrorOr<std::unique_ptr<SocketPair>> {
+          ASSIGN_OR_RETURN_ERRNO(auto creator_value, creator());
+          if (creator_value->first_fd() >= 0) {
+            RETURN_ERROR_IF_SYSCALL_FAIL(setsockopt(
+                creator_value->first_fd(), level, optname, value, sizeof(T)));
+          }
+          if (creator_value->second_fd() >= 0) {
+            RETURN_ERROR_IF_SYSCALL_FAIL(setsockopt(
+                creator_value->second_fd(), level, optname, value, sizeof(T)));
+          }
+          return creator_value;
+        }};
+  };
+}
+
+constexpr int kSockOptOn = 1;
+constexpr int kSockOptOff = 0;
+
+// NoOp returns the same SocketPairKind that it is passed.
+SocketPairKind NoOp(SocketPairKind const& base);
+
+// TransferTest tests that data can be sent back and forth between two
+// specified FDs. Note that calls to this function should be wrapped in
+// ASSERT_NO_FATAL_FAILURE().
+void TransferTest(int fd1, int fd2);
+
+// Fills [buf, buf+len) with random bytes.
+void RandomizeBuffer(char* buf, size_t len);
+
+// Base test fixture for tests that operate on pairs of connected sockets.
+class SocketPairTest : public ::testing::TestWithParam<SocketPairKind> {
+ protected:
+  SocketPairTest() {
+    // gUnit uses printf, so will we.
+    printf("Testing with %s\n", GetParam().description.c_str());
+  }
+
+  PosixErrorOr<std::unique_ptr<SocketPair>> NewSocketPair() const {
+    return GetParam().Create();
+  }
+};
+
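The SetSockOpt middleware above decorates an existing kind: pairs are still created by the wrapped creator, then the option is applied to both ends and recorded in the description. Because the lambda captures the value pointer, the pointed-to int has to outlive the returned kind, which is why kSockOptOn and kSockOptOff are file-scope constants. A short illustrative composition (WithPassCred is a hypothetical name, not part of the patch):

SocketPairKind WithPassCred(SocketPairKind const& base) {
  // Every pair created from the returned kind has SO_PASSCRED enabled on
  // both file descriptors before the test body ever sees it.
  return SetSockOpt(SOL_SOCKET, SO_PASSCRED, &kSockOptOn)(base);
}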
+ printf("Testing with %s\n", GetParam().description.c_str()); + } + + PosixErrorOr> NewSocket() const { + return GetParam().Create(); + } +}; + +SocketKind SimpleSocket(int fam, int type, int proto); + +// Send a buffer of size 'size' to sockets->first_fd(), returning the result of +// sendmsg. +// +// If reader, read from second_fd() until size bytes have been read. +ssize_t SendLargeSendMsg(const std::unique_ptr& sockets, + size_t size, bool reader); + +// Initializes the given buffer with random data. +void RandomizeBuffer(char* ptr, size_t len); + +enum class AddressFamily { kIpv4 = 1, kIpv6 = 2, kDualStack = 3 }; +enum class SocketType { kUdp = 1, kTcp = 2 }; + +// Returns a PosixError or a port that is available. If 0 is specified as the +// port it will bind port 0 (and allow the kernel to select any free port). +// Otherwise, it will try to bind the specified port and validate that it can be +// used for the requested family and socket type. The final option is +// reuse_addr. This specifies whether SO_REUSEADDR should be applied before a +// bind(2) attempt. SO_REUSEADDR means that sockets in TIME_WAIT states or other +// bound UDP sockets would not cause an error on bind(2). This option should be +// set if subsequent calls to bind on the returned port will also use +// SO_REUSEADDR. +// +// Note: That this test will attempt to bind the ANY address for the respective +// protocol. +PosixErrorOr PortAvailable(int port, AddressFamily family, SocketType type, + bool reuse_addr); + +// FreeAvailablePort is used to return a port that was obtained by using +// the PortAvailable helper with port 0. +PosixError FreeAvailablePort(int port); + +// SendMsg converts a buffer to an iovec and adds it to msg before sending it. +PosixErrorOr SendMsg(int sock, msghdr* msg, char buf[], int buf_size); + +// RecvNoData checks that no data is receivable on sock. +void RecvNoData(int sock); + +// Base test fixture for tests that apply to all kinds of pairs of connected +// sockets. +using AllSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_SOCKET_TEST_UTIL_H_ diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc new file mode 100644 index 000000000..c60a965ae --- /dev/null +++ b/test/syscalls/linux/socket_unix.cc @@ -0,0 +1,1181 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_unix.h" + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +// This file is a generic socket test file. It must be built with another file +// that provides the test types. 
+ +namespace gvisor { +namespace testing { + +namespace { + +TEST_P(UnixSocketPairTest, BasicFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char received_data[20]; + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data, + sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd())); +} + +TEST_P(UnixSocketPairTest, BasicTwoFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair1 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + auto pair2 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + int sent_fds[] = {pair1->second_fd(), pair2->second_fd()}; + + ASSERT_NO_FATAL_FAILURE( + SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data))); + + char received_data[20]; + int received_fds[] = {-1, -1}; + + ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 2, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd())); + ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd())); +} + +TEST_P(UnixSocketPairTest, BasicThreeFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair1 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + auto pair2 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + auto pair3 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()}; + + ASSERT_NO_FATAL_FAILURE( + SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data))); + + char received_data[20]; + int received_fds[] = {-1, -1, -1}; + + ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 3, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd())); + ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd())); + ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[2], pair3->first_fd())); +} + +TEST_P(UnixSocketPairTest, BadFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + int sent_fd = -1; + + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(sent_fd))]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(sizeof(sent_fd)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd)); + + struct iovec iov; + iov.iov_base = sent_data; + iov.iov_len = sizeof(sent_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 
0), + SyscallFailsWithErrno(EBADF)); +} + +// BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass. +// The difference is that when calling recvmsg, no space for FDs is provided, +// only space for the cmsg header. +TEST_P(UnixSocketPairTest, BasicFDPassNoSpace) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char received_data[20]; + + struct msghdr msg = {}; + std::vector control(CMSG_SPACE(0)); + msg.msg_control = &control[0]; + msg.msg_controllen = control.size(); + + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0), + SyscallSucceedsWithValue(sizeof(received_data))); + + EXPECT_EQ(msg.msg_controllen, 0); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +// BasicFDPassUnalignedRecv starts off by sending a single FD just like +// BasicFDPass. The difference is that when calling recvmsg, the length of the +// receive data is only aligned on a 4 byte boundry instead of the normal 8. +TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char received_data[20]; + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvSingleFDUnaligned( + sockets->second_fd(), &fd, received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd())); +} + +TEST_P(UnixSocketPairTest, ConcurrentBasicFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + int sockfd1 = sockets->first_fd(); + auto recv_func = [sockfd1, sent_data]() { + char received_data[20]; + int fd = -1; + RecvSingleFD(sockfd1, &fd, received_data, sizeof(received_data)); + ASSERT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + char buf[20]; + ASSERT_THAT(ReadFd(fd, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + ASSERT_THAT(WriteFd(fd, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + }; + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->second_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + ScopedThread t(recv_func); + + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(WriteFd(pair->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[20]; + ASSERT_THAT(ReadFd(pair->first_fd(), received_data, sizeof(received_data)), + SyscallSucceedsWithValue(sizeof(received_data))); + + t.Join(); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +// FDPassNoRecv checks that the control message can be safely ignored by using +// read(2) instead of recvmsg(2). 
+TEST_P(UnixSocketPairTest, FDPassNoRecv) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[20];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+
+  auto pair =
+      ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+  ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+                                       sent_data, sizeof(sent_data)));
+
+  // Read while ignoring the passed FD.
+  char received_data[20];
+  ASSERT_THAT(
+      ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+      SyscallSucceedsWithValue(sizeof(received_data)));
+
+  EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+  // Check that the socket still works for reads and writes.
+  ASSERT_NO_FATAL_FAILURE(
+      TransferTest(sockets->first_fd(), sockets->second_fd()));
+}
+
+// FDPassInterspersed1 checks that sent control messages cannot be read before
+// their associated data has been read.
+TEST_P(UnixSocketPairTest, FDPassInterspersed1) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char written_data[20];
+  RandomizeBuffer(written_data, sizeof(written_data));
+
+  ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
+              SyscallSucceedsWithValue(sizeof(written_data)));
+
+  char sent_data[20];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+
+  auto pair =
+      ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+  ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+                                       sent_data, sizeof(sent_data)));
+
+  // Check that we don't get a control message, but do get the data.
+  char received_data[20];
+  RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
+  EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
+}
+
+// FDPassInterspersed2 checks that sent control messages cannot be read after
+// their associated data has been read, even when that data is consumed with
+// read(2) instead of recvmsg(2).
+TEST_P(UnixSocketPairTest, FDPassInterspersed2) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char written_data[20]; + RandomizeBuffer(written_data, sizeof(written_data)); + ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)), + SyscallSucceedsWithValue(sizeof(written_data))); + + char received_data[20]; + ASSERT_THAT( + ReadFd(sockets->second_fd(), received_data, sizeof(received_data)), + SyscallSucceedsWithValue(sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data))); +} + +TEST_P(UnixSocketPairTest, FDPassNotCoalesced) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + auto pair1 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(), + sent_data1, sizeof(sent_data1))); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + auto pair2 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(), + sent_data2, sizeof(sent_data2))); + + char received_data1[sizeof(sent_data1) + sizeof(sent_data2)]; + int received_fd1 = -1; + + RecvSingleFD(sockets->second_fd(), &received_fd1, received_data1, + sizeof(received_data1), sizeof(sent_data1)); + + EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1))); + TransferTest(pair1->first_fd(), pair1->second_fd()); + + char received_data2[sizeof(sent_data1) + sizeof(sent_data2)]; + int received_fd2 = -1; + + RecvSingleFD(sockets->second_fd(), &received_fd2, received_data2, + sizeof(received_data2), sizeof(sent_data2)); + + EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2))); + TransferTest(pair2->first_fd(), pair2->second_fd()); +} + +TEST_P(UnixSocketPairTest, FDPassPeek) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char peek_data[20]; + int peek_fd = -1; + PeekSingleFD(sockets->second_fd(), &peek_fd, peek_data, sizeof(peek_data)); + EXPECT_EQ(0, memcmp(sent_data, peek_data, sizeof(sent_data))); + TransferTest(peek_fd, pair->first_fd()); + EXPECT_THAT(close(peek_fd), SyscallSucceeds()); + + char received_data[20]; + int received_fd = -1; + RecvSingleFD(sockets->second_fd(), &received_fd, received_data, + sizeof(received_data)); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + TransferTest(received_fd, pair->first_fd()); + EXPECT_THAT(close(received_fd), SyscallSucceeds()); +} + +TEST_P(UnixSocketPairTest, BasicCredPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + 
struct ucred sent_creds; + + ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds()); + + ASSERT_NO_FATAL_FAILURE( + SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[20]; + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + EXPECT_EQ(sent_creds.pid, received_creds.pid); + EXPECT_EQ(sent_creds.uid, received_creds.uid); + EXPECT_EQ(sent_creds.gid, received_creds.gid); +} + +TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredRecvEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[20]; + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds { + 0, 65534, 65534 + }; + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixSocketPairTest, SendNullCredsAfterSoPassCredRecvEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + SetSoPassCred(sockets->second_fd()); + + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data))); + + char received_data[20]; + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds; + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredSendEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data))); + + SetSoPassCred(sockets->first_fd()); + + char received_data[20]; + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(UnixSocketPairTest, SendNullCredsAfterSoPassCredSendEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + SetSoPassCred(sockets->first_fd()); + + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data))); + + char received_data[20]; + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, 
received_data, sizeof(sent_data))); +} + +TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredRecvEndAfterSendEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + SetSoPassCred(sockets->first_fd()); + + ASSERT_NO_FATAL_FAILURE( + SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[20]; + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds; + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredRecvEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[20]; + + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds { + 0, 65534, 65534 + }; + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixSocketPairTest, WriteAfterSoPassCredRecvEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + SetSoPassCred(sockets->second_fd()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[20]; + + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds; + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredSendEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + SetSoPassCred(sockets->first_fd()); + + char received_data[20]; + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(UnixSocketPairTest, WriteAfterSoPassCredSendEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + 
SetSoPassCred(sockets->first_fd()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[20]; + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredRecvEndAfterSendEnd) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + SetSoPassCred(sockets->first_fd()); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[20]; + + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds; + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixSocketPairTest, SoPassCred) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int opt; + socklen_t optLen = sizeof(opt); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen), + SyscallSucceeds()); + EXPECT_FALSE(opt); + + optLen = sizeof(opt); + EXPECT_THAT( + getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen), + SyscallSucceeds()); + EXPECT_FALSE(opt); + + SetSoPassCred(sockets->first_fd()); + + optLen = sizeof(opt); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen), + SyscallSucceeds()); + EXPECT_TRUE(opt); + + optLen = sizeof(opt); + EXPECT_THAT( + getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen), + SyscallSucceeds()); + EXPECT_FALSE(opt); + + int zero = 0; + EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &zero, + sizeof(zero)), + SyscallSucceeds()); + + optLen = sizeof(opt); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen), + SyscallSucceeds()); + EXPECT_FALSE(opt); + + optLen = sizeof(opt); + EXPECT_THAT( + getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen), + SyscallSucceeds()); + EXPECT_FALSE(opt); +} + +TEST_P(UnixSocketPairTest, NoDataCredPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + struct msghdr msg = {}; + + struct iovec iov; + iov.iov_base = sent_data; + iov.iov_len = sizeof(sent_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + char control[CMSG_SPACE(0)]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + cmsg->cmsg_len = CMSG_LEN(0); + + ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(UnixSocketPairTest, NoPassCred) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + 
RandomizeBuffer(sent_data, sizeof(sent_data)); + + struct ucred sent_creds; + + ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds()); + + ASSERT_NO_FATAL_FAILURE( + SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data))); + + char received_data[20]; + + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(UnixSocketPairTest, CredAndFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + struct ucred sent_creds; + + ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds()); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendCredsAndFD(sockets->first_fd(), sent_creds, + pair->second_fd(), sent_data, + sizeof(sent_data))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[20]; + struct ucred received_creds; + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds, + &fd, received_data, + sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + EXPECT_EQ(sent_creds.pid, received_creds.pid); + EXPECT_EQ(sent_creds.uid, received_creds.uid); + EXPECT_EQ(sent_creds.gid, received_creds.gid); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd())); +} + +TEST_P(UnixSocketPairTest, FDPassBeforeSoPassCred) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[20]; + struct ucred received_creds; + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds, + &fd, received_data, + sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds { + 0, 65534, 65534 + }; + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd())); +} + +TEST_P(UnixSocketPairTest, FDPassAfterSoPassCred) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + SetSoPassCred(sockets->second_fd()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char received_data[20]; + struct ucred received_creds; + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds, + &fd, received_data, + sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + struct ucred want_creds; + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), 
SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd())); +} + +TEST_P(UnixSocketPairTest, CloexecDroppedWhenFDPassed) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = ASSERT_NO_ERRNO_AND_VALUE( + UnixDomainSocketPair(SOCK_SEQPACKET | SOCK_CLOEXEC).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char received_data[20]; + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data, + sizeof(received_data))); + + EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(0)); +} + +TEST_P(UnixSocketPairTest, CloexecRecvFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(int))]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct iovec iov; + char received_data[20]; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CMSG_CLOEXEC), + SyscallSucceedsWithValue(sizeof(received_data))); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int))); + ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + int fd = -1; + memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); + + EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC)); +} + +TEST_P(UnixSocketPairTest, FDPassAfterSoPassCredWithoutCredSpace) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + SetSoPassCred(sockets->second_fd()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + struct msghdr msg = {}; + char control[CMSG_LEN(0)]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + char received_data[20]; + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0), + SyscallSucceedsWithValue(sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + EXPECT_EQ(msg.msg_controllen, sizeof(control)); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + EXPECT_EQ(cmsg->cmsg_len, sizeof(control)); + EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET); + EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS); +} + +// This test will validate that MSG_CTRUNC as an input flag to recvmsg will +// not appear as an output flag on the control message when truncation doesn't +// happen. 
+TEST_P(UnixSocketPairTest, MsgCtruncInputIsNoop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(int)) /* we're passing a single fd */]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct iovec iov; + char received_data[20]; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC), + SyscallSucceedsWithValue(sizeof(received_data))); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int))); + ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + // Now we should verify that MSG_CTRUNC wasn't set as an output flag. + EXPECT_EQ(msg.msg_flags & MSG_CTRUNC, 0); +} + +TEST_P(UnixSocketPairTest, FDPassAfterSoPassCredWithoutCredHeaderSpace) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + SetSoPassCred(sockets->second_fd()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + struct msghdr msg = {}; + char control[CMSG_LEN(0) / 2]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + char received_data[20]; + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0), + SyscallSucceedsWithValue(sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + EXPECT_EQ(msg.msg_controllen, 0); +} + +TEST_P(UnixSocketPairTest, InvalidGetSockOpt) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + int opt; + socklen_t optlen = sizeof(opt); + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, -1, &opt, &optlen), + SyscallFailsWithErrno(ENOPROTOOPT)); +} + +TEST_P(UnixSocketPairTest, BindToBadName) { + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + constexpr char kBadName[] = "/some/path/that/does/not/exist"; + sockaddr_un sockaddr; + sockaddr.sun_family = AF_LOCAL; + memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName)); + + EXPECT_THAT( + bind(pair->first_fd(), reinterpret_cast(&sockaddr), + sizeof(sockaddr)), + SyscallFailsWithErrno(ENOENT)); +} + +TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[10]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + char received_data[sizeof(sent_data) * 2]; + std::vector msgs(2); + std::vector iovs(msgs.size()); + const int chunk_size = sizeof(received_data) / msgs.size(); + for (size_t i = 0; i < msgs.size(); i++) { + iovs[i].iov_len = chunk_size; + iovs[i].iov_base = &received_data[i * chunk_size]; + msgs[i].msg_hdr.msg_iov = &iovs[i]; + msgs[i].msg_hdr.msg_iovlen = 1; + } + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)), + 
SyscallSucceedsWithValue(sizeof(sent_data))); + + struct timespec timeout = {0, 1}; + ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->second_fd(), &msgs[0], msgs.size(), + 0, &timeout), + SyscallSucceedsWithValue(1)); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + EXPECT_EQ(chunk_size, msgs[0].msg_len); +} + +TEST_P(UnixSocketPairTest, TIOCINQSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int size = -1; + EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds()); + EXPECT_EQ(size, 0); + + const char some_data[] = "dangerzone"; + ASSERT_THAT( + RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0), + SyscallSucceeds()); + EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds()); + EXPECT_EQ(size, sizeof(some_data)); + + // Linux only reports the first message's size, which is wrong. We test for + // the behavior described in the man page. + SKIP_IF(!IsRunningOnGvisor()); + + ASSERT_THAT( + RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0), + SyscallSucceeds()); + EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds()); + EXPECT_EQ(size, sizeof(some_data) * 2); +} + +TEST_P(UnixSocketPairTest, TIOCOUTQSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int size = -1; + EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds()); + EXPECT_EQ(size, 0); + + // Linux reports bogus numbers which are related to its internal allocations. + // We test for the behavior described in the man page. + SKIP_IF(!IsRunningOnGvisor()); + + const char some_data[] = "dangerzone"; + ASSERT_THAT( + RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0), + SyscallSucceeds()); + EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds()); + EXPECT_EQ(size, sizeof(some_data)); + + ASSERT_THAT( + RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0), + SyscallSucceeds()); + EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds()); + EXPECT_EQ(size, sizeof(some_data) * 2); +} + +TEST_P(UnixSocketPairTest, NetdeviceIoctlsSucceed) { + FileDescriptor sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_DGRAM, 0)); + + // Prepare the request. + struct ifreq ifr; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + + // Check that the ioctl either succeeds or fails with ENODEV. + int err = ioctl(sock.get(), SIOCGIFINDEX, &ifr); + if (err < 0) { + ASSERT_EQ(errno, ENODEV); + } +} + +TEST_P(UnixSocketPairTest, SocketShutdown) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char buf[20]; + const std::string data = "abc"; + ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), 3), + SyscallSucceedsWithValue(3)); + ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RDWR), SyscallSucceeds()); + ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RDWR), SyscallSucceeds()); + + // Shutting down a socket does not clear the buffer. 
+ ASSERT_THAT(ReadFd(sockets->second_fd(), buf, 3), + SyscallSucceedsWithValue(3)); + EXPECT_EQ(data, absl::string_view(buf, 3)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix.h b/test/syscalls/linux/socket_unix.h new file mode 100644 index 000000000..d2a16afb2 --- /dev/null +++ b/test/syscalls/linux/socket_unix.h @@ -0,0 +1,29 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected unix sockets. +using UnixSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_ diff --git a/test/syscalls/linux/socket_unix_abstract.cc b/test/syscalls/linux/socket_unix_abstract.cc new file mode 100644 index 000000000..0878f63ff --- /dev/null +++ b/test/syscalls/linux/socket_unix_abstract.cc @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_unix.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc new file mode 100644 index 000000000..93fb33832 --- /dev/null +++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_non_blocking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonBlockingSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc new file mode 100644 index 000000000..c17d3990f --- /dev/null +++ b/test/syscalls/linux/socket_unix_dgram.cc @@ -0,0 +1,45 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_unix_dgram.h" + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST_P(DgramUnixSocketPairTest, WriteOneSideClosed) { + // FIXME: gVisor datagram sockets return EPIPE instead of + // ECONNREFUSED. + SKIP_IF(IsRunningOnGvisor()); + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + constexpr char kStr[] = "abc"; + ASSERT_THAT(write(sockets->second_fd(), kStr, 3), + SyscallFailsWithErrno(ECONNREFUSED)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_dgram.h b/test/syscalls/linux/socket_unix_dgram.h new file mode 100644 index 000000000..722a3d8e6 --- /dev/null +++ b/test/syscalls/linux/socket_unix_dgram.h @@ -0,0 +1,29 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected dgram unix sockets. +using DgramUnixSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_ diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc new file mode 100644 index 000000000..b2fa72b5e --- /dev/null +++ b/test/syscalls/linux/socket_unix_dgram_local.cc @@ -0,0 +1,59 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_non_stream.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_unix_dgram.h" +#include "test/syscalls/linux/socket_unix_non_stream.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, DgramUnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixNonStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc new file mode 100644 index 000000000..9152c229c --- /dev/null +++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc @@ -0,0 +1,68 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of connected non-blocking dgram +// unix sockets. +using NonBlockingDgramUnixSocketPairTest = SocketPairTest; + +TEST_P(NonBlockingDgramUnixSocketPairTest, ReadOneSideClosed) { + if (IsRunningOnGvisor()) { + // FIXME: gVisor datagram sockets return 0 instead of + // EAGAIN. + return; + } + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + char data[10] = {}; + ASSERT_THAT(read(sockets->second_fd(), data, sizeof(data)), + SyscallFailsWithErrno(EAGAIN)); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonBlockingDgramUnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc new file mode 100644 index 000000000..f8f0d01eb --- /dev/null +++ b/test/syscalls/linux/socket_unix_domain.cc @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_generic.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, AllSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_filesystem.cc b/test/syscalls/linux/socket_unix_filesystem.cc new file mode 100644 index 000000000..be873edcb --- /dev/null +++ b/test/syscalls/linux/socket_unix_filesystem.cc @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_unix.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc new file mode 100644 index 000000000..63e85ac11 --- /dev/null +++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_non_blocking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonBlockingSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc new file mode 100644 index 000000000..620397746 --- /dev/null +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -0,0 +1,229 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
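+//
+// This file exercises behavior shared by SOCK_DGRAM and SOCK_SEQPACKET unix
+// sockets: receives larger than the receive buffer, and sends and receives
+// through user buffers that are heavily fragmented in the sentry's FileMem.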
+ +#include "test/syscalls/linux/socket_unix_non_stream.h" + +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/memory_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(UnixNonStreamSocketPairTest, RecvMsgTooLarge) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int rcvbuf; + socklen_t length = sizeof(rcvbuf); + ASSERT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &rcvbuf, &length), + SyscallSucceeds()); + + // Make the call larger than the receive buffer. + const int recv_size = 3 * rcvbuf; + + // Write a message that does fit in the receive buffer. + const int write_size = rcvbuf - kPageSize; + + std::vector write_buf(write_size, 'a'); + const int ret = RetryEINTR(write)(sockets->second_fd(), write_buf.data(), + write_buf.size()); + if (ret < 0 && errno == ENOBUFS) { + // NOTE: Linux may stall the write for a long time and + // ultimately return ENOBUFS. Allow this error, since a retry will likely + // result in the same error. + return; + } + ASSERT_THAT(ret, SyscallSucceeds()); + + std::vector recv_buf(recv_size); + + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sockets->first_fd(), recv_buf.data(), + recv_buf.size(), write_size)); + + recv_buf.resize(write_size); + EXPECT_EQ(recv_buf, write_buf); +} + +// Create a region of anonymous memory of size 'size', which is fragmented in +// FileMem. +// +// ptr contains the start address of the region. The returned vector contains +// all of the mappings to be unmapped when done. +PosixErrorOr> CreateFragmentedRegion(const int size, + void** ptr) { + Mapping region; + ASSIGN_OR_RETURN_ERRNO(region, Mmap(nullptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)); + + *ptr = region.ptr(); + + // Don't save hundreds of times for all of these mmaps. + DisableSave ds; + + std::vector pages; + + // Map and commit a single page at a time, mapping and committing an unrelated + // page between each call to force FileMem fragmentation. + for (uintptr_t addr = region.addr(); addr < region.endaddr(); + addr += kPageSize) { + Mapping page; + ASSIGN_OR_RETURN_ERRNO( + page, + Mmap(reinterpret_cast(addr), kPageSize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0)); + *reinterpret_cast(page.ptr()) = 42; + + pages.emplace_back(std::move(page)); + + // Unrelated page elsewhere. + ASSIGN_OR_RETURN_ERRNO(page, + Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)); + *reinterpret_cast(page.ptr()) = 42; + + pages.emplace_back(std::move(page)); + } + + // The mappings above have taken ownership of the region. + region.release(); + + return pages; +} + +// A contiguous iov that is heavily fragmented in FileMem can still be sent +// successfully. +TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + const int buffer_size = UIO_MAXIOV * kPageSize; + // Extra page for message header overhead. + const int sndbuf = buffer_size + kPageSize; + // N.B. setsockopt(SO_SNDBUF) doubles the passed value. 
+ const int set_sndbuf = sndbuf / 2; + + EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, + &set_sndbuf, sizeof(set_sndbuf)), + SyscallSucceeds()); + + int actual_sndbuf = 0; + socklen_t length = sizeof(actual_sndbuf); + ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, + &actual_sndbuf, &length), + SyscallSucceeds()); + + if (actual_sndbuf != sndbuf) { + // Unable to get the sndbuf we want. + // + // N.B. At minimum, the socketpair gofer should provide a socket that is + // already the correct size. + // + // TODO: When internal UDS support SO_SNDBUF, we can assert that + // we always get the right SO_SNDBUF on gVisor. + LOG(INFO) << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf + << ". Skipping test"; + return; + } + + // Create a contiguous region of memory of 2*UIO_MAXIOV*PAGE_SIZE. We'll call + // sendmsg with a single iov, but the goal is to get the sentry to split this + // into > UIO_MAXIOV iovs when calling the kernel. + void* ptr; + std::vector pages = + ASSERT_NO_ERRNO_AND_VALUE(CreateFragmentedRegion(buffer_size, &ptr)); + + struct iovec iov = {}; + iov.iov_base = ptr; + iov.iov_len = buffer_size; + + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + // NOTE: Linux has poor behavior in the presence of + // physical memory fragmentation. As a result, this may stall for a long time + // and ultimately return ENOBUFS. Allow this error, since it means that we + // made it to the host kernel and started the sendmsg. + EXPECT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0), + AnyOf(SyscallSucceedsWithValue(buffer_size), + SyscallFailsWithErrno(ENOBUFS))); +} + +// A contiguous iov that is heavily fragmented in FileMem can still be received +// into successfully. +TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + const int buffer_size = UIO_MAXIOV * kPageSize; + // Extra page for message header overhead. + const int sndbuf = buffer_size + kPageSize; + // N.B. setsockopt(SO_SNDBUF) doubles the passed value. + const int set_sndbuf = sndbuf / 2; + + EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, + &set_sndbuf, sizeof(set_sndbuf)), + SyscallSucceeds()); + + int actual_sndbuf = 0; + socklen_t length = sizeof(actual_sndbuf); + ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, + &actual_sndbuf, &length), + SyscallSucceeds()); + + if (actual_sndbuf != sndbuf) { + // Unable to get the sndbuf we want. + // + // N.B. At minimum, the socketpair gofer should provide a socket that is + // already the correct size. + // + // TODO: When internal UDS support SO_SNDBUF, we can assert that + // we always get the right SO_SNDBUF on gVisor. + LOG(INFO) << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf + << ". Skipping test"; + return; + } + + std::vector write_buf(buffer_size, 'a'); + const int ret = RetryEINTR(write)(sockets->first_fd(), write_buf.data(), + write_buf.size()); + if (ret < 0 && errno == ENOBUFS) { + // NOTE: Linux may stall the write for a long time and + // ultimately return ENOBUFS. Allow this error, since a retry will likely + // result in the same error. + return; + } + ASSERT_THAT(ret, SyscallSucceeds()); + + // Create a contiguous region of memory of 2*UIO_MAXIOV*PAGE_SIZE. We'll call + // sendmsg with a single iov, but the goal is to get the sentry to split this + // into > UIO_MAXIOV iovs when calling the kernel. 
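+  // UIO_MAXIOV is 1024 on Linux. Because CreateFragmentedRegion interleaves
+  // the committed pages with unrelated mappings, the contiguous user buffer
+  // is likely backed by many discontiguous FileMem extents, more than a
+  // single host readv/writev call is able to describe.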
+ void* ptr; + std::vector pages = + ASSERT_NO_ERRNO_AND_VALUE(CreateFragmentedRegion(buffer_size, &ptr)); + + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg( + sockets->second_fd(), reinterpret_cast(ptr), buffer_size)); + + EXPECT_EQ(0, memcmp(write_buf.data(), ptr, buffer_size)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_non_stream.h b/test/syscalls/linux/socket_unix_non_stream.h new file mode 100644 index 000000000..e4214d949 --- /dev/null +++ b/test/syscalls/linux/socket_unix_non_stream.h @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected non-stream +// unix-domain sockets. +using UnixNonStreamSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_ diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc new file mode 100644 index 000000000..c5d525dde --- /dev/null +++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc @@ -0,0 +1,47 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/syscalls/linux/socket_non_stream_blocking.h" + +#include + +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, BlockingNonStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc new file mode 100644 index 000000000..85dd3711b --- /dev/null +++ b/test/syscalls/linux/socket_unix_pair.cc @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_unix.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc new file mode 100644 index 000000000..6a40fe68c --- /dev/null +++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "test/syscalls/linux/socket_non_blocking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonBlockingSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc new file mode 100644 index 000000000..ad0af77e9 --- /dev/null +++ b/test/syscalls/linux/socket_unix_seqpacket.cc @@ -0,0 +1,49 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_unix_seqpacket.h" + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST_P(SeqpacketUnixSocketPairTest, WriteOneSideClosed) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + constexpr char kStr[] = "abc"; + ASSERT_THAT(write(sockets->second_fd(), kStr, 3), + SyscallFailsWithErrno(EPIPE)); +} + +TEST_P(SeqpacketUnixSocketPairTest, ReadOneSideClosed) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + char data[10] = {}; + ASSERT_THAT(read(sockets->second_fd(), data, sizeof(data)), + SyscallSucceedsWithValue(0)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_seqpacket.h b/test/syscalls/linux/socket_unix_seqpacket.h new file mode 100644 index 000000000..da8eb2b2b --- /dev/null +++ b/test/syscalls/linux/socket_unix_seqpacket.h @@ -0,0 +1,30 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected seqpacket unix +// sockets. +using SeqpacketUnixSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_ diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc new file mode 100644 index 000000000..f9139a754 --- /dev/null +++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc @@ -0,0 +1,59 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_non_stream.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_unix_non_stream.h" +#include "test/syscalls/linux/socket_unix_seqpacket.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat(VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, SeqpacketUnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixNonStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc new file mode 100644 index 000000000..8232c9e35 --- /dev/null +++ b/test/syscalls/linux/socket_unix_stream.cc @@ -0,0 +1,69 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of connected stream unix sockets. +using StreamUnixSocketPairTest = SocketPairTest; + +TEST_P(StreamUnixSocketPairTest, WriteOneSideClosed) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + constexpr char kStr[] = "abc"; + ASSERT_THAT(write(sockets->second_fd(), kStr, 3), + SyscallFailsWithErrno(EPIPE)); +} + +TEST_P(StreamUnixSocketPairTest, ReadOneSideClosed) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + char data[10] = {}; + ASSERT_THAT(read(sockets->second_fd(), data, sizeof(data)), + SyscallSucceedsWithValue(0)); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, StreamUnixSocketPairTest, + ::testing::ValuesIn(IncludeReversals(VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc new file mode 100644 index 000000000..1cdeadd27 --- /dev/null +++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc @@ -0,0 +1,47 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/syscalls/linux/socket_stream_blocking.h" + +#include + +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, BlockingStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc new file mode 100644 index 000000000..9f11e2d49 --- /dev/null +++ b/test/syscalls/linux/socket_unix_stream_local.cc @@ -0,0 +1,49 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/socket_stream.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, StreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc new file mode 100644 index 000000000..4c3d3a187 --- /dev/null +++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc @@ -0,0 +1,49 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "test/syscalls/linux/socket_stream_nonblock.h" + +#include + +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, NonBlockingStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc new file mode 100644 index 000000000..a35b3b9bd --- /dev/null +++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc @@ -0,0 +1,116 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of unbound abstract unix sockets. 
+using UnboundAbstractUnixSocketPairTest = SocketPairTest; + +TEST_P(UnboundAbstractUnixSocketPairTest, AddressAfterNull) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct sockaddr_un addr = + *reinterpret_cast(sockets->first_addr()); + ASSERT_EQ(addr.sun_path[sizeof(addr.sun_path) - 1], 0); + SKIP_IF(addr.sun_path[sizeof(addr.sun_path) - 2] != 0 || + addr.sun_path[sizeof(addr.sun_path) - 3] != 0); + + addr.sun_path[sizeof(addr.sun_path) - 2] = 'a'; + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->second_fd(), + reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); +} + +TEST_P(UnboundAbstractUnixSocketPairTest, ShortAddressNotExtended) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct sockaddr_un addr = + *reinterpret_cast(sockets->first_addr()); + ASSERT_EQ(addr.sun_path[sizeof(addr.sun_path) - 1], 0); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size() - 1), + SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); +} + +TEST_P(UnboundAbstractUnixSocketPairTest, BindNothing) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + struct sockaddr_un addr = {.sun_family = AF_UNIX}; + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); +} + +TEST_P(UnboundAbstractUnixSocketPairTest, GetSockNameFullLength) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + sockaddr_storage addr = {}; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&addr), &addr_len), + SyscallSucceeds()); + EXPECT_EQ(addr_len, sockets->first_addr_size()); +} + +TEST_P(UnboundAbstractUnixSocketPairTest, GetSockNamePartialLength) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size() - 1), + SyscallSucceeds()); + + sockaddr_storage addr = {}; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&addr), &addr_len), + SyscallSucceeds()); + EXPECT_EQ(addr_len, sockets->first_addr_size() - 1); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnboundAbstractUnixSocketPairTest, + ::testing::ValuesIn(ApplyVec( + AbstractUnboundUnixDomainSocketPair, + AllBitwiseCombinations( + List{SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}, + List{0, SOCK_NONBLOCK}, List{0, SOCK_CLOEXEC})))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc new file mode 100644 index 000000000..a01b7f644 --- /dev/null +++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc @@ -0,0 +1,162 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of unbound dgram unix sockets. +using UnboundDgramUnixSocketPairTest = SocketPairTest; + +TEST_P(UnboundDgramUnixSocketPairTest, BindConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); +} + +TEST_P(UnboundDgramUnixSocketPairTest, SelfConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(connect(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); +} + +TEST_P(UnboundDgramUnixSocketPairTest, DoubleConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); +} + +TEST_P(UnboundDgramUnixSocketPairTest, GetRemoteAddress) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + socklen_t addressLength = sockets->first_addr_size(); + struct sockaddr_storage address = {}; + ASSERT_THAT(getpeername(sockets->second_fd(), (struct sockaddr*)(&address), + &addressLength), + SyscallSucceeds()); + EXPECT_EQ( + 0, memcmp(&address, sockets->first_addr(), sockets->first_addr_size())); +} + +TEST_P(UnboundDgramUnixSocketPairTest, Sendto) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(sendto(sockets->second_fd(), sent_data, sizeof(sent_data), 0, + sockets->first_addr(), sockets->first_addr_size()), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data)]; + ASSERT_THAT(ReadFd(sockets->first_fd(), received_data, sizeof(received_data)), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data))); +} + +TEST_P(UnboundDgramUnixSocketPairTest, ZeroWriteAllowed) { + auto sockets = 
ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + char sent_data[3]; + // Send a zero length packet. + ASSERT_THAT(write(sockets->second_fd(), sent_data, 0), + SyscallSucceedsWithValue(0)); + // Receive the packet. + char received_data[sizeof(sent_data)]; + ASSERT_THAT(read(sockets->first_fd(), received_data, sizeof(received_data)), + SyscallSucceedsWithValue(0)); +} + +TEST_P(UnboundDgramUnixSocketPairTest, Listen) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(listen(sockets->first_fd(), 0), SyscallFailsWithErrno(ENOTSUP)); +} + +TEST_P(UnboundDgramUnixSocketPairTest, Accept) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(accept(sockets->first_fd(), nullptr, nullptr), + SyscallFailsWithErrno(ENOTSUP)); +} + +TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + char data = 'a'; + ASSERT_THAT( + RetryEINTR(sendto)(sockets->second_fd(), &data, sizeof(data), 0, + sockets->first_addr(), sockets->first_addr_size()), + SyscallSucceedsWithValue(sizeof(data))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnboundDgramUnixSocketPairTest, + ::testing::ValuesIn(VecCat( + ApplyVec( + FilesystemUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_DGRAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC}))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc new file mode 100644 index 000000000..56d882643 --- /dev/null +++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc @@ -0,0 +1,84 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of unbound filesystem unix +// sockets. 
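+//
+// Unlike abstract names, a filesystem sun_path is a NUL-terminated string
+// bound to a socket inode, so an address that differs only after the
+// terminating NUL refers to the same node (AddressAfterNull below expects
+// EADDRINUSE), and getsockname reports strlen(sun_path) + 1 plus the size of
+// the address family field.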
+using UnboundFilesystemUnixSocketPairTest = SocketPairTest; + +TEST_P(UnboundFilesystemUnixSocketPairTest, AddressAfterNull) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct sockaddr_un addr = + *reinterpret_cast(sockets->first_addr()); + ASSERT_EQ(addr.sun_path[sizeof(addr.sun_path) - 1], 0); + SKIP_IF(addr.sun_path[sizeof(addr.sun_path) - 2] != 0 || + addr.sun_path[sizeof(addr.sun_path) - 3] != 0); + + addr.sun_path[sizeof(addr.sun_path) - 2] = 'a'; + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + ASSERT_THAT(bind(sockets->second_fd(), + reinterpret_cast(&addr), sizeof(addr)), + SyscallFailsWithErrno(EADDRINUSE)); +} + +TEST_P(UnboundFilesystemUnixSocketPairTest, GetSockNameLength) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + sockaddr_storage got_addr = {}; + socklen_t got_addr_len = sizeof(got_addr); + ASSERT_THAT( + getsockname(sockets->first_fd(), + reinterpret_cast(&got_addr), &got_addr_len), + SyscallSucceeds()); + + sockaddr_un want_addr = + *reinterpret_cast(sockets->first_addr()); + + EXPECT_EQ(got_addr_len, + strlen(want_addr.sun_path) + 1 + sizeof(want_addr.sun_family)); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnboundFilesystemUnixSocketPairTest, + ::testing::ValuesIn(ApplyVec( + FilesystemUnboundUnixDomainSocketPair, + AllBitwiseCombinations( + List{SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}, + List{0, SOCK_NONBLOCK}, List{0, SOCK_CLOEXEC})))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc new file mode 100644 index 000000000..fa3b99490 --- /dev/null +++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc @@ -0,0 +1,91 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of unbound seqpacket unix sockets. 
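+//
+// SOCK_SEQPACKET is connection-oriented, so unlike the SOCK_DGRAM cases
+// above, sendto(2) on a bound but unconnected socket fails with ENOTCONN and
+// any destination address passed to it is ignored, as the two
+// SendtoWithoutConnect tests below verify.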
+using UnboundUnixSeqpacketSocketPairTest = SocketPairTest; + +TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + char data = 'a'; + ASSERT_THAT(sendto(sockets->second_fd(), &data, sizeof(data), 0, + sockets->first_addr(), sockets->first_addr_size()), + SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnectIgnoresAddr) { + // FIXME: gVisor tries to find /foo/bar and thus returns ENOENT. + if (IsRunningOnGvisor()) { + return; + } + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + // Even a bogus address is completely ignored. + constexpr char kPath[] = "/foo/bar"; + + // Sanity check that kPath doesn't exist. + struct stat s; + ASSERT_THAT(stat(kPath, &s), SyscallFailsWithErrno(ENOENT)); + + struct sockaddr_un addr = {}; + addr.sun_family = AF_UNIX; + memcpy(addr.sun_path, kPath, sizeof(kPath)); + + char data = 'a'; + ASSERT_THAT( + sendto(sockets->second_fd(), &data, sizeof(data), 0, + reinterpret_cast(&addr), sizeof(addr)), + SyscallFailsWithErrno(ENOTCONN)); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnboundUnixSeqpacketSocketPairTest, + ::testing::ValuesIn(IncludeReversals(VecCat( + ApplyVec( + FilesystemUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_SEQPACKET}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc new file mode 100644 index 000000000..99636b221 --- /dev/null +++ b/test/syscalls/linux/socket_unix_unbound_stream.cc @@ -0,0 +1,738 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Test fixture for tests that apply to pairs of connected unix stream sockets. +using UnixStreamSocketPairTest = SocketPairTest; + +// FDPassPartialRead checks that sent control messages cannot be read after +// any of their assocated data has been read while ignoring the control message +// by using read(2) instead of recvmsg(2). 
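+//
+// In other words, the passed descriptors attach to a particular segment of
+// the stream; consuming part of that segment with plain read(2) is expected
+// to discard the queued descriptors, so the follow-up receive sees no
+// control message at all.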
+TEST_P(UnixStreamSocketPairTest, FDPassPartialRead) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data))); + + char received_data[sizeof(sent_data) / 2]; + ASSERT_THAT( + ReadFd(sockets->second_fd(), received_data, sizeof(received_data)), + SyscallSucceedsWithValue(sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data))); + + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)); + EXPECT_EQ(0, memcmp(sent_data + sizeof(received_data), received_data, + sizeof(received_data))); +} + +TEST_P(UnixStreamSocketPairTest, FDPassCoalescedRead) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + auto pair1 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(), + sent_data1, sizeof(sent_data1))); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + auto pair2 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(), + sent_data2, sizeof(sent_data2))); + + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + ASSERT_THAT( + ReadFd(sockets->second_fd(), received_data, sizeof(received_data)), + SyscallSucceedsWithValue(sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1), + sizeof(sent_data2))); +} + +// ZeroLengthMessageFDDiscarded checks that control messages associated with +// zero length messages are discarded. +TEST_P(UnixStreamSocketPairTest, ZeroLengthMessageFDDiscarded) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Zero length arrays are invalid in ISO C++, so allocate one of size 1 and + // send a length of 0. + char sent_data1[1] = {}; + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE( + SendSingleFD(sockets->first_fd(), pair->second_fd(), sent_data1, 0)); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + char received_data[sizeof(sent_data2)] = {}; + + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)); + EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(received_data))); +} + +// FDPassCoalescedRecv checks that control messages not in the first message are +// preserved in a coalesced recv. 
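+//
+// Here the first half of the payload is written with no ancillary data and
+// the second half carries a descriptor; a single receive spanning both
+// writes should still return the full payload together with the passed
+// descriptor.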
+TEST_P(UnixStreamSocketPairTest, FDPassCoalescedRecv) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data) / 2), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data + sizeof(sent_data) / 2, + sizeof(sent_data) / 2)); + + char received_data[sizeof(sent_data)]; + + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data, + sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd())); +} + +// ReadsNotCoalescedAfterFDPass checks that messages after a message containing +// an FD control message are not coalesced. +TEST_P(UnixStreamSocketPairTest, ReadsNotCoalescedAfterFDPass) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(), + sent_data, sizeof(sent_data) / 2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data + sizeof(sent_data) / 2, + sizeof(sent_data) / 2), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + + char received_data[sizeof(sent_data)]; + + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data, + sizeof(received_data), + sizeof(sent_data) / 2)); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2)); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd())); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(sent_data) / 2)); + + EXPECT_EQ(0, memcmp(sent_data + sizeof(sent_data) / 2, received_data, + sizeof(sent_data) / 2)); +} + +// FDPassNotCombined checks that FD control messages are not combined in a +// coalesced read. 
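Several of these tests hand the received descriptor to TransferTest together with the other end of the donor socket pair. The helper's implementation is not part of this chunk; it is assumed to prove that the passed descriptor is a live peer by moving data across it, roughly like the hypothetical sketch below.

#include <unistd.h>

#include "gtest/gtest.h"

// Rough, assumed stand-in for TransferTest: write a byte into one end of a
// connected pair and read it back from the other, demonstrating that both
// descriptors refer to the same live connection.
static void RoundTripByte(int fd_a, int fd_b) {
  char out = 'x';
  ASSERT_EQ(1, write(fd_a, &out, 1));
  char in = 0;
  ASSERT_EQ(1, read(fd_b, &in, 1));
  EXPECT_EQ(out, in);
}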
+TEST_P(UnixStreamSocketPairTest, FDPassNotCombined) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + auto pair1 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(), + sent_data, sizeof(sent_data) / 2)); + + auto pair2 = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(), + sent_data + sizeof(sent_data) / 2, + sizeof(sent_data) / 2)); + + char received_data[sizeof(sent_data)]; + + int fd = -1; + ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data, + sizeof(received_data), + sizeof(sent_data) / 2)); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2)); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair1->first_fd())); + + EXPECT_THAT(close(fd), SyscallSucceeds()); + fd = -1; + + ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data, + sizeof(received_data), + sizeof(sent_data) / 2)); + + EXPECT_EQ(0, memcmp(sent_data + sizeof(sent_data) / 2, received_data, + sizeof(sent_data) / 2)); + + ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair2->first_fd())); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST_P(UnixStreamSocketPairTest, CredPassPartialRead) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + struct ucred sent_creds; + + ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds()); + + ASSERT_NO_FATAL_FAILURE( + SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data))); + + int one = 1; + ASSERT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &one, + sizeof(one)), + SyscallSucceeds()); + + for (int i = 0; i < 2; i++) { + char received_data[10]; + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data), + sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data + i * sizeof(received_data), received_data, + sizeof(received_data))); + EXPECT_EQ(sent_creds.pid, received_creds.pid); + EXPECT_EQ(sent_creds.uid, received_creds.uid); + EXPECT_EQ(sent_creds.gid, received_creds.gid); + } +} + +// Unix stream sockets peek in the same way as datagram sockets. +// +// SinglePeek checks that only a single message is peekable in a single recv. +TEST_P(UnixStreamSocketPairTest, SinglePeek) { + if (!IsRunningOnGvisor()) { + // Don't run this test on linux kernels newer than 4.3.x Linux kernel commit + // 9f389e35674f5b086edd70ed524ca0f287259725 which changes this behavior. We + // used to target 3.11 compatibility, so disable this test on newer kernels. + // + // NOTE: Bring this up to Linux 4.4 compatibility. 
+ auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); + SKIP_IF(version.major > 4 || (version.major == 4 && version.minor >= 3)); + } + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + char sent_data[40]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), sent_data, + sizeof(sent_data) / 2, 0), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data + sizeof(sent_data) / 2, + sizeof(sent_data) / 2, 0), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + char received_data[sizeof(sent_data)]; + for (int i = 0; i < 3; i++) { + memset(received_data, 0, sizeof(received_data)); + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_PEEK), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2)); + } + memset(received_data, 0, sizeof(received_data)); + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(sent_data) / 2, 0), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2)); + memset(received_data, 0, sizeof(received_data)); + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(sent_data) / 2, 0), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + EXPECT_EQ(0, memcmp(sent_data + sizeof(sent_data) / 2, received_data, + sizeof(sent_data) / 2)); +} + +TEST_P(UnixStreamSocketPairTest, CredsNotCoalescedUp) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + SetSoPassCred(sockets->second_fd()); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + + struct ucred received_creds; + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data), + sizeof(sent_data1))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + + struct ucred want_creds { + 0, 65534, 65534 + }; + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data), + sizeof(sent_data2))); + + EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(sent_data2))); + + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixStreamSocketPairTest, CredsNotCoalescedDown) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + SetSoPassCred(sockets->second_fd()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + 
UnsetSoPassCred(sockets->second_fd()); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + struct ucred received_creds; + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data), + sizeof(sent_data1))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + + struct ucred want_creds; + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data), + sizeof(sent_data2))); + + EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(sent_data2))); + + want_creds = {0, 65534, 65534}; + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixStreamSocketPairTest, CoalescedCredsNoPasscred) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + SetSoPassCred(sockets->second_fd()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + UnsetSoPassCred(sockets->second_fd()); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1), + sizeof(sent_data2))); +} + +TEST_P(UnixStreamSocketPairTest, CoalescedCreds1) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + SetSoPassCred(sockets->second_fd()); + + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + struct ucred received_creds; + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1), + sizeof(sent_data2))); + + struct ucred want_creds { + 0, 65534, 65534 + }; + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixStreamSocketPairTest, CoalescedCreds2) { + auto sockets = 
ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + SetSoPassCred(sockets->second_fd()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + char received_data[sizeof(sent_data1) + sizeof(sent_data2)]; + struct ucred received_creds; + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds, + received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1), + sizeof(sent_data2))); + + struct ucred want_creds; + ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds.pid, received_creds.pid); + EXPECT_EQ(want_creds.uid, received_creds.uid); + EXPECT_EQ(want_creds.gid, received_creds.gid); +} + +TEST_P(UnixStreamSocketPairTest, NonCoalescedDifferingCreds1) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + SetSoPassCred(sockets->second_fd()); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + char received_data1[sizeof(sent_data1) + sizeof(sent_data2)]; + struct ucred received_creds1; + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds1, + received_data1, sizeof(sent_data1))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1))); + + struct ucred want_creds1 { + 0, 65534, 65534 + }; + + EXPECT_EQ(want_creds1.pid, received_creds1.pid); + EXPECT_EQ(want_creds1.uid, received_creds1.uid); + EXPECT_EQ(want_creds1.gid, received_creds1.gid); + + char received_data2[sizeof(sent_data1) + sizeof(sent_data2)]; + struct ucred received_creds2; + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds2, + received_data2, sizeof(sent_data2))); + + EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2))); + + struct ucred want_creds2; + ASSERT_THAT(want_creds2.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds2.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds2.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds2.pid, received_creds2.pid); + EXPECT_EQ(want_creds2.uid, received_creds2.uid); + EXPECT_EQ(want_creds2.gid, received_creds2.gid); +} + +TEST_P(UnixStreamSocketPairTest, NonCoalescedDifferingCreds2) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + SetSoPassCred(sockets->second_fd()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + UnsetSoPassCred(sockets->second_fd()); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + 
SyscallSucceedsWithValue(sizeof(sent_data2))); + + SetSoPassCred(sockets->second_fd()); + + char received_data1[sizeof(sent_data1) + sizeof(sent_data2)]; + struct ucred received_creds1; + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds1, + received_data1, sizeof(sent_data1))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1))); + + struct ucred want_creds1; + ASSERT_THAT(want_creds1.pid = getpid(), SyscallSucceeds()); + ASSERT_THAT(want_creds1.uid = getuid(), SyscallSucceeds()); + ASSERT_THAT(want_creds1.gid = getgid(), SyscallSucceeds()); + + EXPECT_EQ(want_creds1.pid, received_creds1.pid); + EXPECT_EQ(want_creds1.uid, received_creds1.uid); + EXPECT_EQ(want_creds1.gid, received_creds1.gid); + + char received_data2[sizeof(sent_data1) + sizeof(sent_data2)]; + struct ucred received_creds2; + + ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds2, + received_data2, sizeof(sent_data2))); + + EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2))); + + struct ucred want_creds2 { + 0, 65534, 65534 + }; + + EXPECT_EQ(want_creds2.pid, received_creds2.pid); + EXPECT_EQ(want_creds2.uid, received_creds2.uid); + EXPECT_EQ(want_creds2.gid, received_creds2.gid); +} + +TEST_P(UnixStreamSocketPairTest, CoalescedDifferingCreds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + SetSoPassCred(sockets->second_fd()); + + char sent_data1[20]; + RandomizeBuffer(sent_data1, sizeof(sent_data1)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)), + SyscallSucceedsWithValue(sizeof(sent_data1))); + + char sent_data2[20]; + RandomizeBuffer(sent_data2, sizeof(sent_data2)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)), + SyscallSucceedsWithValue(sizeof(sent_data2))); + + UnsetSoPassCred(sockets->second_fd()); + + char sent_data3[20]; + RandomizeBuffer(sent_data3, sizeof(sent_data3)); + + ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data3, sizeof(sent_data3)), + SyscallSucceedsWithValue(sizeof(sent_data3))); + + char received_data[sizeof(sent_data1) + sizeof(sent_data2) + + sizeof(sent_data3)]; + + ASSERT_NO_FATAL_FAILURE( + RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data))); + + EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); + EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1), + sizeof(sent_data2))); + EXPECT_EQ(0, memcmp(sent_data3, + received_data + sizeof(sent_data1) + sizeof(sent_data2), + sizeof(sent_data3))); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnixStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(VecCat( + ApplyVec( + UnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + FilesystemBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractBoundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))))); + +// Test fixture for tests that apply to pairs of unbound unix stream sockets. 
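Before the unbound-stream fixture declared just below, a note on the credential tests above: SetSoPassCred/RecvCreds are assumed to reduce to standard SO_PASSCRED plus SCM_CREDENTIALS handling, which also explains the {0, 65534, 65534} values expected when SO_PASSCRED was not yet enabled at send time (pid 0 and what is typically the overflow uid/gid). A minimal sketch under that assumption, with RecvWithCreds as a hypothetical name:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // struct ucred and SCM_CREDENTIALS need this on glibc.
#endif
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

// Enables SO_PASSCRED on sock, receives one message, and copies out the
// sender credentials attached as an SCM_CREDENTIALS control message.
static ssize_t RecvWithCreds(int sock, void* buf, size_t len,
                             struct ucred* creds) {
  int one = 1;
  if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) != 0) {
    return -1;
  }

  struct iovec iov = {buf, len};
  char control[CMSG_SPACE(sizeof(struct ucred))] = {};
  struct msghdr msg = {};
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = control;
  msg.msg_controllen = sizeof(control);

  const ssize_t n = recvmsg(sock, &msg, 0);
  if (n < 0) return n;

  for (struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); cmsg != nullptr;
       cmsg = CMSG_NXTHDR(&msg, cmsg)) {
    if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
      memcpy(creds, CMSG_DATA(cmsg), sizeof(struct ucred));
    }
  }
  return n;
}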
+using UnboundUnixStreamSocketPairTest = SocketPairTest; + +TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + char data = 'a'; + ASSERT_THAT(sendto(sockets->second_fd(), &data, sizeof(data), 0, + sockets->first_addr(), sockets->first_addr_size()), + SyscallFailsWithErrno(EOPNOTSUPP)); +} + +TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnectIgnoresAddr) { + // FIXME: gVisor tries to find /foo/bar and thus returns ENOENT. + if (IsRunningOnGvisor()) { + return; + } + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(), + sockets->first_addr_size()), + SyscallSucceeds()); + + // Even a bogus address is completely ignored. + constexpr char kPath[] = "/foo/bar"; + + // Sanity check that kPath doesn't exist. + struct stat s; + ASSERT_THAT(stat(kPath, &s), SyscallFailsWithErrno(ENOENT)); + + struct sockaddr_un addr = {}; + addr.sun_family = AF_UNIX; + memcpy(addr.sun_path, kPath, sizeof(kPath)); + + char data = 'a'; + ASSERT_THAT( + sendto(sockets->second_fd(), &data, sizeof(data), 0, + reinterpret_cast(&addr), sizeof(addr)), + SyscallFailsWithErrno(EOPNOTSUPP)); +} + +INSTANTIATE_TEST_CASE_P( + AllUnixDomainSockets, UnboundUnixStreamSocketPairTest, + ::testing::ValuesIn(IncludeReversals(VecCat( + ApplyVec( + FilesystemUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})), + ApplyVec( + AbstractUnboundUnixDomainSocketPair, + AllBitwiseCombinations(List{SOCK_STREAM}, + List{0, SOCK_NONBLOCK}, + List{0, SOCK_CLOEXEC})))))); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc new file mode 100644 index 000000000..aea19dbff --- /dev/null +++ b/test/syscalls/linux/stat.cc @@ -0,0 +1,410 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class StatTest : public FileTest {}; + +TEST_F(StatTest, FstatatAbs) { + struct stat st; + + // Check that the stat works. 
+ EXPECT_THAT(fstatat(AT_FDCWD, test_file_name_.c_str(), &st, 0), + SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(st.st_mode)); +} + +TEST_F(StatTest, FstatatEmptyPath) { + struct stat st; + const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); + + // Check that the stat works. + EXPECT_THAT(fstatat(fd.get(), "", &st, AT_EMPTY_PATH), SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(st.st_mode)); +} + +TEST_F(StatTest, FstatatRel) { + struct stat st; + int dirfd; + auto filename = std::string(Basename(test_file_name_)); + + // Open the temporary directory read-only. + ASSERT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_RDONLY), + SyscallSucceeds()); + + // Check that the stat works. + EXPECT_THAT(fstatat(dirfd, filename.c_str(), &st, 0), SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(st.st_mode)); + close(dirfd); +} + +TEST_F(StatTest, FstatatSymlink) { + struct stat st; + + // Check that the link is followed. + EXPECT_THAT(fstatat(AT_FDCWD, "/proc/self", &st, 0), SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + EXPECT_FALSE(S_ISLNK(st.st_mode)); + + // Check that the flag works. + EXPECT_THAT(fstatat(AT_FDCWD, "/proc/self", &st, AT_SYMLINK_NOFOLLOW), + SyscallSucceeds()); + EXPECT_TRUE(S_ISLNK(st.st_mode)); + EXPECT_FALSE(S_ISDIR(st.st_mode)); +} + +TEST_F(StatTest, Nlinks) { + TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Directory is initially empty, it should contain 2 links (one from itself, + // one from "."). + EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2)); + + // Create a file in the test directory. Files shouldn't increase the link + // count on the base directory. + TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path())); + EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2)); + + // Create subdirectories. This should increase the link count by 1 per + // subdirectory. + TempPath dir1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(basedir.path())); + EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(3)); + TempPath dir2 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(basedir.path())); + EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(4)); + + // Removing directories should reduce the link count. + dir1.reset(); + EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(3)); + dir2.reset(); + EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2)); + + // Removing files should have no effect on link count. + file1.reset(); + EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2)); +} + +TEST_F(StatTest, BlocksIncreaseOnWrite) { + struct stat st; + + // Stat the empty file. + ASSERT_THAT(fstat(test_file_fd_.get(), &st), SyscallSucceeds()); + + const int initial_blocks = st.st_blocks; + + // Write to the file, making sure to exceed the block size. + std::vector buf(2 * st.st_blksize, 'a'); + ASSERT_THAT(write(test_file_fd_.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Stat the file again, and verify that number of allocated blocks has + // increased. + ASSERT_THAT(fstat(test_file_fd_.get(), &st), SyscallSucceeds()); + EXPECT_GT(st.st_blocks, initial_blocks); +} + +TEST_F(StatTest, PathNotCleaned) { + TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Create a file in the basedir. + TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path())); + + // Stating the file directly should succeed. 
+ struct stat buf; + EXPECT_THAT(lstat(file.path().c_str(), &buf), SyscallSucceeds()); + + // Try to stat the file using a directory that does not exist followed by + // "..". If the path is cleaned prior to stating (which it should not be) + // then this will succeed. + const std::string bad_path = JoinPath("/does_not_exist/..", file.path()); + EXPECT_THAT(lstat(bad_path.c_str(), &buf), SyscallFailsWithErrno(ENOENT)); +} + +TEST_F(StatTest, PathCanContainDotDot) { + TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath subdir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(basedir.path())); + const std::string subdir_name = std::string(Basename(subdir.path())); + + // Create a file in the subdir. + TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(subdir.path())); + const std::string file_name = std::string(Basename(file.path())); + + // Stat the file through a path that includes '..' and '.' but still resolves + // to the file. + const std::string good_path = + JoinPath(basedir.path(), subdir_name, "..", subdir_name, ".", file_name); + struct stat buf; + EXPECT_THAT(lstat(good_path.c_str(), &buf), SyscallSucceeds()); +} + +TEST_F(StatTest, PathCanContainEmptyComponent) { + TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Create a file in the basedir. + TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path())); + const std::string file_name = std::string(Basename(file.path())); + + // Stat the file through a path that includes an empty component. We have to + // build this ourselves because JoinPath automatically removes empty + // components. + const std::string good_path = absl::StrCat(basedir.path(), "//", file_name); + struct stat buf; + EXPECT_THAT(lstat(good_path.c_str(), &buf), SyscallSucceeds()); +} + +TEST_F(StatTest, TrailingSlashNotCleanedReturnsENOTDIR) { + TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Create a file in the basedir. + TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path())); + + // Stat the file with an extra "/" on the end of it. Since file is not a + // directory, this should return ENOTDIR. + const std::string bad_path = absl::StrCat(file.path(), "/"); + struct stat buf; + EXPECT_THAT(lstat(bad_path.c_str(), &buf), SyscallFailsWithErrno(ENOTDIR)); +} + +TEST_F(StatTest, LeadingDoubleSlash) { + // Create a file, and make sure we can stat it. + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + struct stat st; + ASSERT_THAT(lstat(file.path().c_str(), &st), SyscallSucceeds()); + + // Now add an extra leading slash. + const std::string double_slash_path = absl::StrCat("/", file.path()); + ASSERT_TRUE(absl::StartsWith(double_slash_path, "//")); + + // We should be able to stat the new path, and it should resolve to the same + // file (same device and inode). + struct stat double_slash_st; + ASSERT_THAT(lstat(double_slash_path.c_str(), &double_slash_st), + SyscallSucceeds()); + EXPECT_EQ(st.st_dev, double_slash_st.st_dev); + EXPECT_EQ(st.st_ino, double_slash_st.st_ino); +} + +// Test that a rename doesn't change the underlying file. 
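Both the LeadingDoubleSlash test above and the rename test below use the (st_dev, st_ino) pair as the identity of a file, which is the conventional way to check that two paths resolve to the same inode. A tiny helper sketch (SameFile is a hypothetical name, not part of this change); EXPECT_TRUE(SameFile(st, double_slash_st)) would express the same check as the two EXPECT_EQ lines above:

#include <sys/stat.h>

// Two stat results describe the same underlying inode exactly when both
// the containing device and the inode number match.
static bool SameFile(const struct stat& a, const struct stat& b) {
  return a.st_dev == b.st_dev && a.st_ino == b.st_ino;
}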
+TEST_F(StatTest, StatDoesntChangeAfterRename) { + const TempPath old_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath new_path(NewTempAbsPath()); + + struct stat st_old = {}; + struct stat st_new = {}; + + ASSERT_THAT(stat(old_dir.path().c_str(), &st_old), SyscallSucceeds()); + ASSERT_THAT(rename(old_dir.path().c_str(), new_path.path().c_str()), + SyscallSucceeds()); + ASSERT_THAT(stat(new_path.path().c_str(), &st_new), SyscallSucceeds()); + + EXPECT_EQ(st_old.st_nlink, st_new.st_nlink); + EXPECT_EQ(st_old.st_dev, st_new.st_dev); + EXPECT_EQ(st_old.st_ino, st_new.st_ino); + EXPECT_EQ(st_old.st_mode, st_new.st_mode); + EXPECT_EQ(st_old.st_uid, st_new.st_uid); + EXPECT_EQ(st_old.st_gid, st_new.st_gid); + EXPECT_EQ(st_old.st_size, st_new.st_size); +} + +// Test link counts with a regular file as the child. +TEST_F(StatTest, LinkCountsWithRegularFileChild) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + struct stat st_parent_before = {}; + ASSERT_THAT(stat(dir.path().c_str(), &st_parent_before), SyscallSucceeds()); + EXPECT_EQ(st_parent_before.st_nlink, 2); + + // Adding a regular file doesn't adjust the parent's link count. + const TempPath child = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + struct stat st_parent_after = {}; + ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds()); + EXPECT_EQ(st_parent_after.st_nlink, 2); + + // The child should have a single link from the parent. + struct stat st_child = {}; + ASSERT_THAT(stat(child.path().c_str(), &st_child), SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(st_child.st_mode)); + EXPECT_EQ(st_child.st_nlink, 1); + + // Finally unlinking the child should not affect the parent's link count. + ASSERT_THAT(unlink(child.path().c_str()), SyscallSucceeds()); + ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds()); + EXPECT_EQ(st_parent_after.st_nlink, 2); +} + +// This test verifies that inodes remain around when there is an open fd +// after link count hits 0. +TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild) { + // Setting the enviornment variable GVISOR_GOFER_UNCACHED to any value + // will prevent this test from running, see the tmpfs lifecycle. + // + // We need to support this because when a file is unlinked and we forward + // the stat to the gofer it would return ENOENT. + const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED"); + SKIP_IF(uncached_gofer != nullptr); + + // We don't support saving unlinked files. + const DisableSave ds; + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + dir.path(), "hello", TempPath::kDefaultFileMode)); + + // The child should have a single link from the parent. + struct stat st_child_before = {}; + ASSERT_THAT(stat(child.path().c_str(), &st_child_before), SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(st_child_before.st_mode)); + EXPECT_EQ(st_child_before.st_nlink, 1); + EXPECT_EQ(st_child_before.st_size, 5); // Hello is 5 bytes. + + // Open the file so we can fstat after unlinking. + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(child.path(), O_RDONLY)); + + // Now a stat should return ENOENT but we should still be able to stat + // via the open fd and fstat. + ASSERT_THAT(unlink(child.path().c_str()), SyscallSucceeds()); + + // Since the file has no more links stat should fail. 
+ struct stat st_child_after = {}; + ASSERT_THAT(stat(child.path().c_str(), &st_child_after), + SyscallFailsWithErrno(ENOENT)); + + // Fstat should still allow us to access the same file via the fd. + struct stat st_child_fd = {}; + ASSERT_THAT(fstat(fd.get(), &st_child_fd), SyscallSucceeds()); + EXPECT_EQ(st_child_before.st_dev, st_child_fd.st_dev); + EXPECT_EQ(st_child_before.st_ino, st_child_fd.st_ino); + EXPECT_EQ(st_child_before.st_mode, st_child_fd.st_mode); + EXPECT_EQ(st_child_before.st_uid, st_child_fd.st_uid); + EXPECT_EQ(st_child_before.st_gid, st_child_fd.st_gid); + EXPECT_EQ(st_child_before.st_size, st_child_fd.st_size); + + // TODO: This isn't ideal but since fstatfs(2) will always return + // OVERLAYFS_SUPER_MAGIC we have no way to know if this fs is backed by a + // gofer which doesn't support links. + EXPECT_TRUE(st_child_fd.st_nlink == 0 || st_child_fd.st_nlink == 1); +} + +// Test link counts with a directory as the child. +TEST_F(StatTest, LinkCountsWithDirChild) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Before a child is added the two links are "." and the link from the parent. + struct stat st_parent_before = {}; + ASSERT_THAT(stat(dir.path().c_str(), &st_parent_before), SyscallSucceeds()); + EXPECT_EQ(st_parent_before.st_nlink, 2); + + // Create a subdirectory and stat for the parent link counts. + const TempPath sub_dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path())); + + // The three links are ".", the link from the parent, and the link from + // the child as "..". + struct stat st_parent_after = {}; + ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds()); + EXPECT_EQ(st_parent_after.st_nlink, 3); + + // The child will have 1 link from the parent and 1 link which represents ".". + struct stat st_child = {}; + ASSERT_THAT(stat(sub_dir.path().c_str(), &st_child), SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st_child.st_mode)); + EXPECT_EQ(st_child.st_nlink, 2); + + // Finally delete the child dir and the parent link count should return to 2. + ASSERT_THAT(rmdir(sub_dir.path().c_str()), SyscallSucceeds()); + ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds()); + + // Now we should only have links from the parent and "." since the subdir + // has been removed. + EXPECT_EQ(st_parent_after.st_nlink, 2); +} + +// Test statting a child of a non-directory. +TEST_F(StatTest, ChildOfNonDir) { + // Create a path that has a child of a regular file. + const std::string filename = JoinPath(test_file_name_, "child"); + + // Statting the path should return ENOTDIR. + struct stat st; + EXPECT_THAT(lstat(filename.c_str(), &st), SyscallFailsWithErrno(ENOTDIR)); +} + +// Verify that we get an ELOOP from too many symbolic links even when there +// are directories in the middle. +TEST_F(StatTest, LstatELOOPPath) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + std::string subdir_base = "subdir"; + ASSERT_THAT(mkdir(JoinPath(dir.path(), subdir_base).c_str(), 0755), + SyscallSucceeds()); + + std::string target = JoinPath(dir.path(), subdir_base, subdir_base); + std::string dst = JoinPath("..", subdir_base); + ASSERT_THAT(symlink(dst.c_str(), target.c_str()), SyscallSucceeds()); + auto cleanup = Cleanup( + [&target]() { EXPECT_THAT(unlink(target.c_str()), SyscallSucceeds()); }); + + // Now build a path which is /subdir/subdir/... repeated many times so that + // we can build a path that is shorter than PATH_MAX but can still cause + // too many symbolic links. 
Note: Every other subdir is actually a directory + // so we're not in a situation where it's a -> b -> a -> b, where a and b + // are symbolic links. + std::string path = dir.path(); + std::string subdir_append = absl::StrCat("/", subdir_base); + do { + absl::StrAppend(&path, subdir_append); + // Keep appending /subdir until we would overflow PATH_MAX. + } while ((path.size() + subdir_append.size()) < PATH_MAX); + + struct stat s = {}; + ASSERT_THAT(lstat(path.c_str(), &s), SyscallFailsWithErrno(ELOOP)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc new file mode 100644 index 000000000..442957c65 --- /dev/null +++ b/test/syscalls/linux/stat_times.cc @@ -0,0 +1,220 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +using ::testing::IsEmpty; +using ::testing::Not; + +class StatTimesTest : public ::testing::Test { + protected: + std::tuple GetTime(const TempPath& file) { + struct stat statbuf = {}; + EXPECT_THAT(stat(file.path().c_str(), &statbuf), SyscallSucceeds()); + + const auto atime = absl::TimeFromTimespec(statbuf.st_atim); + const auto mtime = absl::TimeFromTimespec(statbuf.st_mtim); + const auto ctime = absl::TimeFromTimespec(statbuf.st_ctim); + return std::make_tuple(atime, mtime, ctime); + } +}; + +TEST_F(StatTimesTest, FileCreationTimes) { + const DisableSave ds; // Timing-related test. + + // Get a time for when the file is created. + const absl::Time before = absl::Now() - absl::Seconds(1); + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const absl::Time after = absl::Now() + absl::Seconds(1); + + absl::Time atime, mtime, ctime; + std::tie(atime, mtime, ctime) = GetTime(file); + + EXPECT_LE(before, atime); + EXPECT_LE(before, mtime); + EXPECT_LE(before, ctime); + EXPECT_GE(after, atime); + EXPECT_GE(after, mtime); + EXPECT_GE(after, ctime); +} + +TEST_F(StatTimesTest, FileCtimeChanges) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + MaybeSave(); // FIXME: ctime is inconsistent. + + absl::Time atime, mtime, ctime; + std::tie(atime, mtime, ctime) = GetTime(file); + + absl::SleepFor(absl::Seconds(1)); + + // Chmod should only change ctime. + EXPECT_THAT(chmod(file.path().c_str(), 0666), SyscallSucceeds()); + + absl::Time atime2, mtime2, ctime2; + std::tie(atime2, mtime2, ctime2) = GetTime(file); + EXPECT_EQ(atime2, atime); + EXPECT_EQ(mtime2, mtime); + EXPECT_GT(ctime2, ctime); + + absl::SleepFor(absl::Seconds(1)); + + // Rename should only change ctime. 
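The final step of this test (in the lines that follow) depends on utimensat(2)'s per-field timespec encoding: a tv_nsec of UTIME_NOW sets that timestamp to the current time, UTIME_OMIT leaves it untouched, and ctime is updated as a side effect either way. That is standard Linux behavior rather than anything introduced by this change; a minimal sketch, with TouchAtimeOnly as a hypothetical name:

#include <fcntl.h>
#include <sys/stat.h>
#include <time.h>

// Bumps only the access time of path to "now", leaving mtime alone.
// The tv_sec values are ignored when tv_nsec is UTIME_NOW or UTIME_OMIT.
static int TouchAtimeOnly(const char* path) {
  const struct timespec times[2] = {{0, UTIME_NOW},    // atime -> now
                                    {0, UTIME_OMIT}};  // mtime -> unchanged
  return utimensat(AT_FDCWD, path, times, 0);
}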
+ const auto newpath = NewTempAbsPath(); + EXPECT_THAT(rename(file.path().c_str(), newpath.c_str()), SyscallSucceeds()); + file.reset(newpath); + + std::tie(atime, mtime, ctime) = GetTime(file); + EXPECT_EQ(atime, atime2); + EXPECT_EQ(mtime, mtime2); + EXPECT_GT(ctime, ctime2); + + absl::SleepFor(absl::Seconds(1)); + + // Utimes should only change ctime and the time that we ask to change (atime + // to now in this case). + const absl::Time before = absl::Now() - absl::Seconds(1); + const struct timespec ts[2] = {{0, UTIME_NOW}, {0, UTIME_OMIT}}; + ASSERT_THAT(utimensat(AT_FDCWD, file.path().c_str(), ts, 0), + SyscallSucceeds()); + const absl::Time after = absl::Now() + absl::Seconds(1); + + std::tie(atime2, mtime2, ctime2) = GetTime(file); + EXPECT_LE(before, atime2); + EXPECT_GE(after, atime2); + EXPECT_EQ(mtime2, mtime); + EXPECT_GT(ctime2, ctime); +} + +TEST_F(StatTimesTest, FileMtimeChanges) { + const auto file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "yaaass", 0666)); + + absl::Time atime, mtime, ctime; + std::tie(atime, mtime, ctime) = GetTime(file); + + absl::SleepFor(absl::Seconds(1)); + + // Truncate should only change mtime and ctime. + EXPECT_THAT(truncate(file.path().c_str(), 0), SyscallSucceeds()); + + absl::Time atime2, mtime2, ctime2; + std::tie(atime2, mtime2, ctime2) = GetTime(file); + EXPECT_EQ(atime2, atime); + EXPECT_GT(mtime2, mtime); + EXPECT_GT(ctime2, ctime); + + absl::SleepFor(absl::Seconds(1)); + + // Write should only change mtime and ctime. + const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0)); + const std::string contents = "all the single dollars"; + EXPECT_THAT(write(fd.get(), contents.data(), contents.size()), + SyscallSucceeds()); + + std::tie(atime, mtime, ctime) = GetTime(file); + EXPECT_EQ(atime, atime2); + EXPECT_GT(mtime, mtime2); + EXPECT_GT(ctime, ctime2); +} + +TEST_F(StatTimesTest, FileAtimeChanges) { + const std::string contents = "bills bills bills"; + const auto file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), contents, 0666)); + + MaybeSave(); // FIXME: ctime is inconsistent. + + absl::Time atime, mtime, ctime; + std::tie(atime, mtime, ctime) = GetTime(file); + + absl::SleepFor(absl::Seconds(1)); + + const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY, 0)); + + // Read should only change atime. + char buf[20]; + const absl::Time before = absl::Now() - absl::Seconds(1); + int read_result; + ASSERT_THAT(read_result = read(fd.get(), buf, sizeof(buf)), + SyscallSucceeds()); + const absl::Time after = absl::Now() + absl::Seconds(1); + + EXPECT_EQ(std::string(buf, read_result), contents); + + absl::Time atime2, mtime2, ctime2; + std::tie(atime2, mtime2, ctime2) = GetTime(file); + + EXPECT_LE(before, atime2); + EXPECT_GE(after, atime2); + EXPECT_GT(atime2, atime); + EXPECT_EQ(mtime2, mtime); + EXPECT_EQ(ctime2, ctime); +} + +TEST_F(StatTimesTest, DirAtimeChanges) { + const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const auto file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + MaybeSave(); // FIXME: ctime is inconsistent. + + absl::Time atime, mtime, ctime; + std::tie(atime, mtime, ctime) = GetTime(dir); + + absl::SleepFor(absl::Seconds(1)); + + const absl::Time before = absl::Now() - absl::Seconds(1); + + // NOTE: Keep an fd open. This ensures that the inode backing the + // directory won't be destroyed before the final GetTime to avoid writing out + // timestamps and causing side effects. 
+ const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0)); + + // Listing the directory contents should only change atime. + auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), false)); + EXPECT_THAT(contents, Not(IsEmpty())); + + const absl::Time after = absl::Now() + absl::Seconds(1); + + absl::Time atime2, mtime2, ctime2; + std::tie(atime2, mtime2, ctime2) = GetTime(dir); + + EXPECT_LE(before, atime2); + EXPECT_GE(after, atime2); + EXPECT_GT(atime2, atime); + EXPECT_EQ(mtime2, mtime); + EXPECT_EQ(ctime2, ctime); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc new file mode 100644 index 000000000..1fc9758c9 --- /dev/null +++ b/test/syscalls/linux/statfs.cc @@ -0,0 +1,81 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(StatfsTest, CannotStatBadPath) { + auto temp_file = NewTempAbsPathInDir("/tmp"); + + struct statfs st; + EXPECT_THAT(statfs(temp_file.c_str(), &st), SyscallFailsWithErrno(ENOENT)); +} + +TEST(StatfsTest, InternalTmpfs) { + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + struct statfs st; + EXPECT_THAT(statfs(temp_file.path().c_str(), &st), SyscallSucceeds()); +} + +TEST(StatfsTest, InternalDevShm) { + struct statfs st; + EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds()); +} + +TEST(StatfsTest, NameLen) { + struct statfs st; + EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds()); + + EXPECT_EQ(st.f_namelen, NAME_MAX); +} + +TEST(FstatfsTest, CannotStatBadFd) { + struct statfs st; + EXPECT_THAT(fstatfs(-1, &st), SyscallFailsWithErrno(EBADF)); +} + +TEST(FstatfsTest, InternalTmpfs) { + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY)); + + struct statfs st; + EXPECT_THAT(fstatfs(fd.get(), &st), SyscallSucceeds()); +} + +TEST(FstatfsTest, InternalDevShm) { + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/shm", O_RDONLY)); + + struct statfs st; + EXPECT_THAT(fstatfs(fd.get(), &st), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc new file mode 100644 index 000000000..563763d10 --- /dev/null +++ b/test/syscalls/linux/sticky.cc @@ -0,0 +1,116 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_int32(scratch_uid1, 65534, "first scratch UID"); +DEFINE_int32(scratch_gid, 65534, "first scratch GID"); + +namespace gvisor { +namespace testing { + +namespace { + +TEST(StickyTest, StickyBitPermDenied) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); + std::string path = JoinPath(dir.path(), "NewDir"); + ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds()); + + // Drop privileges and change IDs only in child thread, or else this parent + // thread won't be able to open some log files after the test ends. + ScopedThread([&] { + // Drop privileges. + if (HaveCapability(CAP_FOWNER).ValueOrDie()) { + EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, false)); + } + + // Change EUID and EGID. + EXPECT_THAT(syscall(SYS_setresgid, -1, FLAGS_scratch_gid, -1), + SyscallSucceeds()); + EXPECT_THAT(syscall(SYS_setresuid, -1, FLAGS_scratch_uid1, -1), + SyscallSucceeds()); + + EXPECT_THAT(rmdir(path.c_str()), SyscallFailsWithErrno(EPERM)); + }); +} + +TEST(StickyTest, StickyBitSameUID) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); + std::string path = JoinPath(dir.path(), "NewDir"); + ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds()); + + // Drop privileges and change IDs only in child thread, or else this parent + // thread won't be able to open some log files after the test ends. + ScopedThread([&] { + // Drop privileges. + if (HaveCapability(CAP_FOWNER).ValueOrDie()) { + EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, false)); + } + + // Change EGID. + EXPECT_THAT(syscall(SYS_setresgid, -1, FLAGS_scratch_gid, -1), + SyscallSucceeds()); + + // We still have the same EUID. + EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds()); + }); +} + +TEST(StickyTest, StickyBitCapFOWNER) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); + std::string path = JoinPath(dir.path(), "NewDir"); + ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds()); + + // Drop privileges and change IDs only in child thread, or else this parent + // thread won't be able to open some log files after the test ends. + ScopedThread([&] { + // Set PR_SET_KEEPCAPS. + EXPECT_THAT(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), SyscallSucceeds()); + + // Change EUID and EGID. 
+ EXPECT_THAT(syscall(SYS_setresgid, -1, FLAGS_scratch_gid, -1), + SyscallSucceeds()); + EXPECT_THAT(syscall(SYS_setresuid, -1, FLAGS_scratch_uid1, -1), + SyscallSucceeds()); + + EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, true)); + EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds()); + }); +} +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc new file mode 100644 index 000000000..cfc87bc8f --- /dev/null +++ b/test/syscalls/linux/symlink.cc @@ -0,0 +1,288 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +mode_t FilePermission(const std::string& path) { + struct stat buf = {0}; + TEST_CHECK(lstat(path.c_str(), &buf) == 0); + return buf.st_mode & 0777; +} + +// Test that name collisions are checked on the new link path, not the source +// path. +TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) { + const std::string srcname = NewTempAbsPath(); + const std::string newname = NewTempAbsPath(); + const std::string basedir = std::string(Dirname(srcname)); + ASSERT_EQ(basedir, Dirname(newname)); + + ASSERT_THAT(chdir(basedir.c_str()), SyscallSucceeds()); + + // Open the source node to cause the underlying dirent to be cached. It will + // remain cached while we have the file open. + int fd; + ASSERT_THAT(fd = open(srcname.c_str(), O_CREAT | O_RDWR, 0666), + SyscallSucceeds()); + FileDescriptor fd_closer(fd); + + // Attempt to create a symlink. If the bug exists, this will fail since the + // dirent link creation code will check for a name collision on the source + // link name. 
+ EXPECT_THAT(symlink(std::string(Basename(srcname)).c_str(), + std::string(Basename(newname)).c_str()), + SyscallSucceeds()); +} + +TEST(SymlinkTest, CanCreateSymlinkFile) { + const std::string oldname = NewTempAbsPath(); + const std::string newname = NewTempAbsPath(); + + int fd; + ASSERT_THAT(fd = open(oldname.c_str(), O_CREAT | O_RDWR, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + EXPECT_THAT(symlink(oldname.c_str(), newname.c_str()), SyscallSucceeds()); + EXPECT_EQ(FilePermission(newname), 0777); + + auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink(newname)); + EXPECT_EQ(oldname, link); + + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, CanCreateSymlinkDir) { + const std::string olddir = NewTempAbsPath(); + const std::string newdir = NewTempAbsPath(); + + EXPECT_THAT(mkdir(olddir.c_str(), 0777), SyscallSucceeds()); + EXPECT_THAT(symlink(olddir.c_str(), newdir.c_str()), SyscallSucceeds()); + EXPECT_EQ(FilePermission(newdir), 0777); + + auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink(newdir)); + EXPECT_EQ(olddir, link); + + EXPECT_THAT(unlink(newdir.c_str()), SyscallSucceeds()); + + ASSERT_THAT(rmdir(olddir.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, CannotCreateSymlinkInReadOnlyDir) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + const std::string olddir = NewTempAbsPath(); + ASSERT_THAT(mkdir(olddir.c_str(), 0444), SyscallSucceeds()); + + const std::string newdir = NewTempAbsPathInDir(olddir); + EXPECT_THAT(symlink(olddir.c_str(), newdir.c_str()), + SyscallFailsWithErrno(EACCES)); + + ASSERT_THAT(rmdir(olddir.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, CannotSymlinkOverExistingFile) { + const std::string oldname = NewTempAbsPath(); + const std::string newname = NewTempAbsPath(); + + int oldfd; + int newfd; + ASSERT_THAT(oldfd = open(oldname.c_str(), O_CREAT | O_RDWR, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(oldfd), SyscallSucceeds()); + ASSERT_THAT(newfd = open(newname.c_str(), O_CREAT | O_RDWR, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(newfd), SyscallSucceeds()); + + EXPECT_THAT(symlink(oldname.c_str(), newname.c_str()), + SyscallFailsWithErrno(EEXIST)); + + EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, OldnameIsEmpty) { + const std::string newname = NewTempAbsPath(); + EXPECT_THAT(symlink("", newname.c_str()), SyscallFailsWithErrno(ENOENT)); +} + +TEST(SymlinkTest, OldnameIsDangling) { + const std::string newname = NewTempAbsPath(); + EXPECT_THAT(symlink("/dangling", newname.c_str()), SyscallSucceeds()); + + // This is required for S/R random save tests, which pre-run this test + // in the same TEST_TMPDIR, which means that we need to clean it for any + // operations exclusively creating files, like symlink above. 
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, NewnameCannotExist) { + const std::string newname = + JoinPath(GetAbsoluteTestTmpdir(), "thisdoesnotexist", "foo"); + EXPECT_THAT(symlink("/thisdoesnotmatter", newname.c_str()), + SyscallFailsWithErrno(ENOENT)); +} + +TEST(SymlinkTest, CanEvaluateLink) { + const std::string oldname = NewTempAbsPath(); + const std::string newname = NewTempAbsPath(); + + int fd; + ASSERT_THAT(fd = open(oldname.c_str(), O_CREAT | O_RDWR, 0666), + SyscallSucceeds()); + struct stat old; + EXPECT_THAT(fstat(fd, &old), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + EXPECT_THAT(symlink(oldname.c_str(), newname.c_str()), SyscallSucceeds()); + EXPECT_EQ(FilePermission(newname), 0777); + + EXPECT_THAT(fd = open(newname.c_str(), O_RDWR, 0666), SyscallSucceeds()); + struct stat old_linked; + EXPECT_THAT(fstat(fd, &old_linked), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + // Check that in fact newname points to the file we expect. + // FIXME: use only inodes here once they are consistent, + // but this is better than nothing. + EXPECT_EQ(old.st_dev, old_linked.st_dev); + EXPECT_EQ(old.st_mode, old_linked.st_mode); + EXPECT_EQ(old.st_size, old_linked.st_size); + + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, TargetIsNotMapped) { + const std::string oldname = NewTempAbsPath(); + const std::string newname = NewTempAbsPath(); + + int fd; + // Create the target so that when we read the link, it exists. + ASSERT_THAT(fd = open(oldname.c_str(), O_CREAT | O_RDWR, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + // Create a symlink called newname that points to oldname. + EXPECT_THAT(symlink(oldname.c_str(), newname.c_str()), SyscallSucceeds()); + + std::vector buf(1024); + int linksize; + // Read the link and assert that the oldname is still the same. + EXPECT_THAT(linksize = readlink(newname.c_str(), buf.data(), 1024), + SyscallSucceeds()); + EXPECT_EQ(0, strncmp(oldname.c_str(), buf.data(), linksize)); + + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, PreadFromSymlink) { + std::string name = NewTempAbsPath(); + int fd; + ASSERT_THAT(fd = open(name.c_str(), O_CREAT, 0644), SyscallSucceeds()); + ASSERT_THAT(close(fd), SyscallSucceeds()); + + std::string linkname = NewTempAbsPath(); + ASSERT_THAT(symlink(name.c_str(), linkname.c_str()), SyscallSucceeds()); + + ASSERT_THAT(fd = open(linkname.c_str(), O_RDONLY), SyscallSucceeds()); + + char buf[1024]; + EXPECT_THAT(pread64(fd, buf, 1024, 0), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + + EXPECT_THAT(unlink(name.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(linkname.c_str()), SyscallSucceeds()); +} + +TEST(SymlinkTest, SymlinkAtDegradedPermissions_NoRandomSave) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + int dirfd; + ASSERT_THAT(dirfd = open(dir.path().c_str(), O_DIRECTORY, 0), + SyscallSucceeds()); + + const DisableSave ds; // Permissions are dropped. 
+ EXPECT_THAT(fchmod(dirfd, 0), SyscallSucceeds()); + + std::string basename = std::string(Basename(file.path())); + EXPECT_THAT(symlinkat("/dangling", dirfd, basename.c_str()), + SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(close(dirfd), SyscallSucceeds()); +} + +TEST(SymlinkTest, ReadlinkAtDegradedPermissions_NoRandomSave) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string oldpath = NewTempAbsPathInDir(dir.path()); + const std::string oldbase = std::string(Basename(oldpath)); + ASSERT_THAT(symlink("/dangling", oldpath.c_str()), SyscallSucceeds()); + + int dirfd; + EXPECT_THAT(dirfd = open(dir.path().c_str(), O_DIRECTORY, 0), + SyscallSucceeds()); + + const DisableSave ds; // Permissions are dropped. + EXPECT_THAT(fchmod(dirfd, 0), SyscallSucceeds()); + + char buf[1024]; + int linksize; + EXPECT_THAT(linksize = readlinkat(dirfd, oldbase.c_str(), buf, 1024), + SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(close(dirfd), SyscallSucceeds()); +} + +TEST(SymlinkTest, ChmodSymlink) { + auto target = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string newpath = NewTempAbsPath(); + ASSERT_THAT(symlink(target.path().c_str(), newpath.c_str()), + SyscallSucceeds()); + EXPECT_EQ(FilePermission(newpath), 0777); + EXPECT_THAT(chmod(newpath.c_str(), 0666), SyscallSucceeds()); + EXPECT_EQ(FilePermission(newpath), 0777); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sync.cc b/test/syscalls/linux/sync.cc new file mode 100644 index 000000000..5b777b6eb --- /dev/null +++ b/test/syscalls/linux/sync.cc @@ -0,0 +1,60 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
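
The sync.cc tests that follow exercise sync(2), which schedules writeback for every
mounted filesystem, and syncfs(2), which flushes only the filesystem containing a
given descriptor. For reference, a minimal sketch of typical usage outside the test
harness (illustrative only; the path is a placeholder, not taken from the tests):

// Illustrative sketch. syncfs(2) is Linux-specific and needs _GNU_SOURCE.
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main() {
  // sync() schedules writeback for all filesystems; Linux documents it as
  // always successful.
  sync();

  // syncfs() flushes only the filesystem that contains fd.
  int fd = open("/tmp/syncfs_example", O_CREAT | O_RDWR, 0644);  // placeholder path
  if (fd < 0) {
    perror("open");
    return 1;
  }
  if (syncfs(fd) != 0) {
    perror("syncfs");  // e.g. EBADF for an invalid descriptor
  }
  close(fd);
  return 0;
}
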
+ +#include +#include +#include + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(SyncTest, SyncEverything) { + ASSERT_THAT(syscall(SYS_sync), SyscallSucceeds()); +} + +TEST(SyncTest, SyncFileSytem) { + int fd; + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(fd = open(f.path().c_str(), O_RDONLY), SyscallSucceeds()); + EXPECT_THAT(syncfs(fd), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(SyncTest, SyncFromPipe) { + int pipes[2]; + EXPECT_THAT(pipe(pipes), SyscallSucceeds()); + EXPECT_THAT(syncfs(pipes[0]), SyscallSucceeds()); + EXPECT_THAT(syncfs(pipes[1]), SyscallSucceeds()); + EXPECT_THAT(close(pipes[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipes[1]), SyscallSucceeds()); +} + +TEST(SyncTest, CannotSyncFileSytemAtBadFd) { + EXPECT_THAT(syncfs(-1), SyscallFailsWithErrno(EBADF)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sync_file_range.cc b/test/syscalls/linux/sync_file_range.cc new file mode 100644 index 000000000..ebe4ca171 --- /dev/null +++ b/test/syscalls/linux/sync_file_range.cc @@ -0,0 +1,111 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(SyncFileRangeTest, TempFileSucceeds) { + auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR)); + constexpr char data[] = "some data to sync"; + int fd = f.get(); + + EXPECT_THAT(write(fd, data, sizeof(data)), + SyscallSucceedsWithValue(sizeof(data))); + EXPECT_THAT(sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE), + SyscallSucceeds()); + EXPECT_THAT(sync_file_range(fd, 0, 0, 0), SyscallSucceeds()); + EXPECT_THAT( + sync_file_range(fd, 0, 0, + SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER | + SYNC_FILE_RANGE_WAIT_BEFORE), + SyscallSucceeds()); + EXPECT_THAT(sync_file_range( + fd, 0, 1, SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER), + SyscallSucceeds()); + EXPECT_THAT(sync_file_range( + fd, 1, 0, SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER), + SyscallSucceeds()); +} + +TEST(SyncFileRangeTest, CannotSyncFileRangeOnUnopenedFd) { + auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR)); + constexpr char data[] = "some data to sync"; + int fd = f.get(); + + EXPECT_THAT(write(fd, data, sizeof(data)), + SyscallSucceedsWithValue(sizeof(data))); + + pid_t pid = fork(); + if (pid == 0) { + f.reset(); + + // fd is now invalid. 
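+ // f.reset() closed the child's copy of the descriptor above, so the raw fd
+ // number captured before the fork no longer names an open file here.
+ // TEST_CHECK/TEST_PCHECK are used instead of gtest assertions since gtest
+ // failure reporting is generally not safe to rely on in a forked child.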
+ TEST_CHECK(sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE) == -1); + TEST_PCHECK(errno == EBADF); + _exit(0); + } + + int status = 0; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFEXITED(status)); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +TEST(SyncFileRangeTest, BadArgs) { + auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR)); + int fd = f.get(); + + EXPECT_THAT(sync_file_range(fd, -1, 0, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(sync_file_range(fd, 0, -1, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(sync_file_range(fd, 8912, INT64_MAX - 4096, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SyncFileRangeTest, CannotSyncFileRangeWithWaitBefore) { + auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR)); + constexpr char data[] = "some data to sync"; + int fd = f.get(); + + EXPECT_THAT(write(fd, data, sizeof(data)), + SyscallSucceedsWithValue(sizeof(data))); + if (IsRunningOnGvisor()) { + EXPECT_THAT(sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE), + SyscallFailsWithErrno(ENOSYS)); + EXPECT_THAT( + sync_file_range(fd, 0, 0, + SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE), + SyscallFailsWithErrno(ENOSYS)); + } +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sysinfo.cc b/test/syscalls/linux/sysinfo.cc new file mode 100644 index 000000000..a0dd82640 --- /dev/null +++ b/test/syscalls/linux/sysinfo.cc @@ -0,0 +1,86 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This is a very simple sanity test to validate that the sysinfo syscall is +// supported by gvisor and returns sane values. +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(SysinfoTest, SysinfoIsCallable) { + struct sysinfo ignored = {}; + EXPECT_THAT(syscall(SYS_sysinfo, &ignored), SyscallSucceedsWithValue(0)); +} + +TEST(SysinfoTest, EfaultProducedOnBadAddress) { + // Validate that we return EFAULT when a bad address is provided. 
+ // specified by man 2 sysinfo + EXPECT_THAT(syscall(SYS_sysinfo, nullptr), SyscallFailsWithErrno(EFAULT)); +} + +TEST(SysinfoTest, TotalRamSaneValue) { + struct sysinfo s = {}; + EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0)); + EXPECT_GT(s.totalram, 0); +} + +TEST(SysinfoTest, MemunitSet) { + struct sysinfo s = {}; + EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0)); + EXPECT_GE(s.mem_unit, 1); +} + +TEST(SysinfoTest, UptimeSaneValue) { + struct sysinfo s = {}; + EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0)); + EXPECT_GE(s.uptime, 0); +} + +TEST(SysinfoTest, UptimeIncreasingValue) { + struct sysinfo s = {}; + EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0)); + absl::SleepFor(absl::Seconds(2)); + struct sysinfo s2 = {}; + EXPECT_THAT(sysinfo(&s2), SyscallSucceedsWithValue(0)); + EXPECT_LT(s.uptime, s2.uptime); +} + +TEST(SysinfoTest, FreeRamSaneValue) { + struct sysinfo s = {}; + EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0)); + EXPECT_GT(s.freeram, 0); + EXPECT_LT(s.freeram, s.totalram); +} + +TEST(SysinfoTest, NumProcsSaneValue) { + struct sysinfo s = {}; + EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0)); + EXPECT_GT(s.procs, 0); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/syslog.cc b/test/syscalls/linux/syslog.cc new file mode 100644 index 000000000..5bd0d1cc3 --- /dev/null +++ b/test/syscalls/linux/syslog.cc @@ -0,0 +1,51 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr int SYSLOG_ACTION_READ_ALL = 3; +constexpr int SYSLOG_ACTION_SIZE_BUFFER = 10; + +int Syslog(int type, char* buf, int len) { + return syscall(__NR_syslog, type, buf, len); +} + +// Only SYSLOG_ACTION_SIZE_BUFFER and SYSLOG_ACTION_READ_ALL are implemented in +// gVisor. + +TEST(Syslog, Size) { + EXPECT_THAT(Syslog(SYSLOG_ACTION_SIZE_BUFFER, nullptr, 0), SyscallSucceeds()); +} + +TEST(Syslog, ReadAll) { + // There might not be anything to read, so we can't check the write count. + char buf[100]; + EXPECT_THAT(Syslog(SYSLOG_ACTION_READ_ALL, buf, sizeof(buf)), + SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc new file mode 100644 index 000000000..8e10220eb --- /dev/null +++ b/test/syscalls/linux/sysret.cc @@ -0,0 +1,113 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Tests to verify that the behavior of linux and gvisor matches when +// 'sysret' returns to bad (aka non-canonical) %rip or %rsp. +#include +#include + +#include "gtest/gtest.h" +#include "test/util/logging.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000; +constexpr uint64_t kNonCanonicalRsp = 0xFFFF000000000000; + +class SysretTest : public ::testing::Test { + protected: + struct user_regs_struct regs_; + pid_t child_; + + void SetUp() override { + pid_t pid = fork(); + + // Child. + if (pid == 0) { + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + TEST_PCHECK(raise(SIGSTOP) == 0); + MaybeSave(); + _exit(0); + } + + // Parent. + int status; + ASSERT_THAT(pid, SyscallSucceeds()); // Might still be < 0. + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + ASSERT_THAT(ptrace(PTRACE_GETREGS, pid, 0, ®s_), SyscallSucceeds()); + + child_ = pid; + } + + void Detach() { + ASSERT_THAT(ptrace(PTRACE_DETACH, child_, 0, 0), SyscallSucceeds()); + } + + void SetRip(uint64_t newrip) { + regs_.rip = newrip; + ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, ®s_), SyscallSucceeds()); + } + + void SetRsp(uint64_t newrsp) { + regs_.rsp = newrsp; + ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, ®s_), SyscallSucceeds()); + } + + // Wait waits for the child pid and returns the exit status. + int Wait() { + int status; + while (true) { + int rval = wait4(child_, &status, 0, NULL); + if (rval < 0) { + return rval; + } + if (rval == child_) { + return status; + } + } + } +}; + +TEST_F(SysretTest, JustDetach) { + Detach(); + int status = Wait(); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status = " << status; +} + +TEST_F(SysretTest, BadRip) { + SetRip(kNonCanonicalRip); + Detach(); + int status = Wait(); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) + << "status = " << status; +} + +TEST_F(SysretTest, BadRsp) { + SetRsp(kNonCanonicalRsp); + Detach(); + int status = Wait(); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGBUS) + << "status = " << status; +} +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc new file mode 100644 index 000000000..e6fe84ded --- /dev/null +++ b/test/syscalls/linux/tcp_socket.cc @@ -0,0 +1,759 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
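
Most tcp_socket.cc cases below are built on a fixture that wires up a connected
loopback pair: bind a listener to an ephemeral port, recover the chosen port with
getsockname(), connect a client, then accept(). A condensed standalone sketch of
that pattern (IPv4 only, error handling omitted for brevity; illustrative, not the
fixture itself):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  int listener = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

  sockaddr_in addr = {};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
  addr.sin_port = 0;  // Let the stack pick a free port.
  bind(listener, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
  listen(listener, 1);

  // Find out which port was actually assigned.
  socklen_t len = sizeof(addr);
  getsockname(listener, reinterpret_cast<sockaddr*>(&addr), &len);

  int client = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
  connect(client, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
  int accepted = accept(listener, nullptr, nullptr);

  // The pair is now connected; exchange one byte to prove it.
  char c = 'x';
  write(client, &c, 1);
  read(accepted, &c, 1);
  printf("received %c on port %d\n", c, ntohs(addr.sin_port));

  close(accepted);
  close(client);
  close(listener);
  return 0;
}
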
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr InetLoopbackAddr(int family) { + struct sockaddr_storage addr; + memset(&addr, 0, sizeof(addr)); + addr.ss_family = family; + switch (family) { + case AF_INET: + reinterpret_cast(&addr)->sin_addr.s_addr = + htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + reinterpret_cast(&addr)->sin6_addr = + in6addr_loopback; + break; + default: + return PosixError(EINVAL, + absl::StrCat("unknown socket family: ", family)); + } + return addr; +} + +// Fixture for tests parameterized by the address family to use (AF_INET and +// AF_INET6) when creating sockets. +class TcpSocketTest : public ::testing::TestWithParam { + protected: + // Creates three sockets that will be used by test cases -- a listener, one + // that connects, and the accepted one. + void SetUp() override; + + // Closes the sockets created by SetUp(). + void TearDown() override; + + // Listening socket. + int listener_ = -1; + + // Socket connected via connect(). + int s_ = -1; + + // Socket connected via accept(). + int t_ = -1; + + // Initial size of the send buffer. + int sendbuf_size_ = -1; +}; + +void TcpSocketTest::SetUp() { + ASSERT_THAT(listener_ = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP), + SyscallSucceeds()); + + ASSERT_THAT(s_ = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP), + SyscallSucceeds()); + + // Initialize address to the loopback one. + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + // Bind to some port then start listening. + ASSERT_THAT( + bind(listener_, reinterpret_cast(&addr), addrlen), + SyscallSucceeds()); + + ASSERT_THAT(listen(listener_, SOMAXCONN), SyscallSucceeds()); + + // Get the address we're listening on, then connect to it. We need to do this + // because we're allowing the stack to pick a port for us. + ASSERT_THAT(getsockname(listener_, reinterpret_cast(&addr), + &addrlen), + SyscallSucceeds()); + + ASSERT_THAT(RetryEINTR(connect)(s_, reinterpret_cast(&addr), + addrlen), + SyscallSucceeds()); + + // Get the initial send buffer size. + socklen_t optlen = sizeof(sendbuf_size_); + ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &sendbuf_size_, &optlen), + SyscallSucceeds()); + + // Accept the connection. + ASSERT_THAT(t_ = RetryEINTR(accept)(listener_, nullptr, nullptr), + SyscallSucceeds()); +} + +void TcpSocketTest::TearDown() { + EXPECT_THAT(close(listener_), SyscallSucceeds()); + if (s_ >= 0) { + EXPECT_THAT(close(s_), SyscallSucceeds()); + } + if (t_ >= 0) { + EXPECT_THAT(close(t_), SyscallSucceeds()); + } +} + +TEST_P(TcpSocketTest, DataCoalesced) { + char buf[10]; + + // Write in two steps. + ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf) / 2), + SyscallSucceedsWithValue(sizeof(buf) / 2)); + ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf) / 2), + SyscallSucceedsWithValue(sizeof(buf) / 2)); + + // Allow stack to process both packets. + absl::SleepFor(absl::Seconds(1)); + + // Read in one shot. 
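+ // TCP is a byte stream with no record boundaries, so once the stack has had
+ // time to process both segments a single recv() can return the payload of
+ // both writes at once.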
+ EXPECT_THAT(RetryEINTR(recv)(t_, buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); +} + +TEST_P(TcpSocketTest, SenderAddressIgnored) { + char buf[3]; + ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + memset(&addr, 0, sizeof(addr)); + + ASSERT_THAT( + RetryEINTR(recvfrom)(t_, buf, sizeof(buf), 0, + reinterpret_cast(&addr), &addrlen), + SyscallSucceedsWithValue(3)); + + // Check that addr remains zeroed-out. + const char* ptr = reinterpret_cast(&addr); + for (size_t i = 0; i < sizeof(addr); i++) { + EXPECT_EQ(ptr[i], 0); + } +} + +TEST_P(TcpSocketTest, SenderAddressIgnoredOnPeek) { + char buf[3]; + ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + memset(&addr, 0, sizeof(addr)); + + ASSERT_THAT( + RetryEINTR(recvfrom)(t_, buf, sizeof(buf), MSG_PEEK, + reinterpret_cast(&addr), &addrlen), + SyscallSucceedsWithValue(3)); + + // Check that addr remains zeroed-out. + const char* ptr = reinterpret_cast(&addr); + for (size_t i = 0; i < sizeof(addr); i++) { + EXPECT_EQ(ptr[i], 0); + } +} + +TEST_P(TcpSocketTest, SendtoAddressIgnored) { + struct sockaddr_storage addr; + memset(&addr, 0, sizeof(addr)); + addr.ss_family = GetParam(); // FIXME + + char data = '\0'; + EXPECT_THAT( + RetryEINTR(sendto)(s_, &data, sizeof(data), 0, + reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceedsWithValue(1)); +} + +TEST_P(TcpSocketTest, WritevZeroIovec) { + // 2 bytes just to be safe and have vecs[1] not point to something random + // (even though length is 0). + char buf[2]; + char recv_buf[1]; + + // Construct a vec where the final vector is of length 0. + iovec vecs[2] = {}; + vecs[0].iov_base = buf; + vecs[0].iov_len = 1; + vecs[1].iov_base = buf + 1; + vecs[1].iov_len = 0; + + EXPECT_THAT(RetryEINTR(writev)(s_, vecs, 2), SyscallSucceedsWithValue(1)); + + EXPECT_THAT(RetryEINTR(recv)(t_, recv_buf, 1, 0), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(memcmp(recv_buf, buf, 1), 0); +} + +TEST_P(TcpSocketTest, ZeroWriteAllowed) { + char buf[3]; + // Send a zero length packet. + ASSERT_THAT(RetryEINTR(write)(s_, buf, 0), SyscallSucceedsWithValue(0)); + // Verify that there is no packet available. + EXPECT_THAT(RetryEINTR(recv)(t_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Test that a non-blocking write with a buffer that is larger than the send +// buffer size will not actually write the whole thing at once. +TEST_P(TcpSocketTest, NonblockingLargeWrite) { + // Set the FD to O_NONBLOCK. + int opts; + ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(s_, F_SETFL, opts), SyscallSucceeds()); + + // Allocate a buffer three times the size of the send buffer. We do this with + // a vector to avoid allocating on the stack. + int size = 3 * sendbuf_size_; + std::vector buf(size); + + // Try to write the whole thing. + int n; + ASSERT_THAT(n = RetryEINTR(write)(s_, buf.data(), size), SyscallSucceeds()); + + // We should have written something, but not the whole thing. + EXPECT_GT(n, 0); + EXPECT_LT(n, size); +} + +// Test that a blocking write with a buffer that is larger than the send buffer +// will block until the entire buffer is sent. +TEST_P(TcpSocketTest, BlockingLargeWrite_NoRandomSave) { + // Allocate a buffer three times the size of the send buffer on the heap. 
We + // do this as a vector to avoid allocating on the stack. + int size = 3 * sendbuf_size_; + std::vector writebuf(size); + + // Start reading the response in a loop. + int read_bytes = 0; + ScopedThread t([this, &read_bytes]() { + // Avoid interrupting the blocking write in main thread. + const DisableSave ds; + char readbuf[2500] = {}; + int n = -1; + while (n != 0) { + EXPECT_THAT(n = RetryEINTR(read)(t_, &readbuf, sizeof(readbuf)), + SyscallSucceeds()); + read_bytes += n; + } + }); + + // Try to write the whole thing. + int n; + ASSERT_THAT(n = WriteFd(s_, writebuf.data(), size), SyscallSucceeds()); + + // We should have written the whole thing. + EXPECT_EQ(n, size); + EXPECT_THAT(close(s_), SyscallSucceedsWithValue(0)); + s_ = -1; + t.Join(); + + // We should have read the whole thing. + EXPECT_EQ(read_bytes, size); +} + +// Test that a send with MSG_DONTWAIT flag and buffer that larger than the send +// buffer size will not write the whole thing. +TEST_P(TcpSocketTest, LargeSendDontWait) { + // Allocate a buffer three times the size of the send buffer. We do this on + // with a vector to avoid allocating on the stack. + int size = 3 * sendbuf_size_; + std::vector buf(size); + + // Try to write the whole thing with MSG_DONTWAIT flag, which can + // return a partial write. + int n; + ASSERT_THAT(n = RetryEINTR(send)(s_, buf.data(), size, MSG_DONTWAIT), + SyscallSucceeds()); + + // We should have written something, but not the whole thing. + EXPECT_GT(n, 0); + EXPECT_LT(n, size); +} + +// Test that a send on a non-blocking socket with a buffer that larger than the +// send buffer will not write the whole thing at once. +TEST_P(TcpSocketTest, NonblockingLargeSend) { + // Set the FD to O_NONBLOCK. + int opts; + ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(s_, F_SETFL, opts), SyscallSucceeds()); + + // Allocate a buffer three times the size of the send buffer. We do this on + // with a vector to avoid allocating on the stack. + int size = 3 * sendbuf_size_; + std::vector buf(size); + + // Try to write the whole thing. + int n; + ASSERT_THAT(n = RetryEINTR(send)(s_, buf.data(), size, 0), SyscallSucceeds()); + + // We should have written something, but not the whole thing. + EXPECT_GT(n, 0); + EXPECT_LT(n, size); +} + +// Same test as above, but calls send instead of write. +TEST_P(TcpSocketTest, BlockingLargeSend_NoRandomSave) { + // Allocate a buffer three times the size of the send buffer. We do this on + // with a vector to avoid allocating on the stack. + int size = 3 * sendbuf_size_; + std::vector writebuf(size); + + // Start reading the response in a loop. + int read_bytes = 0; + ScopedThread t([this, &read_bytes]() { + // Avoid interrupting the blocking write in main thread. + const DisableSave ds; + char readbuf[2500] = {}; + int n = -1; + while (n != 0) { + EXPECT_THAT(n = RetryEINTR(read)(t_, &readbuf, sizeof(readbuf)), + SyscallSucceeds()); + read_bytes += n; + } + }); + + // Try to send the whole thing. + int n; + ASSERT_THAT(n = SendFd(s_, writebuf.data(), size, 0), SyscallSucceeds()); + + // We should have written the whole thing. + EXPECT_EQ(n, size); + EXPECT_THAT(close(s_), SyscallSucceedsWithValue(0)); + s_ = -1; + t.Join(); + + // We should have read the whole thing. + EXPECT_EQ(read_bytes, size); +} + +// Test that polling on a socket with a full send buffer will block. +TEST_P(TcpSocketTest, PollWithFullBufferBlocks) { + // Set the FD to O_NONBLOCK. 
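+ // With O_NONBLOCK set, send() fails with EWOULDBLOCK once the send and
+ // receive buffers fill up instead of blocking, which lets the loop below
+ // detect that point.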
+ int opts; + ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(s_, F_SETFL, opts), SyscallSucceeds()); + + // Set TCP_NODELAY, which will cause linux to fill the receive buffer from the + // send buffer as quickly as possibly. This way we can fill up both buffers + // faster. + constexpr int tcp_nodelay_flag = 1; + ASSERT_THAT(setsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &tcp_nodelay_flag, + sizeof(tcp_nodelay_flag)), + SyscallSucceeds()); + + // Create a large buffer that will be used for sending. + std::vector buf(5 * sendbuf_size_); + + // Write until we receive an error. + while (RetryEINTR(send)(s_, buf.data(), buf.size(), 0) != -1) { + // Sleep to give linux a chance to move data from the send buffer to the + // receive buffer. + usleep(10000); // 10ms. + } + // The last error should have been EWOULDBLOCK. + ASSERT_EQ(errno, EWOULDBLOCK); +} + +TEST_P(TcpSocketTest, MsgTrunc) { + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2, MSG_TRUNC), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + + // Check that we didn't get anything. + char zeros[sizeof(received_data)] = {}; + EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data))); +} + +// MSG_CTRUNC is a return flag but linux allows it to be set on input flags +// without returning an error. +TEST_P(TcpSocketTest, MsgTruncWithCtrunc) { + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)] = {}; + ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2, + MSG_TRUNC | MSG_CTRUNC), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + + // Check that we didn't get anything. + char zeros[sizeof(received_data)] = {}; + EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data))); +} + +// This test will verify that MSG_CTRUNC doesn't do anything when specified +// on input. +TEST_P(TcpSocketTest, MsgTruncWithCtruncOnly) { + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)] = {}; + ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2, + MSG_CTRUNC), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + + // Since MSG_CTRUNC here had no affect, it should not behave like MSG_TRUNC. + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2)); +} + +TEST_P(TcpSocketTest, MsgTruncLargeSize) { + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data) * 2] = {}; + ASSERT_THAT( + RetryEINTR(recv)(t_, received_data, sizeof(received_data), MSG_TRUNC), + SyscallSucceedsWithValue(sizeof(sent_data))); + + // Check that we didn't get anything. 
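+ // With MSG_TRUNC, recv() discards the queued bytes and returns the number
+ // of bytes discarded, so the receive buffer is expected to stay zeroed.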
+ char zeros[sizeof(received_data)] = {}; + EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data))); +} + +TEST_P(TcpSocketTest, MsgTruncPeek) { + char sent_data[512]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + char received_data[sizeof(sent_data)] = {}; + ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2, + MSG_TRUNC | MSG_PEEK), + SyscallSucceedsWithValue(sizeof(sent_data) / 2)); + + // Check that we didn't get anything. + char zeros[sizeof(received_data)] = {}; + EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data))); + + // Check that we can still get all of the data. + ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + +TEST_P(TcpSocketTest, NoDelayDefault) { + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); +} + +TEST_P(TcpSocketTest, SetNoDelay) { + ASSERT_THAT( + setsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); + + ASSERT_THAT(setsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &kSockOptOff, + sizeof(kSockOptOff)), + SyscallSucceeds()); + + EXPECT_THAT(getsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); +} + +INSTANTIATE_TEST_CASE_P(AllInetTests, TcpSocketTest, + ::testing::Values(AF_INET, AF_INET6)); + +// Fixture for tests parameterized by address family that don't want the fixture +// to do things. 
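+// That is, tests that create and manage their own sockets rather than using the
+// pre-connected listener/client/accepted trio that TcpSocketTest sets up.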
+using SimpleTcpSocketTest = ::testing::TestWithParam; + +TEST_P(SimpleTcpSocketTest, SendUnconnected) { + int fd; + ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP), + SyscallSucceeds()); + FileDescriptor sock_fd(fd); + + char data = '\0'; + EXPECT_THAT(RetryEINTR(send)(fd, &data, sizeof(data), 0), + SyscallFailsWithErrno(EPIPE)); +} + +TEST_P(SimpleTcpSocketTest, SendtoWithoutAddressUnconnected) { + int fd; + ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP), + SyscallSucceeds()); + FileDescriptor sock_fd(fd); + + char data = '\0'; + EXPECT_THAT(RetryEINTR(sendto)(fd, &data, sizeof(data), 0, nullptr, 0), + SyscallFailsWithErrno(EPIPE)); +} + +TEST_P(SimpleTcpSocketTest, SendtoWithAddressUnconnected) { + int fd; + ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP), + SyscallSucceeds()); + FileDescriptor sock_fd(fd); + + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + char data = '\0'; + EXPECT_THAT( + RetryEINTR(sendto)(fd, &data, sizeof(data), 0, + reinterpret_cast(&addr), sizeof(addr)), + SyscallFailsWithErrno(EPIPE)); +} + +TEST_P(SimpleTcpSocketTest, GetPeerNameUnconnected) { + int fd; + ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP), + SyscallSucceeds()); + FileDescriptor sock_fd(fd); + + sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getpeername(fd, reinterpret_cast(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(TcpSocketTest, FullBuffer) { + // Set both FDs to be blocking. + int flags = 0; + ASSERT_THAT(flags = fcntl(s_, F_GETFL), SyscallSucceeds()); + EXPECT_THAT(fcntl(s_, F_SETFL, flags & ~O_NONBLOCK), SyscallSucceeds()); + flags = 0; + ASSERT_THAT(flags = fcntl(t_, F_GETFL), SyscallSucceeds()); + EXPECT_THAT(fcntl(t_, F_SETFL, flags & ~O_NONBLOCK), SyscallSucceeds()); + + // 2500 was chosen as a small value that can be set on Linux. + int set_snd = 2500; + EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &set_snd, sizeof(set_snd)), + SyscallSucceedsWithValue(0)); + int get_snd = -1; + socklen_t get_snd_len = sizeof(get_snd); + EXPECT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &get_snd, &get_snd_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_snd_len, sizeof(get_snd)); + EXPECT_GT(get_snd, 0); + + // 2500 was chosen as a small value that can be set on Linux and gVisor. + int set_rcv = 2500; + EXPECT_THAT(setsockopt(t_, SOL_SOCKET, SO_RCVBUF, &set_rcv, sizeof(set_rcv)), + SyscallSucceedsWithValue(0)); + int get_rcv = -1; + socklen_t get_rcv_len = sizeof(get_rcv); + EXPECT_THAT(getsockopt(t_, SOL_SOCKET, SO_RCVBUF, &get_rcv, &get_rcv_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_rcv_len, sizeof(get_rcv)); + EXPECT_GE(get_rcv, 2500); + + // Quick sanity test. + EXPECT_LT(get_snd + get_rcv, 2500 * IOV_MAX); + + char data[2500] = {}; + std::vector iovecs; + for (int i = 0; i < IOV_MAX; i++) { + struct iovec iov = {}; + iov.iov_base = data; + iov.iov_len = sizeof(data); + iovecs.push_back(iov); + } + ScopedThread t([this, &iovecs]() { + int result = -1; + EXPECT_THAT(result = RetryEINTR(writev)(s_, iovecs.data(), iovecs.size()), + SyscallSucceeds()); + EXPECT_GT(result, 1); + EXPECT_LT(result, sizeof(data) * iovecs.size()); + }); + + char recv = 0; + EXPECT_THAT(RetryEINTR(read)(t_, &recv, 1), SyscallSucceedsWithValue(1)); + EXPECT_THAT(close(t_), SyscallSucceedsWithValue(0)); + t_ = -1; +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) { + // Initialize address to the loopback one. 
+ sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // Set the FD to O_NONBLOCK. + int opts; + ASSERT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds()); + + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast(&addr), addrlen), + SyscallFailsWithErrno(EINPROGRESS)); + + // Now polling on the FD with a timeout should return 0 corresponding to no + // FDs ready. + struct pollfd poll_fd = {s.get(), POLLOUT, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000), + SyscallSucceedsWithValue(1)); + + int err; + socklen_t optlen = sizeof(err); + ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), + SyscallSucceeds()); + + EXPECT_EQ(err, ECONNREFUSED); +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnect) { + const FileDescriptor listener = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // Initialize address to the loopback one. + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + // Bind to some port then start listening. + ASSERT_THAT( + bind(listener.get(), reinterpret_cast(&addr), addrlen), + SyscallSucceeds()); + + ASSERT_THAT(listen(listener.get(), SOMAXCONN), SyscallSucceeds()); + + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // Set the FD to O_NONBLOCK. + int opts; + ASSERT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds()); + + ASSERT_THAT(getsockname(listener.get(), + reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast(&addr), addrlen), + SyscallFailsWithErrno(EINPROGRESS)); + + int t; + ASSERT_THAT(t = RetryEINTR(accept)(listener.get(), nullptr, nullptr), + SyscallSucceeds()); + + // Now polling on the FD with a timeout should return 0 corresponding to no + // FDs ready. + struct pollfd poll_fd = {s.get(), POLLOUT, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000), + SyscallSucceedsWithValue(1)); + + int err; + socklen_t optlen = sizeof(err); + ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), + SyscallSucceeds()); + + EXPECT_EQ(err, 0); + + EXPECT_THAT(close(t), SyscallSucceeds()); +} + +// Test that we get an ECONNREFUSED with a blocking socket when no one is +// listening on the other end. +TEST_P(SimpleTcpSocketTest, BlockingConnectRefused) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // Initialize address to the loopback one. + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast(&addr), addrlen), + SyscallFailsWithErrno(ECONNREFUSED)); + + // Avoiding triggering save in destructor of s. + EXPECT_THAT(close(s.release()), SyscallSucceeds()); +} + +// Test that we get an ECONNREFUSED with a nonblocking socket. +TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); + + // Initialize address to the loopback one. 
+ sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast(&addr), addrlen), + SyscallFailsWithErrno(EINPROGRESS)); + + // We don't need to specify any events to get POLLHUP or POLLERR as these + // are added before the poll. + struct pollfd poll_fd = {s.get(), /*events=*/0, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 1000), SyscallSucceedsWithValue(1)); + + // The ECONNREFUSED should cause us to be woken up with POLLHUP. + EXPECT_NE(poll_fd.revents & (POLLHUP | POLLERR), 0); + + // Avoiding triggering save in destructor of s. + EXPECT_THAT(close(s.release()), SyscallSucceeds()); +} + +INSTANTIATE_TEST_CASE_P(AllInetTests, SimpleTcpSocketTest, + ::testing::Values(AF_INET, AF_INET6)); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/temp_umask.h b/test/syscalls/linux/temp_umask.h new file mode 100644 index 000000000..f202dfa59 --- /dev/null +++ b/test/syscalls/linux/temp_umask.h @@ -0,0 +1,39 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_ +#define GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_ + +#include +#include + +namespace gvisor { +namespace testing { + +class TempUmask { + public: + // Sets the process umask to `mask`. + explicit TempUmask(mode_t mask) : old_mask_(umask(mask)) {} + + // Sets the process umask to its previous value. + ~TempUmask() { umask(old_mask_); } + + private: + mode_t old_mask_; +}; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_ diff --git a/test/syscalls/linux/tgkill.cc b/test/syscalls/linux/tgkill.cc new file mode 100644 index 000000000..2d258ef11 --- /dev/null +++ b/test/syscalls/linux/tgkill.cc @@ -0,0 +1,48 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
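
tgkill(2) directs a signal at a single thread (tid) within a thread group (tgid);
signal 0 performs only the validity and permission checks without delivering
anything. Older glibc has no wrapper, so a typical sketch goes through syscall(2)
(illustrative only):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main() {
  pid_t tgid = getpid();
  pid_t tid = static_cast<pid_t>(syscall(SYS_gettid));

  // Signal 0: validate the tgid/tid pair without sending a signal.
  if (syscall(SYS_tgkill, tgid, tid, 0) == 0) {
    printf("thread %d in group %d exists\n", tid, tgid);
  } else {
    perror("tgkill");
  }
  return 0;
}
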
+ +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(TgkillTest, InvalidTID) { + EXPECT_THAT(tgkill(getpid(), -1, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(tgkill(getpid(), 0, 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST(TgkillTest, InvalidTGID) { + EXPECT_THAT(tgkill(-1, gettid(), 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(tgkill(0, gettid(), 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST(TgkillTest, ValidInput) { + EXPECT_THAT(tgkill(getpid(), gettid(), 0), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc new file mode 100644 index 000000000..3abcd8098 --- /dev/null +++ b/test/syscalls/linux/time.cc @@ -0,0 +1,103 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "test/util/proc_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr long kFudgeSeconds = 5; + +// Mimics the time(2) wrapper from glibc prior to 2.15. +time_t vsyscall_time(time_t* t) { + constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400; + return reinterpret_cast(kVsyscallTimeEntry)(t); +} + +TEST(TimeTest, VsyscallTime_Succeeds) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); + + time_t t1, t2; + + { + const DisableSave ds; // Timing assertions. + EXPECT_THAT(time(&t1), SyscallSucceeds()); + EXPECT_THAT(vsyscall_time(&t2), SyscallSucceeds()); + } + + // Time should be monotonic. + EXPECT_LE(static_cast(t1), static_cast(t2)); + + // Check that it's within kFudge seconds. + EXPECT_LE(static_cast(t2), static_cast(t1) + kFudgeSeconds); + + // Redo with save. + EXPECT_THAT(time(&t1), SyscallSucceeds()); + EXPECT_THAT(vsyscall_time(&t2), SyscallSucceeds()); + + // Time should be monotonic. + EXPECT_LE(static_cast(t1), static_cast(t2)); +} + +TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) { + EXPECT_EXIT(vsyscall_time(reinterpret_cast(0x1)), + ::testing::KilledBySignal(SIGSEGV), ""); +} +int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) { + constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000; + return reinterpret_cast( + kVsyscallGettimeofdayEntry)(tv, tz); +} + +TEST(TimeTest, VsyscallGettimeofday_Succeeds) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); + + struct timeval tv1, tv2; + struct timezone tz1, tz2; + + { + const DisableSave ds; // Timing assertions. + EXPECT_THAT(gettimeofday(&tv1, &tz1), SyscallSucceeds()); + EXPECT_THAT(vsyscall_gettimeofday(&tv2, &tz2), SyscallSucceeds()); + } + + // See above. 
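+ // As with vsyscall_time above: the second reading must not be earlier than
+ // the first and should land within kFudgeSeconds of it.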
+ EXPECT_LE(static_cast(tv1.tv_sec), static_cast(tv2.tv_sec)); + EXPECT_LE(static_cast(tv2.tv_sec), + static_cast(tv1.tv_sec) + kFudgeSeconds); + + // Redo with save. + EXPECT_THAT(gettimeofday(&tv1, &tz1), SyscallSucceeds()); + EXPECT_THAT(vsyscall_gettimeofday(&tv2, &tz2), SyscallSucceeds()); +} + +TEST(TimeTest, VsyscallGettimeofday_InvalidAddressSIGSEGV) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); + + EXPECT_EXIT(vsyscall_gettimeofday(reinterpret_cast(0x1), + reinterpret_cast(0x1)), + ::testing::KilledBySignal(SIGSEGV), ""); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/timerfd.cc b/test/syscalls/linux/timerfd.cc new file mode 100644 index 000000000..b85321795 --- /dev/null +++ b/test/syscalls/linux/timerfd.cc @@ -0,0 +1,238 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Wrapper around timerfd_create(2) that returns a FileDescriptor. +PosixErrorOr TimerfdCreate(int clockid, int flags) { + int fd = timerfd_create(clockid, flags); + MaybeSave(); + if (fd < 0) { + return PosixError(errno, "timerfd_create failed"); + } + return FileDescriptor(fd); +} + +// In tests that race a timerfd with a sleep, some slack is required because: +// +// - Timerfd expirations are asynchronous with respect to nanosleeps. +// +// - Because clock_gettime(CLOCK_MONOTONIC) is implemented through the VDSO, +// it technically uses a closely-related, but distinct, time domain from the +// CLOCK_MONOTONIC used to trigger timerfd expirations. +absl::Duration TimerSlack() { return absl::Milliseconds(500); } + +TEST(TimerfdTest, IsInitiallyStopped) { + auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, 0)); + struct itimerspec its = {}; + ASSERT_THAT(timerfd_gettime(tfd.get(), &its), SyscallSucceeds()); + EXPECT_EQ(0, its.it_value.tv_sec); + EXPECT_EQ(0, its.it_value.tv_nsec); +} + +TEST(TimerfdTest, SingleShot) { + constexpr absl::Duration kDelay = absl::Seconds(1); + + auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, 0)); + struct itimerspec its = {}; + its.it_value = absl::ToTimespec(kDelay); + ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr), + SyscallSucceeds()); + + // The timer should fire exactly once since the interval is zero. 
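+ // A read() from a timerfd returns an 8-byte count of expirations since the
+ // previous read, so exactly one expiration is expected here.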
+ absl::SleepFor(kDelay + TimerSlack()); + uint64_t val = 0; + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallSucceedsWithValue(sizeof(uint64_t))); + EXPECT_EQ(1, val); +} + +TEST(TimerfdTest, Periodic) { + constexpr absl::Duration kDelay = absl::Seconds(1); + constexpr int kPeriods = 3; + + auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, 0)); + struct itimerspec its = {}; + its.it_value = absl::ToTimespec(kDelay); + its.it_interval = absl::ToTimespec(kDelay); + ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr), + SyscallSucceeds()); + + // Expect to see at least kPeriods expirations. More may occur due to the + // timer slack, or due to delays from scheduling or save/restore. + absl::SleepFor(kPeriods * kDelay + TimerSlack()); + uint64_t val = 0; + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallSucceedsWithValue(sizeof(uint64_t))); + EXPECT_GE(val, kPeriods); +} + +TEST(TimerfdTest, BlockingRead) { + constexpr absl::Duration kDelay = absl::Seconds(3); + + auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, 0)); + struct itimerspec its = {}; + its.it_value.tv_sec = absl::ToInt64Seconds(kDelay); + auto const start_time = absl::Now(); + ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr), + SyscallSucceeds()); + + // read should block until the timer fires. + uint64_t val = 0; + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallSucceedsWithValue(sizeof(uint64_t))); + auto const end_time = absl::Now(); + EXPECT_EQ(1, val); + EXPECT_GE((end_time - start_time) + TimerSlack(), kDelay); +} + +TEST(TimerfdTest, NonblockingRead_NoRandomSave) { + constexpr absl::Duration kDelay = absl::Seconds(5); + + auto const tfd = + ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, TFD_NONBLOCK)); + + // Since the timer is initially disabled and has never fired, read should + // return EAGAIN. + uint64_t val = 0; + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallFailsWithErrno(EAGAIN)); + + DisableSave ds; // Timing-sensitive. + + // Arm the timer. + struct itimerspec its = {}; + its.it_value.tv_sec = absl::ToInt64Seconds(kDelay); + ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr), + SyscallSucceeds()); + + // Since the timer has not yet fired, read should return EAGAIN. + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallFailsWithErrno(EAGAIN)); + + ds.reset(); // No longer timing-sensitive. + + // After the timer fires, read should indicate 1 expiration. + absl::SleepFor(kDelay + TimerSlack()); + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallSucceedsWithValue(sizeof(uint64_t))); + EXPECT_EQ(1, val); + + // The successful read should have reset the number of expirations. + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(TimerfdTest, BlockingPoll_SetTimeResetsExpirations) { + constexpr absl::Duration kDelay = absl::Seconds(3); + + auto const tfd = + ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, TFD_NONBLOCK)); + struct itimerspec its = {}; + its.it_value.tv_sec = absl::ToInt64Seconds(kDelay); + auto const start_time = absl::Now(); + ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr), + SyscallSucceeds()); + + // poll should block until the timer fires. 
+ struct pollfd pfd = {}; + pfd.fd = tfd.get(); + pfd.events = POLLIN; + ASSERT_THAT(poll(&pfd, /* nfds = */ 1, + /* timeout = */ 2 * absl::ToInt64Seconds(kDelay) * 1000), + SyscallSucceedsWithValue(1)); + auto const end_time = absl::Now(); + EXPECT_EQ(POLLIN, pfd.revents); + EXPECT_GE((end_time - start_time) + TimerSlack(), kDelay); + + // Call timerfd_settime again with a value of 0. This should reset the number + // of expirations to 0, causing read to return EAGAIN since the timerfd is + // non-blocking. + its.it_value.tv_sec = 0; + ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr), + SyscallSucceeds()); + uint64_t val = 0; + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(TimerfdTest, SetAbsoluteTime) { + constexpr absl::Duration kDelay = absl::Seconds(3); + + // Use a non-blocking timerfd so that if TFD_TIMER_ABSTIME is incorrectly + // non-functional, we get EAGAIN rather than a test timeout. + auto const tfd = + ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, TFD_NONBLOCK)); + struct itimerspec its = {}; + ASSERT_THAT(clock_gettime(CLOCK_MONOTONIC, &its.it_value), SyscallSucceeds()); + its.it_value.tv_sec += absl::ToInt64Seconds(kDelay); + ASSERT_THAT(timerfd_settime(tfd.get(), TFD_TIMER_ABSTIME, &its, nullptr), + SyscallSucceeds()); + + absl::SleepFor(kDelay + TimerSlack()); + uint64_t val = 0; + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallSucceedsWithValue(sizeof(uint64_t))); + EXPECT_EQ(1, val); +} + +TEST(TimerfdTest, ClockRealtime) { + // Since CLOCK_REALTIME can, by definition, change, we can't make any + // non-flaky assertions about the amount of time it takes for a + // CLOCK_REALTIME-based timer to expire. Just check that it expires at all, + // and hope it happens before the test times out. + constexpr int kDelaySecs = 1; + + auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_REALTIME, 0)); + struct itimerspec its = {}; + its.it_value.tv_sec = kDelaySecs; + ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr), + SyscallSucceeds()); + + uint64_t val = 0; + ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)), + SyscallSucceedsWithValue(sizeof(uint64_t))); + EXPECT_EQ(1, val); +} + +TEST(TimerfdTest, IllegalReadWrite) { + auto const tfd = + ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_MONOTONIC, TFD_NONBLOCK)); + uint64_t val = 0; + EXPECT_THAT(PreadFd(tfd.get(), &val, sizeof(val), 0), + SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(WriteFd(tfd.get(), &val, sizeof(val)), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(PwriteFd(tfd.get(), &val, sizeof(val), 0), + SyscallFailsWithErrno(ESPIPE)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc new file mode 100644 index 000000000..dfe231575 --- /dev/null +++ b/test/syscalls/linux/timers.cc @@ -0,0 +1,642 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
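
The first group of timers.cc tests checks RLIMIT_CPU enforcement: the kernel sends
SIGXCPU when a process crosses its soft CPU-time limit and SIGKILL at the hard
limit. A minimal sketch of arming the limit and observing the soft-limit signal
(the limit values are arbitrary example numbers; illustrative only):

#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>

static volatile sig_atomic_t got_sigxcpu = 0;

static void OnSigxcpu(int) { got_sigxcpu = 1; }

int main() {
  struct sigaction sa = {};
  sa.sa_handler = OnSigxcpu;
  sigemptyset(&sa.sa_mask);
  sigaction(SIGXCPU, &sa, nullptr);

  // Soft limit: 1s of CPU time. Hard limit: 5s.
  struct rlimit lim = {};
  lim.rlim_cur = 1;
  lim.rlim_max = 5;
  if (setrlimit(RLIMIT_CPU, &lim) != 0) {
    perror("setrlimit");
    return 1;
  }

  // Burn CPU until the soft-limit signal arrives.
  while (!got_sigxcpu) {
  }
  printf("SIGXCPU received after exceeding the soft limit\n");
  return 0;
}
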
+ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/cleanup.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_bool(timers_test_sleep, false, + "If true, sleep forever instead of running tests."); + +using ::testing::_; +using ::testing::AnyOf; + +namespace gvisor { +namespace testing { +namespace { + +#ifndef CPUCLOCK_PROF +#define CPUCLOCK_PROF 0 +#endif // CPUCLOCK_PROF + +PosixErrorOr ProcessCPUTime(pid_t pid) { + // Use pid-specific CPUCLOCK_PROF, which is the clock used to enforce + // RLIMIT_CPU. + clockid_t clockid = (~static_cast(pid) << 3) | CPUCLOCK_PROF; + + struct timespec ts; + int ret = clock_gettime(clockid, &ts); + if (ret < 0) { + return PosixError(errno, "clock_gettime failed"); + } + + return absl::DurationFromTimespec(ts); +} + +void NoopSignalHandler(int signo) { + TEST_CHECK_MSG(SIGXCPU == signo, + "NoopSigHandler did not receive expected signal"); +} + +void UninstallingSignalHandler(int signo) { + TEST_CHECK_MSG(SIGXCPU == signo, + "UninstallingSignalHandler did not receive expected signal"); + struct sigaction rev_action; + rev_action.sa_handler = SIG_DFL; + rev_action.sa_flags = 0; + sigemptyset(&rev_action.sa_mask); + sigaction(SIGXCPU, &rev_action, nullptr); +} + +TEST(TimerTest, ProcessKilledOnCPUSoftLimit) { + constexpr absl::Duration kSoftLimit = absl::Seconds(1); + constexpr absl::Duration kHardLimit = absl::Seconds(3); + + struct rlimit cpu_limits; + cpu_limits.rlim_cur = absl::ToInt64Seconds(kSoftLimit); + cpu_limits.rlim_max = absl::ToInt64Seconds(kHardLimit); + + int pid = fork(); + MaybeSave(); + if (pid == 0) { + TEST_PCHECK(setrlimit(RLIMIT_CPU, &cpu_limits) == 0); + MaybeSave(); + for (;;) { + } + } + auto c = Cleanup([pid] { + int status; + EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status)); + EXPECT_EQ(WTERMSIG(status), SIGXCPU); + }); + + // Wait for the child to exit, but do not reap it. This will allow us to check + // its CPU usage while it is zombied. + EXPECT_THAT(waitid(P_PID, pid, nullptr, WEXITED | WNOWAIT), + SyscallSucceeds()); + + // Assert that the child spent 1s of CPU before getting killed. + // + // We must be careful to use CPUCLOCK_PROF, the same clock used for RLIMIT_CPU + // enforcement, to get correct results. Note that this is slightly different + // from rusage-reported CPU usage: + // + // RLIMIT_CPU, CPUCLOCK_PROF use kernel/sched/cputime.c:thread_group_cputime. + // rusage uses kernel/sched/cputime.c:thread_group_cputime_adjusted. + absl::Duration cpu = ASSERT_NO_ERRNO_AND_VALUE(ProcessCPUTime(pid)); + EXPECT_GE(cpu, kSoftLimit); + + // Child did not make it to the hard limit. + // + // Linux sends SIGXCPU synchronously with CPU tick updates. See + // kernel/time/timer.c:update_process_times: + // => account_process_tick // update task CPU usage. + // => run_posix_cpu_timers // enforce RLIMIT_CPU, sending signal. + // + // Thus, only chance for this to flake is if the system time required to + // deliver the signal exceeds 2s. 
+ EXPECT_LT(cpu, kHardLimit); +} + +TEST(TimerTest, ProcessPingedRepeatedlyAfterCPUSoftLimit) { + struct sigaction new_action; + new_action.sa_handler = UninstallingSignalHandler; + new_action.sa_flags = 0; + sigemptyset(&new_action.sa_mask); + + constexpr absl::Duration kSoftLimit = absl::Seconds(1); + constexpr absl::Duration kHardLimit = absl::Seconds(10); + + struct rlimit cpu_limits; + cpu_limits.rlim_cur = absl::ToInt64Seconds(kSoftLimit); + cpu_limits.rlim_max = absl::ToInt64Seconds(kHardLimit); + + int pid = fork(); + MaybeSave(); + if (pid == 0) { + TEST_PCHECK(sigaction(SIGXCPU, &new_action, nullptr) == 0); + MaybeSave(); + TEST_PCHECK(setrlimit(RLIMIT_CPU, &cpu_limits) == 0); + MaybeSave(); + for (;;) { + } + } + auto c = Cleanup([pid] { + int status; + EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status)); + EXPECT_EQ(WTERMSIG(status), SIGXCPU); + }); + + // Wait for the child to exit, but do not reap it. This will allow us to check + // its CPU usage while it is zombied. + EXPECT_THAT(waitid(P_PID, pid, nullptr, WEXITED | WNOWAIT), + SyscallSucceeds()); + + absl::Duration cpu = ASSERT_NO_ERRNO_AND_VALUE(ProcessCPUTime(pid)); + // Following signals come every CPU second. + EXPECT_GE(cpu, kSoftLimit + absl::Seconds(1)); + + // Child did not make it to the hard limit. + // + // As above, should not flake. + EXPECT_LT(cpu, kHardLimit); +} + +TEST(TimerTest, ProcessKilledOnCPUHardLimit) { + struct sigaction new_action; + new_action.sa_handler = NoopSignalHandler; + new_action.sa_flags = 0; + sigemptyset(&new_action.sa_mask); + + constexpr absl::Duration kSoftLimit = absl::Seconds(1); + constexpr absl::Duration kHardLimit = absl::Seconds(3); + + struct rlimit cpu_limits; + cpu_limits.rlim_cur = absl::ToInt64Seconds(kSoftLimit); + cpu_limits.rlim_max = absl::ToInt64Seconds(kHardLimit); + + int pid = fork(); + MaybeSave(); + if (pid == 0) { + TEST_PCHECK(sigaction(SIGXCPU, &new_action, nullptr) == 0); + MaybeSave(); + TEST_PCHECK(setrlimit(RLIMIT_CPU, &cpu_limits) == 0); + MaybeSave(); + for (;;) { + } + } + auto c = Cleanup([pid] { + int status; + EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status)); + EXPECT_EQ(WTERMSIG(status), SIGKILL); + }); + + // Wait for the child to exit, but do not reap it. This will allow us to check + // its CPU usage while it is zombied. + EXPECT_THAT(waitid(P_PID, pid, nullptr, WEXITED | WNOWAIT), + SyscallSucceeds()); + + absl::Duration cpu = ASSERT_NO_ERRNO_AND_VALUE(ProcessCPUTime(pid)); + EXPECT_GE(cpu, kHardLimit); +} + +// RAII type for a kernel "POSIX" interval timer. (The kernel provides system +// calls such as timer_create that behave very similarly, but not identically, +// to those described by timer_create(2); in particular, the kernel does not +// implement SIGEV_THREAD. glibc builds POSIX-compliant interval timers based on +// these kernel interval timers.) +// +// Compare implementation to FileDescriptor. 
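Before the interval-timer wrapper defined next, a standalone restatement of the CPU-clock trick the RLIMIT_CPU tests above rely on: the per-process CPUCLOCK_PROF clock is addressed by packing ~pid into the upper bits of a clockid_t. A minimal sketch mirroring ProcessCPUTime, using plain clock_gettime and no test wrappers (the helper name is illustrative):

#include <time.h>
#include <unistd.h>

#ifndef CPUCLOCK_PROF
#define CPUCLOCK_PROF 0
#endif  // CPUCLOCK_PROF

// Returns the CPU time charged to `pid` in nanoseconds, or -1 on failure.
// The clockid packs ~pid into the upper bits; the low three bits select the
// clock type (PROF is the clock RLIMIT_CPU is enforced against).
long long ProcessCpuTimeNs(pid_t pid) {
  clockid_t clockid = (~static_cast<clockid_t>(pid) << 3) | CPUCLOCK_PROF;
  struct timespec ts;
  if (clock_gettime(clockid, &ts) != 0) {
    return -1;
  }
  return static_cast<long long>(ts.tv_sec) * 1000000000LL + ts.tv_nsec;
}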
+class IntervalTimer { + public: + IntervalTimer() = default; + + explicit IntervalTimer(int id) { set_id(id); } + + IntervalTimer(IntervalTimer&& orig) : id_(orig.release()) {} + + IntervalTimer& operator=(IntervalTimer&& orig) { + if (this == &orig) return *this; + reset(orig.release()); + return *this; + } + + IntervalTimer(const IntervalTimer& other) = delete; + IntervalTimer& operator=(const IntervalTimer& other) = delete; + + ~IntervalTimer() { reset(); } + + int get() const { return id_; } + + int release() { + int const id = id_; + id_ = -1; + return id; + } + + void reset() { reset(-1); } + + void reset(int id) { + if (id_ >= 0) { + TEST_PCHECK(syscall(SYS_timer_delete, id_) == 0); + MaybeSave(); + } + set_id(id); + } + + PosixErrorOr Set( + int flags, const struct itimerspec& new_value) const { + struct itimerspec old_value = {}; + if (syscall(SYS_timer_settime, id_, flags, &new_value, &old_value) < 0) { + return PosixError(errno, "timer_settime"); + } + MaybeSave(); + return old_value; + } + + PosixErrorOr Get() const { + struct itimerspec curr_value = {}; + if (syscall(SYS_timer_gettime, id_, &curr_value) < 0) { + return PosixError(errno, "timer_gettime"); + } + MaybeSave(); + return curr_value; + } + + PosixErrorOr Overruns() const { + int rv = syscall(SYS_timer_getoverrun, id_); + if (rv < 0) { + return PosixError(errno, "timer_getoverrun"); + } + MaybeSave(); + return rv; + } + + private: + void set_id(int id) { id_ = std::max(id, -1); } + + // Kernel timer_t is int; glibc timer_t is void*. + int id_ = -1; +}; + +PosixErrorOr TimerCreate(clockid_t clockid, + const struct sigevent& sev) { + int timerid; + if (syscall(SYS_timer_create, clockid, &sev, &timerid) < 0) { + return PosixError(errno, "timer_create"); + } + MaybeSave(); + return IntervalTimer(timerid); +} + +// See timerfd.cc:TimerSlack() for rationale. +constexpr absl::Duration kTimerSlack = absl::Milliseconds(500); + +TEST(IntervalTimerTest, IsInitiallyStopped) { + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_NONE; + const auto timer = + ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + const struct itimerspec its = ASSERT_NO_ERRNO_AND_VALUE(timer.Get()); + EXPECT_EQ(0, its.it_value.tv_sec); + EXPECT_EQ(0, its.it_value.tv_nsec); +} + +TEST(IntervalTimerTest, SingleShotSilent) { + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_NONE; + const auto timer = + ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + + constexpr absl::Duration kDelay = absl::Seconds(1); + struct itimerspec its = {}; + its.it_value = absl::ToTimespec(kDelay); + ASSERT_NO_ERRNO(timer.Set(0, its)); + + // The timer should count down to 0 and stop since the interval is zero. No + // overruns should be counted. + absl::SleepFor(kDelay + kTimerSlack); + its = ASSERT_NO_ERRNO_AND_VALUE(timer.Get()); + EXPECT_EQ(0, its.it_value.tv_sec); + EXPECT_EQ(0, its.it_value.tv_nsec); + EXPECT_THAT(timer.Overruns(), IsPosixErrorOkAndHolds(0)); +} + +TEST(IntervalTimerTest, PeriodicSilent) { + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_NONE; + const auto timer = + ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + + constexpr absl::Duration kPeriod = absl::Seconds(1); + struct itimerspec its = {}; + its.it_value = its.it_interval = absl::ToTimespec(kPeriod); + ASSERT_NO_ERRNO(timer.Set(0, its)); + + absl::SleepFor(kPeriod * 3 + kTimerSlack); + + // The timer should still be running. 
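As an aside on the raw-syscall choice in the wrapper above: the same silent one-shot timer can also be driven through glibc's POSIX interface, where timer_t is the opaque void* handle the class comment mentions. A sketch under that assumption, not used by these tests (link with -lrt on older glibc; the helper name is illustrative):

#include <signal.h>
#include <time.h>

// Creates a silent (SIGEV_NONE) one-second one-shot timer through the glibc
// interface and deletes it again. Returns 0 on success, -1 on error.
int GlibcPosixTimerSketch() {
  struct sigevent sev = {};
  sev.sigev_notify = SIGEV_NONE;

  timer_t timer;  // Opaque glibc handle; the kernel-level int id stays hidden.
  if (timer_create(CLOCK_MONOTONIC, &sev, &timer) != 0) return -1;

  struct itimerspec its = {};
  its.it_value.tv_sec = 1;  // One-shot: it_interval stays zero.
  if (timer_settime(timer, /* flags = */ 0, &its, nullptr) != 0) {
    timer_delete(timer);
    return -1;
  }

  // ... timer_gettime(timer, &its) can be polled until it_value reaches 0 ...

  return timer_delete(timer) == 0 ? 0 : -1;
}

The PeriodicSilent assertions continue just below.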
+ its = ASSERT_NO_ERRNO_AND_VALUE(timer.Get()); + EXPECT_TRUE(its.it_value.tv_nsec != 0 || its.it_value.tv_sec != 0); + + // Timer expirations are not counted as overruns under SIGEV_NONE. + EXPECT_THAT(timer.Overruns(), IsPosixErrorOkAndHolds(0)); +} + +std::atomic counted_signals; + +void IntervalTimerCountingSignalHandler(int sig, siginfo_t* info, + void* ucontext) { + counted_signals.fetch_add(1 + info->si_overrun); +} + +TEST(IntervalTimerTest, PeriodicGroupDirectedSignal) { + constexpr int kSigno = SIGUSR1; + constexpr int kSigvalue = 42; + + // Install our signal handler. + counted_signals.store(0); + struct sigaction sa = {}; + sa.sa_sigaction = IntervalTimerCountingSignalHandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + const auto scoped_sigaction = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa)); + + // Ensure that kSigno is unblocked on at least one thread. + const auto scoped_sigmask = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, kSigno)); + + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = kSigno; + sev.sigev_value.sival_int = kSigvalue; + auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + + constexpr absl::Duration kPeriod = absl::Seconds(1); + constexpr int kCycles = 3; + struct itimerspec its = {}; + its.it_value = its.it_interval = absl::ToTimespec(kPeriod); + ASSERT_NO_ERRNO(timer.Set(0, its)); + + absl::SleepFor(kPeriod * kCycles + kTimerSlack); + EXPECT_GE(counted_signals.load(), kCycles); +} + +// From Linux's include/uapi/asm-generic/siginfo.h. +#ifndef sigev_notify_thread_id +#define sigev_notify_thread_id _sigev_un._tid +#endif + +TEST(IntervalTimerTest, PeriodicThreadDirectedSignal) { + constexpr int kSigno = SIGUSR1; + constexpr int kSigvalue = 42; + + // Block kSigno so that we can accumulate overruns. + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, kSigno); + const auto scoped_sigmask = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask)); + + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_THREAD_ID; + sev.sigev_signo = kSigno; + sev.sigev_value.sival_int = kSigvalue; + sev.sigev_notify_thread_id = gettid(); + auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + + constexpr absl::Duration kPeriod = absl::Seconds(1); + constexpr int kCycles = 3; + struct itimerspec its = {}; + its.it_value = its.it_interval = absl::ToTimespec(kPeriod); + ASSERT_NO_ERRNO(timer.Set(0, its)); + absl::SleepFor(kPeriod * kCycles + kTimerSlack); + + // At least kCycles expirations should have occurred, resulting in kCycles-1 + // overruns (the first expiration sent the signal successfully). + siginfo_t si; + struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration()); + ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts), + SyscallSucceedsWithValue(kSigno)); + EXPECT_EQ(si.si_signo, kSigno); + EXPECT_EQ(si.si_code, SI_TIMER); + EXPECT_EQ(si.si_timerid, timer.get()); + EXPECT_GE(si.si_overrun, kCycles - 1); + EXPECT_EQ(si.si_int, kSigvalue); + + // Kill the timer, then drain any additional signal it may have enqueued. We + // can't do this before the preceding sigtimedwait because stopping or + // deleting the timer resets si_overrun to 0. + timer.reset(); + sigtimedwait(&mask, &si, &zero_ts); +} + +TEST(IntervalTimerTest, OtherThreadGroup) { + constexpr int kSigno = SIGUSR1; + + // Create a subprocess that does nothing until killed. 
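The thread-directed tests above drain timer signals with sigtimedwait and read the expiration count out of siginfo_t. A minimal sketch of that dequeue step, assuming the signal is already blocked as in the tests (helper name illustrative):

#include <signal.h>
#include <time.h>

#include <cerrno>

// Non-blockingly dequeues one pending `signo` (which callers keep blocked, as
// the tests above do) and returns the number of timer expirations it stands
// for: 1 + si_overrun for SI_TIMER signals, 1 otherwise, 0 if nothing was
// pending, or -1 on any other error.
int DrainOneTimerSignal(int signo) {
  sigset_t mask;
  sigemptyset(&mask);
  sigaddset(&mask, signo);

  siginfo_t si;
  struct timespec zero_ts = {};  // Zero timeout: never block.
  if (sigtimedwait(&mask, &si, &zero_ts) != signo) {
    return errno == EAGAIN ? 0 : -1;
  }
  return si.si_code == SI_TIMER ? 1 + si.si_overrun : 1;
}

The OtherThreadGroup subprocess setup continues below.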
+ pid_t child_pid; + const auto sp = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec( + "/proc/self/exe", ExecveArray({"timers", "--timers_test_sleep"}), + ExecveArray(), &child_pid, nullptr)); + + // Verify that we can't create a timer that would send signals to it. + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_THREAD_ID; + sev.sigev_signo = kSigno; + sev.sigev_notify_thread_id = child_pid; + EXPECT_THAT(TimerCreate(CLOCK_MONOTONIC, sev), PosixErrorIs(EINVAL, _)); +} + +TEST(IntervalTimerTest, RealTimeSignalsAreNotDuplicated) { + const int kSigno = SIGRTMIN; + constexpr int kSigvalue = 42; + + // Block signo so that we can accumulate overruns. + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, kSigno); + const auto scoped_sigmask = ScopedSignalMask(SIG_BLOCK, mask); + + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_THREAD_ID; + sev.sigev_signo = kSigno; + sev.sigev_value.sival_int = kSigvalue; + sev.sigev_notify_thread_id = gettid(); + const auto timer = + ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + + constexpr absl::Duration kPeriod = absl::Seconds(1); + constexpr int kCycles = 3; + struct itimerspec its = {}; + its.it_value = its.it_interval = absl::ToTimespec(kPeriod); + ASSERT_NO_ERRNO(timer.Set(0, its)); + absl::SleepFor(kPeriod * kCycles + kTimerSlack); + + // Stop the timer so that no further signals are enqueued after sigtimedwait. + struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration()); + its.it_value = its.it_interval = zero_ts; + ASSERT_NO_ERRNO(timer.Set(0, its)); + + // The timer should have sent only a single signal, even though the kernel + // supports enqueueing of multiple RT signals. + siginfo_t si; + ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts), + SyscallSucceedsWithValue(kSigno)); + EXPECT_EQ(si.si_signo, kSigno); + EXPECT_EQ(si.si_code, SI_TIMER); + EXPECT_EQ(si.si_timerid, timer.get()); + // si_overrun was reset by timer_settime. + EXPECT_EQ(si.si_overrun, 0); + EXPECT_EQ(si.si_int, kSigvalue); + EXPECT_THAT(sigtimedwait(&mask, &si, &zero_ts), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(IntervalTimerTest, AlreadyPendingSignal) { + constexpr int kSigno = SIGUSR1; + constexpr int kSigvalue = 42; + + // Block kSigno so that we can accumulate overruns. + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, kSigno); + const auto scoped_sigmask = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask)); + + // Send ourselves a signal, preventing the timer from enqueuing. + ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds()); + + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_THREAD_ID; + sev.sigev_signo = kSigno; + sev.sigev_value.sival_int = kSigvalue; + sev.sigev_notify_thread_id = gettid(); + auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + + constexpr absl::Duration kPeriod = absl::Seconds(1); + constexpr int kCycles = 3; + struct itimerspec its = {}; + its.it_value = its.it_interval = absl::ToTimespec(kPeriod); + ASSERT_NO_ERRNO(timer.Set(0, its)); + + // End the sleep one cycle short; we will sleep for one more cycle below. + absl::SleepFor(kPeriod * (kCycles - 1)); + + // Dequeue the first signal, which we sent to ourselves with tgkill. + siginfo_t si; + struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration()); + ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts), + SyscallSucceedsWithValue(kSigno)); + EXPECT_EQ(si.si_signo, kSigno); + // glibc sigtimedwait silently replaces SI_TKILL with SI_USER: + // sysdeps/unix/sysv/linux/sigtimedwait.c:__sigtimedwait(). 
This isn't + // documented, so we don't depend on it. + EXPECT_THAT(si.si_code, AnyOf(SI_USER, SI_TKILL)); + + // Sleep for 1 more cycle to give the timer time to send a signal. + absl::SleepFor(kPeriod + kTimerSlack); + + // At least kCycles expirations should have occurred, resulting in kCycles-1 + // overruns (the last expiration sent the signal successfully). + ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts), + SyscallSucceedsWithValue(kSigno)); + EXPECT_EQ(si.si_signo, kSigno); + EXPECT_EQ(si.si_code, SI_TIMER); + EXPECT_EQ(si.si_timerid, timer.get()); + EXPECT_GE(si.si_overrun, kCycles - 1); + EXPECT_EQ(si.si_int, kSigvalue); + + // Kill the timer, then drain any additional signal it may have enqueued. We + // can't do this before the preceding sigtimedwait because stopping or + // deleting the timer resets si_overrun to 0. + timer.reset(); + sigtimedwait(&mask, &si, &zero_ts); +} + +TEST(IntervalTimerTest, IgnoredSignalCountsAsOverrun) { + constexpr int kSigno = SIGUSR1; + constexpr int kSigvalue = 42; + + // Ignore kSigno. + struct sigaction sa = {}; + sa.sa_handler = SIG_IGN; + const auto scoped_sigaction = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa)); + + // Unblock kSigno so that ignored signals will be discarded. + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, kSigno); + auto scoped_sigmask = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, mask)); + + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_THREAD_ID; + sev.sigev_signo = kSigno; + sev.sigev_value.sival_int = kSigvalue; + sev.sigev_notify_thread_id = gettid(); + auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + + constexpr absl::Duration kPeriod = absl::Seconds(1); + constexpr int kCycles = 3; + struct itimerspec its = {}; + its.it_value = its.it_interval = absl::ToTimespec(kPeriod); + ASSERT_NO_ERRNO(timer.Set(0, its)); + + // End the sleep one cycle short; we will sleep for one more cycle below. + absl::SleepFor(kPeriod * (kCycles - 1)); + + // Block kSigno so that ignored signals will be enqueued. + scoped_sigmask.Release()(); + scoped_sigmask = ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask)); + + // Sleep for 1 more cycle to give the timer time to send a signal. + absl::SleepFor(kPeriod + kTimerSlack); + + // At least kCycles expirations should have occurred, resulting in kCycles-1 + // overruns (the last expiration sent the signal successfully). + siginfo_t si; + struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration()); + ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts), + SyscallSucceedsWithValue(kSigno)); + EXPECT_EQ(si.si_signo, kSigno); + EXPECT_EQ(si.si_code, SI_TIMER); + EXPECT_EQ(si.si_timerid, timer.get()); + EXPECT_GE(si.si_overrun, kCycles - 1); + EXPECT_EQ(si.si_int, kSigvalue); + + // Kill the timer, then drain any additional signal it may have enqueued. We + // can't do this before the preceding sigtimedwait because stopping or + // deleting the timer resets si_overrun to 0. 
+ timer.reset(); + sigtimedwait(&mask, &si, &zero_ts); +} + +} // namespace +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); + + if (FLAGS_timers_test_sleep) { + while (true) { + absl::SleepFor(absl::Seconds(10)); + } + } + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc new file mode 100644 index 000000000..9842ccc9b --- /dev/null +++ b/test/syscalls/linux/tkill.cc @@ -0,0 +1,75 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "test/util/logging.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +static int tkill(pid_t tid, int sig) { + int ret; + do { + // NOTE: tkill(2) could return EAGAIN for RT signals. + ret = syscall(SYS_tkill, tid, sig); + } while (ret == -1 && errno == EAGAIN); + return ret; +} + +TEST(TkillTest, InvalidTID) { + EXPECT_THAT(tkill(-1, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(tkill(0, 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST(TkillTest, ValidTID) { + EXPECT_THAT(tkill(gettid(), 0), SyscallSucceeds()); +} + +void SigHandler(int sig, siginfo_t* info, void* context) { + TEST_CHECK(sig == SIGRTMAX); + TEST_CHECK(info->si_pid == getpid()); + TEST_CHECK(info->si_uid == getuid()); + TEST_CHECK(info->si_code == SI_TKILL); +} + +// Test with a real signal. +TEST(TkillTest, ValidTIDAndRealSignal) { + struct sigaction sa; + sa.sa_sigaction = SigHandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + ASSERT_THAT(sigaction(SIGRTMAX, &sa, nullptr), SyscallSucceeds()); + // InitGoogle blocks all RT signals, so we need undo it. + sigset_t unblock; + sigemptyset(&unblock); + sigaddset(&unblock, SIGRTMAX); + ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &unblock, nullptr), SyscallSucceeds()); + EXPECT_THAT(tkill(gettid(), SIGRTMAX), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc new file mode 100644 index 000000000..2616a9147 --- /dev/null +++ b/test/syscalls/linux/truncate.cc @@ -0,0 +1,217 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
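One note on the tkill wrapper above before truncate.cc begins: current code usually prefers tgkill, which names the thread group as well as the thread and therefore cannot race with TID reuse. A sketch of the equivalent raw-syscall wrapper, with the same EAGAIN retry for RT signals (the helper name is illustrative):

#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#include <cerrno>

// Sends `sig` to thread `tid` within thread group `tgid`, retrying on EAGAIN
// just as the tkill wrapper above does for RT signals.
int TgkillRetry(pid_t tgid, pid_t tid, int sig) {
  long ret;
  do {
    ret = syscall(SYS_tgkill, tgid, tid, sig);
  } while (ret == -1 && errno == EAGAIN);
  return static_cast<int>(ret);
}

// Usage mirrors the tests above: TgkillRetry(getpid(), gettid(), SIGRTMAX).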
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class FixtureTruncateTest : public FileTest { + void SetUp() override { FileTest::SetUp(); } +}; + +TEST_F(FixtureTruncateTest, Truncate) { + // Get the current rlimit and restore after test run. + struct rlimit initial_lim; + ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + auto cleanup = Cleanup([&initial_lim] { + EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + }); + + // Check that it starts at size zero. + struct stat buf; + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); + + // Stay at size zero. + EXPECT_THAT(truncate(test_file_name_.c_str(), 0), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); + + // Grow to ten bytes. + EXPECT_THAT(truncate(test_file_name_.c_str(), 10), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 10); + + // Can't be truncated to a negative number. + EXPECT_THAT(truncate(test_file_name_.c_str(), -1), + SyscallFailsWithErrno(EINVAL)); + + // Try growing past the file size limit. + sigset_t new_mask; + sigemptyset(&new_mask); + sigaddset(&new_mask, SIGXFSZ); + sigprocmask(SIG_BLOCK, &new_mask, nullptr); + struct timespec timelimit; + timelimit.tv_sec = 10; + timelimit.tv_nsec = 0; + + struct rlimit setlim; + setlim.rlim_cur = 1024; + setlim.rlim_max = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds()); + EXPECT_THAT(truncate(test_file_name_.c_str(), 1025), + SyscallFailsWithErrno(EFBIG)); + EXPECT_EQ(sigtimedwait(&new_mask, nullptr, &timelimit), SIGXFSZ); + ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds()); + + // Shrink back down to zero. + EXPECT_THAT(truncate(test_file_name_.c_str(), 0), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); +} + +TEST_F(FixtureTruncateTest, Ftruncate) { + // Get the current rlimit and restore after test run. + struct rlimit initial_lim; + ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + auto cleanup = Cleanup([&initial_lim] { + EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + }); + + // Check that it starts at size zero. + struct stat buf; + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); + + // Stay at size zero. + EXPECT_THAT(ftruncate(test_file_fd_.get(), 0), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); + + // Grow to ten bytes. + EXPECT_THAT(ftruncate(test_file_fd_.get(), 10), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 10); + + // Can't be truncated to a negative number. + EXPECT_THAT(ftruncate(test_file_fd_.get(), -1), + SyscallFailsWithErrno(EINVAL)); + + // Try growing past the file size limit. 
+ sigset_t new_mask; + sigemptyset(&new_mask); + sigaddset(&new_mask, SIGXFSZ); + sigprocmask(SIG_BLOCK, &new_mask, nullptr); + struct timespec timelimit; + timelimit.tv_sec = 10; + timelimit.tv_nsec = 0; + + struct rlimit setlim; + setlim.rlim_cur = 1024; + setlim.rlim_max = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds()); + EXPECT_THAT(ftruncate(test_file_fd_.get(), 1025), + SyscallFailsWithErrno(EFBIG)); + EXPECT_EQ(sigtimedwait(&new_mask, nullptr, &timelimit), SIGXFSZ); + ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds()); + + // Shrink back down to zero. + EXPECT_THAT(ftruncate(test_file_fd_.get(), 0), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); +} + +// Truncating a file down clears that portion of the file. +TEST_F(FixtureTruncateTest, FtruncateShrinkGrow) { + std::vector buf(10, 'a'); + EXPECT_THAT(WriteFd(test_file_fd_.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Shrink then regrow the file. This should clear the second half of the file. + EXPECT_THAT(ftruncate(test_file_fd_.get(), 5), SyscallSucceeds()); + EXPECT_THAT(ftruncate(test_file_fd_.get(), 10), SyscallSucceeds()); + + EXPECT_THAT(lseek(test_file_fd_.get(), 0, SEEK_SET), SyscallSucceeds()); + + std::vector buf2(10); + EXPECT_THAT(ReadFd(test_file_fd_.get(), buf2.data(), buf2.size()), + SyscallSucceedsWithValue(buf2.size())); + + std::vector expect = {'a', 'a', 'a', 'a', 'a', + '\0', '\0', '\0', '\0', '\0'}; + EXPECT_EQ(expect, buf2); +} + +TEST(TruncateTest, TruncateDir) { + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(truncate(temp_dir.path().c_str(), 0), + SyscallFailsWithErrno(EISDIR)); +} + +TEST(TruncateTest, FtruncateDir) { + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(temp_dir.path(), O_DIRECTORY | O_RDONLY)); + EXPECT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST(TruncateTest, TruncateNonWriteable) { + // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to + // always override write permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::string_view(), 0555 /* mode */)); + EXPECT_THAT(truncate(temp_file.path().c_str(), 0), + SyscallFailsWithErrno(EACCES)); +} + +TEST(TruncateTest, FtruncateNonWriteable) { + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), absl::string_view(), 0555 /* mode */)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY)); + EXPECT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST(TruncateTest, TruncateNonExist) { + EXPECT_THAT(truncate("/foo/bar", 0), SyscallFailsWithErrno(ENOENT)); +} + +TEST(TruncateTest, FtruncateVirtualTmp_NoRandomSave) { + auto temp_file = NewTempAbsPathInDir("/dev/shm"); + const DisableSave ds; // Incompatible permissions. + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file, O_RDWR | O_CREAT | O_EXCL, 0)); + EXPECT_THAT(ftruncate(fd.get(), 100), SyscallSucceeds()); +} + +// NOTE: There are additional truncate(2)/ftruncate(2) tests in mknod.cc +// which are there to avoid running the tests on a number of different +// filesystems which may not support mknod. 
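For reference, the zero-fill behavior FtruncateShrinkGrow checks above, written as plain POSIX calls; the path, sizes, and helper name are illustrative, and error paths simply bail out:

#include <fcntl.h>
#include <unistd.h>

#include <cstring>

// Writes ten 'a' bytes, truncates to five, grows back to ten, and verifies
// that the regrown tail reads back as zeroes. Returns 0 on success.
int ShrinkGrowZeroFills(const char* path) {
  int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
  if (fd < 0) return -1;

  char data[10];
  std::memset(data, 'a', sizeof(data));
  if (write(fd, data, sizeof(data)) != static_cast<ssize_t>(sizeof(data)) ||
      ftruncate(fd, 5) != 0 || ftruncate(fd, 10) != 0) {
    close(fd);
    return -1;
  }

  char back[10] = {};
  if (pread(fd, back, sizeof(back), 0) != static_cast<ssize_t>(sizeof(back))) {
    close(fd);
    return -1;
  }
  close(fd);

  // The first half keeps its data; the portion restored by the second
  // ftruncate is implicitly zero-filled.
  return std::memcmp(back, "aaaaa\0\0\0\0\0", 10) == 0 ? 0 : -1;
}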
+ +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/udp_bind.cc b/test/syscalls/linux/udp_bind.cc new file mode 100644 index 000000000..419aaac76 --- /dev/null +++ b/test/syscalls/linux/udp_bind.cc @@ -0,0 +1,316 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +struct sockaddr_in_common { + sa_family_t sin_family; + in_port_t sin_port; +}; + +struct SendtoTestParam { + // Human readable description of test parameter. + std::string description; + + // Test is broken in gVisor, skip. + bool skip_on_gvisor; + + // Domain for the socket that will do the sending. + int send_domain; + + // Address to bind for the socket that will do the sending. + struct sockaddr_storage send_addr; + socklen_t send_addr_len; // 0 for unbound. + + // Address to connect to for the socket that will do the sending. + struct sockaddr_storage connect_addr; + socklen_t connect_addr_len; // 0 for no connection. + + // Domain for the socket that will do the receiving. + int recv_domain; + + // Address to bind for the socket that will do the receiving. + struct sockaddr_storage recv_addr; + socklen_t recv_addr_len; + + // Address to send to. + struct sockaddr_storage sendto_addr; + socklen_t sendto_addr_len; + + // Expected errno for the sendto call. + std::vector sendto_errnos; // empty on success. +}; + +class SendtoTest : public ::testing::TestWithParam { + protected: + SendtoTest() { + // gUnit uses printf, so so will we. 
+ printf("Testing with %s\n", GetParam().description.c_str()); + } +}; + +TEST_P(SendtoTest, Sendto) { + auto param = GetParam(); + + SKIP_IF(param.skip_on_gvisor && IsRunningOnGvisor()); + + const FileDescriptor s1 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(param.send_domain, SOCK_DGRAM, 0)); + const FileDescriptor s2 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(param.recv_domain, SOCK_DGRAM, 0)); + + if (param.send_addr_len > 0) { + ASSERT_THAT(bind(s1.get(), reinterpret_cast(¶m.send_addr), + param.send_addr_len), + SyscallSucceeds()); + } + + if (param.connect_addr_len > 0) { + ASSERT_THAT( + connect(s1.get(), reinterpret_cast(¶m.connect_addr), + param.connect_addr_len), + SyscallSucceeds()); + } + + ASSERT_THAT(bind(s2.get(), reinterpret_cast(¶m.recv_addr), + param.recv_addr_len), + SyscallSucceeds()); + + struct sockaddr_storage real_recv_addr = {}; + socklen_t real_recv_addr_len = param.recv_addr_len; + ASSERT_THAT( + getsockname(s2.get(), reinterpret_cast(&real_recv_addr), + &real_recv_addr_len), + SyscallSucceeds()); + + ASSERT_EQ(real_recv_addr_len, param.recv_addr_len); + + int recv_port = + reinterpret_cast(&real_recv_addr)->sin_port; + + struct sockaddr_storage sendto_addr = param.sendto_addr; + reinterpret_cast(&sendto_addr)->sin_port = recv_port; + + char buf[20] = {}; + if (!param.sendto_errnos.empty()) { + ASSERT_THAT(RetryEINTR(sendto)(s1.get(), buf, sizeof(buf), 0, + reinterpret_cast(&sendto_addr), + param.sendto_addr_len), + SyscallFailsWithErrno(ElementOf(param.sendto_errnos))); + return; + } + + ASSERT_THAT(RetryEINTR(sendto)(s1.get(), buf, sizeof(buf), 0, + reinterpret_cast(&sendto_addr), + param.sendto_addr_len), + SyscallSucceedsWithValue(sizeof(buf))); + + struct sockaddr_storage got_addr = {}; + socklen_t got_addr_len = sizeof(sockaddr_storage); + ASSERT_THAT(RetryEINTR(recvfrom)(s2.get(), buf, sizeof(buf), 0, + reinterpret_cast(&got_addr), + &got_addr_len), + SyscallSucceedsWithValue(sizeof(buf))); + + ASSERT_GT(got_addr_len, sizeof(sockaddr_in_common)); + int got_port = reinterpret_cast(&got_addr)->sin_port; + + struct sockaddr_storage sender_addr = {}; + socklen_t sender_addr_len = sizeof(sockaddr_storage); + ASSERT_THAT(getsockname(s1.get(), reinterpret_cast(&sender_addr), + &sender_addr_len), + SyscallSucceeds()); + + ASSERT_GT(sender_addr_len, sizeof(sockaddr_in_common)); + int sender_port = + reinterpret_cast(&sender_addr)->sin_port; + + EXPECT_EQ(got_port, sender_port); +} + +socklen_t Ipv4Addr(sockaddr_storage* addr, int port = 0) { + auto addr4 = reinterpret_cast(addr); + addr4->sin_family = AF_INET; + addr4->sin_port = port; + inet_pton(AF_INET, "127.0.0.1", &addr4->sin_addr.s_addr); + return sizeof(struct sockaddr_in); +} + +socklen_t Ipv6Addr(sockaddr_storage* addr, int port = 0) { + auto addr6 = reinterpret_cast(addr); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = port; + inet_pton(AF_INET6, "::1", &addr6->sin6_addr.s6_addr); + return sizeof(struct sockaddr_in6); +} + +socklen_t Ipv4MappedIpv6Addr(sockaddr_storage* addr, int port = 0) { + auto addr6 = reinterpret_cast(addr); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = port; + inet_pton(AF_INET6, "::ffff:127.0.0.1", &addr6->sin6_addr.s6_addr); + return sizeof(struct sockaddr_in6); +} + +INSTANTIATE_TEST_CASE_P( + UdpBindTest, SendtoTest, + ::testing::Values( + []() { + SendtoTestParam param = {}; + param.description = "IPv4 mapped IPv6 sendto IPv4 mapped IPv6"; + param.send_domain = AF_INET6; + param.send_addr_len = Ipv4MappedIpv6Addr(¶m.send_addr); + param.recv_domain = AF_INET6; + 
param.recv_addr_len = Ipv4MappedIpv6Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4MappedIpv6Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "IPv6 sendto IPv6"; + param.send_domain = AF_INET6; + param.send_addr_len = Ipv6Addr(¶m.send_addr); + param.recv_domain = AF_INET6; + param.recv_addr_len = Ipv6Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv6Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "IPv4 sendto IPv4"; + param.send_domain = AF_INET; + param.send_addr_len = Ipv4Addr(¶m.send_addr); + param.recv_domain = AF_INET; + param.recv_addr_len = Ipv4Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "IPv4 mapped IPv6 sendto IPv4"; + param.send_domain = AF_INET6; + param.send_addr_len = Ipv4MappedIpv6Addr(¶m.send_addr); + param.recv_domain = AF_INET; + param.recv_addr_len = Ipv4Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4MappedIpv6Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "IPv4 sendto IPv4 mapped IPv6"; + param.send_domain = AF_INET; + param.send_addr_len = Ipv4Addr(¶m.send_addr); + param.recv_domain = AF_INET6; + param.recv_addr_len = Ipv4MappedIpv6Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "unbound IPv6 sendto IPv4 mapped IPv6"; + param.send_domain = AF_INET6; + param.recv_domain = AF_INET6; + param.recv_addr_len = Ipv4MappedIpv6Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4MappedIpv6Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "unbound IPv6 sendto IPv4"; + param.send_domain = AF_INET6; + param.recv_domain = AF_INET; + param.recv_addr_len = Ipv4Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4MappedIpv6Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "IPv6 sendto IPv4"; + param.send_domain = AF_INET6; + param.send_addr_len = Ipv6Addr(¶m.send_addr); + param.recv_domain = AF_INET; + param.recv_addr_len = Ipv4Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4MappedIpv6Addr(¶m.sendto_addr); + param.sendto_errnos = {ENETUNREACH}; + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "IPv4 mapped IPv6 sendto IPv6"; + param.send_domain = AF_INET6; + param.send_addr_len = Ipv4MappedIpv6Addr(¶m.send_addr); + param.recv_domain = AF_INET6; + param.recv_addr_len = Ipv6Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv6Addr(¶m.sendto_addr); + param.sendto_errnos = {EAFNOSUPPORT}; + // The errno returned changed in Linux commit c8e6ad0829a723. + param.sendto_errnos = {EINVAL, EAFNOSUPPORT}; + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "connected IPv4 mapped IPv6 sendto IPv6"; + param.send_domain = AF_INET6; + param.connect_addr_len = + Ipv4MappedIpv6Addr(¶m.connect_addr, 5000); + param.recv_domain = AF_INET6; + param.recv_addr_len = Ipv6Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv6Addr(¶m.sendto_addr); + // The errno returned changed in Linux commit c8e6ad0829a723. 
+ param.sendto_errnos = {EINVAL, EAFNOSUPPORT}; + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "connected IPv6 sendto IPv4 mapped IPv6"; + // TODO: Determine if this inconsistent behavior is worth + // implementing. + param.skip_on_gvisor = true; + param.send_domain = AF_INET6; + param.connect_addr_len = Ipv6Addr(¶m.connect_addr, 5000); + param.recv_domain = AF_INET6; + param.recv_addr_len = Ipv4MappedIpv6Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4MappedIpv6Addr(¶m.sendto_addr); + return param; + }(), + []() { + SendtoTestParam param = {}; + param.description = "connected IPv6 sendto IPv4"; + // TODO: Determine if this inconsistent behavior is worth + // implementing. + param.skip_on_gvisor = true; + param.send_domain = AF_INET6; + param.connect_addr_len = Ipv6Addr(¶m.connect_addr, 5000); + param.recv_domain = AF_INET; + param.recv_addr_len = Ipv4Addr(¶m.recv_addr); + param.sendto_addr_len = Ipv4MappedIpv6Addr(¶m.sendto_addr); + return param; + }())); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc new file mode 100644 index 000000000..a02b418a3 --- /dev/null +++ b/test/syscalls/linux/udp_socket.cc @@ -0,0 +1,941 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/base/macros.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// The initial port to be be used on gvisor. +constexpr int TestPort = 40000; + +// Fixture for tests parameterized by the address family to use (AF_INET and +// AF_INET6) when creating sockets. +class UdpSocketTest : public ::testing::TestWithParam { + protected: + // Creates two sockets that will be used by test cases. + void SetUp() override; + + // Closes the sockets created by SetUp(). + void TearDown() override { + EXPECT_THAT(close(s_), SyscallSucceeds()); + EXPECT_THAT(close(t_), SyscallSucceeds()); + + for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { + ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i])); + } + } + + // First UDP socket. + int s_; + + // Second UDP socket. + int t_; + + // The length of the socket address. + socklen_t addrlen_; + + // Initialized address pointing to loopback and port TestPort+i. + struct sockaddr* addr_[3]; + + // Initialize "any" address. + struct sockaddr* anyaddr_; + + // Used ports. + int ports_[3]; + + private: + // Storage for the loopback addresses. + struct sockaddr_storage addr_storage_[3]; + + // Storage for the "any" address. + struct sockaddr_storage anyaddr_storage_; +}; + +// Gets a pointer to the port component of the given address. 
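A short aside before the Port() helper defined next: the Ipv4MappedIpv6Addr cases in the matrix above work because an IPv4-mapped address is an ordinary sockaddr_in6 whose address is ::ffff:a.b.c.d. A sketch of building one by hand (the helper name is illustrative):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#include <cstdint>
#include <cstdio>
#include <cstring>

// Fills `addr` with the IPv4-mapped IPv6 form of dotted-quad `ipv4` on `port`
// (host byte order), e.g. "127.0.0.1" becomes ::ffff:127.0.0.1. Returns the
// length to pass to bind/connect/sendto, or 0 on failure.
socklen_t MakeV4MappedV6(const char* ipv4, uint16_t port,
                         struct sockaddr_storage* addr) {
  std::memset(addr, 0, sizeof(*addr));
  auto* a6 = reinterpret_cast<struct sockaddr_in6*>(addr);
  a6->sin6_family = AF_INET6;
  a6->sin6_port = htons(port);

  char mapped[INET6_ADDRSTRLEN];
  std::snprintf(mapped, sizeof(mapped), "::ffff:%s", ipv4);
  if (inet_pton(AF_INET6, mapped, &a6->sin6_addr) != 1) {
    return 0;
  }
  return sizeof(struct sockaddr_in6);
}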
+uint16_t* Port(struct sockaddr_storage* addr) { + switch (addr->ss_family) { + case AF_INET: { + auto sin = reinterpret_cast(addr); + return &sin->sin_port; + } + case AF_INET6: { + auto sin6 = reinterpret_cast(addr); + return &sin6->sin6_port; + } + } + + return nullptr; +} + +void UdpSocketTest::SetUp() { + ASSERT_THAT(s_ = socket(GetParam(), SOCK_DGRAM, IPPROTO_UDP), + SyscallSucceeds()); + + ASSERT_THAT(t_ = socket(GetParam(), SOCK_DGRAM, IPPROTO_UDP), + SyscallSucceeds()); + + memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_)); + anyaddr_ = reinterpret_cast(&anyaddr_storage_); + anyaddr_->sa_family = GetParam(); + + // Initialize address-family-specific values. + switch (GetParam()) { + case AF_INET: { + auto sin = reinterpret_cast(&anyaddr_storage_); + addrlen_ = sizeof(*sin); + sin->sin_addr.s_addr = htonl(INADDR_ANY); + break; + } + case AF_INET6: { + auto sin6 = reinterpret_cast(&anyaddr_storage_); + addrlen_ = sizeof(*sin6); + sin6->sin6_addr = in6addr_any; + break; + } + } + + if (gvisor::testing::IsRunningOnGvisor()) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { + ports_[i] = TestPort + i; + } + } else { + // When not under gvisor, use utility function to pick port. Assert that + // all ports are different. + std::string error; + for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { + // Find an unused port, we specify port 0 to allow the kernel to provide + // the port. + bool unique = true; + do { + ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable( + 0, AddressFamily::kDualStack, SocketType::kUdp, false)); + ASSERT_GT(ports_[i], 0); + for (size_t j = 0; j < i; ++j) { + if (ports_[j] == ports_[i]) { + unique = false; + break; + } + } + } while (!unique); + } + } + + // Initialize the sockaddrs. + for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) { + memset(&addr_storage_[i], 0, sizeof(addr_storage_[i])); + + addr_[i] = reinterpret_cast(&addr_storage_[i]); + addr_[i]->sa_family = GetParam(); + + switch (GetParam()) { + case AF_INET: { + auto sin = reinterpret_cast(addr_[i]); + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + sin->sin_port = htons(ports_[i]); + break; + } + case AF_INET6: { + auto sin6 = reinterpret_cast(addr_[i]); + sin6->sin6_addr = in6addr_loopback; + sin6->sin6_port = htons(ports_[i]); + break; + } + } + } +} + +TEST_P(UdpSocketTest, Creation) { + int s_; + + ASSERT_THAT(s_ = socket(GetParam(), SOCK_DGRAM, IPPROTO_UDP), + SyscallSucceeds()); + EXPECT_THAT(close(s_), SyscallSucceeds()); + + ASSERT_THAT(s_ = socket(GetParam(), SOCK_DGRAM, 0), SyscallSucceeds()); + EXPECT_THAT(close(s_), SyscallSucceeds()); + + ASSERT_THAT(s_ = socket(GetParam(), SOCK_STREAM, IPPROTO_UDP), + SyscallFails()); +} + +TEST_P(UdpSocketTest, Getsockname) { + // Check that we're not bound. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0); + + // Bind, then check that we get the right address. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); +} + +TEST_P(UdpSocketTest, Getpeername) { + // Check that we're not connected. 
+ struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); + + // Connect, then check that we get the right address. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); +} + +TEST_P(UdpSocketTest, SendNotConnected) { + // Do send & write, they must fail. + char buf[512]; + EXPECT_THAT(send(s_, buf, sizeof(buf), 0), + SyscallFailsWithErrno(EDESTADDRREQ)); + + EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ)); + + // Use sendto. + ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Check that we're bound now. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_NE(*Port(&addr), 0); +} + +TEST_P(UdpSocketTest, ConnectBinds) { + // Connect the socket. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Check that we're bound now. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_NE(*Port(&addr), 0); +} + +TEST_P(UdpSocketTest, ReceiveNotBound) { + char buf[512]; + EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, Bind) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Try to bind again. + EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL)); + + // Check that we're still bound to the original address. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); +} + +TEST_P(UdpSocketTest, BindInUse) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Try to bind again. + EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE)); +} + +TEST_P(UdpSocketTest, ReceiveAfterConnect) { + // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds()); + + // Get the address s_ was bound to during connect. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + + // Send from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, + reinterpret_cast(&addr), addrlen), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, Connect) { + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Check that we're connected to the right peer. 
+ struct sockaddr_storage peer; + socklen_t peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&peer), &peerlen), + SyscallSucceeds()); + EXPECT_EQ(peerlen, addrlen_); + EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0); + + // Try to bind after connect. + EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL)); + + // Try to connect again. + EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + // Check that peer name changed. + peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&peer), &peerlen), + SyscallSucceeds()); + EXPECT_EQ(peerlen, addrlen_); + EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); +} + +TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) { + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send to a different destination than we're connected to. + char buf[512]; + EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); +} + +TEST_P(UdpSocketTest, ZerolengthWriteAllowed) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + char buf[3]; + // Send zero length packet from s_ to t_. + ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); + // Receive the packet. + char received[3]; + EXPECT_THAT(read(t_, received, sizeof(received)), + SyscallSucceedsWithValue(0)); +} + +TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Set t_ to non-blocking. + int opts = 0; + ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds()); + + char buf[3]; + // Send zero length packet from s_ to t_. + ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); + // Receive the packet. + char received[3]; + EXPECT_THAT(read(t_, received, sizeof(received)), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(read(t_, received, sizeof(received)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(UdpSocketTest, SendAndReceiveNotConnected) { + // Bind s_ to loopback. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send some data to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, SendAndReceiveConnected) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. 
+ char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, ReceiveFromNotConnected) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+2. + ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. + char buf[512]; + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Check that the data isn't_ received because it was sent from a different + // address than we're connected. + EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, ReceiveBeforeConnect) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+2. + ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Connect to loopback:TestPort+1. + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Receive the data. It works because it was sent before the connect. + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); + + // Send again. This time it should not be received. + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, ReceiveFrom) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data and sender address. + char received[512]; + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0, + reinterpret_cast(&addr), &addrlen), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0); +} + +TEST_P(UdpSocketTest, Listen) { + ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP)); +} + +TEST_P(UdpSocketTest, Accept) { + ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP)); +} + +// This test validates that a read shutdown with pending data allows the read +// to proceed with the data before returning EAGAIN. 
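The connected-UDP tests above hinge on three effects of connect() on a datagram socket: it picks a local port, it fixes the default destination for send(), and it filters out datagrams from any other source. A condensed sketch of the filtering effect, with error checking trimmed (helper name illustrative); the read-shutdown test introduced above follows:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

// Returns 0 if a datagram from an unrelated socket is not delivered to a
// connected UDP socket, mirroring ReceiveFromNotConnected above.
int ConnectedUdpFilters() {
  int receiver = socket(AF_INET, SOCK_DGRAM, 0);
  int peer = socket(AF_INET, SOCK_DGRAM, 0);      // receiver connects here.
  int stranger = socket(AF_INET, SOCK_DGRAM, 0);  // unrelated sender.
  if (receiver < 0 || peer < 0 || stranger < 0) return -1;

  struct sockaddr_in loop = {};
  loop.sin_family = AF_INET;
  loop.sin_addr.s_addr = htonl(INADDR_LOOPBACK);  // Port 0: kernel chooses.
  bind(receiver, reinterpret_cast<struct sockaddr*>(&loop), sizeof(loop));
  bind(peer, reinterpret_cast<struct sockaddr*>(&loop), sizeof(loop));

  // Connect receiver to peer's chosen address.
  struct sockaddr_in peer_addr = {};
  socklen_t len = sizeof(peer_addr);
  getsockname(peer, reinterpret_cast<struct sockaddr*>(&peer_addr), &len);
  connect(receiver, reinterpret_cast<struct sockaddr*>(&peer_addr), len);

  // A datagram from `stranger` hits the right port but the wrong source.
  struct sockaddr_in recv_addr = {};
  len = sizeof(recv_addr);
  getsockname(receiver, reinterpret_cast<struct sockaddr*>(&recv_addr), &len);
  char byte = 'x';
  sendto(stranger, &byte, 1, 0, reinterpret_cast<struct sockaddr*>(&recv_addr),
         len);

  // The connected socket never sees it: a non-blocking read finds nothing.
  char out;
  ssize_t n = recv(receiver, &out, 1, MSG_DONTWAIT);
  int result = n < 0 ? 0 : -1;
  close(receiver);
  close(peer);
  close(stranger);
  return result;
}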
+TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) { + char received[512]; + + // Bind t_ to loopback:TestPort+2. + ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Connect the socket, then try to shutdown again. + ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + // Verify that we get EWOULDBLOCK when there is nothing to read. + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + const char* buf = "abc"; + EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3)); + + int opts = 0; + ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds()); + ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); + ASSERT_NE(opts & O_NONBLOCK, 0); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + // We should get the data even though read has been shutdown. + EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2)); + + // Because we read less than the entire packet length, since it's a packet + // based socket any subsequent reads should return EWOULDBLOCK. + EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK)); +} + +// This test is validating that even after a socket is shutdown if it's +// reconnected it will reset the shutdown state. +TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) { + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Connect the socket, then try to shutdown again. + ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, ReadShutdown) { + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Connect the socket, then try to shutdown again. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(0)); +} + +TEST_P(UdpSocketTest, ReadShutdownDifferentThread) { + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Connect the socket, then shutdown from another thread. 
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + ScopedThread t([&] { + absl::SleepFor(absl::Milliseconds(200)); + EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds()); + }); + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(0)); + t.Join(); + + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(0)); +} + +TEST_P(UdpSocketTest, WriteShutdown) { + EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds()); +} + +TEST_P(UdpSocketTest, SynchronousReceive) { + // Bind s_ to loopback. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send some data to s_ from another thread. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + // Receive the data prior to actually starting the other thread. + char received[512]; + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Start the thread. + ScopedThread t([&] { + absl::SleepFor(absl::Milliseconds(200)); + ASSERT_THAT( + sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + }); + + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(512)); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send 3 packets from t_ to s_. + constexpr int psize = 100; + char buf[3 * psize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 3; ++i) { + ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(psize)); + } + + // Receive the data as 3 separate packets. + char received[6 * psize]; + for (int i = 0; i < 3; ++i) { + EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0), + SyscallSucceedsWithValue(psize)); + } + EXPECT_EQ(memcmp(buf, received, 3 * psize), 0); +} + +TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Direct writes from t_ to s_. + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send 2 packets from t_ to s_, where each packet's data consists of 2 + // discontiguous iovecs. + constexpr size_t kPieceSize = 100; + char buf[4 * kPieceSize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 2; i++) { + struct iovec iov[2]; + for (int j = 0; j < 2; j++) { + iov[j].iov_base = reinterpret_cast( + reinterpret_cast(buf) + (i + 2 * j) * kPieceSize); + iov[j].iov_len = kPieceSize; + } + ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize)); + } + + // Receive the data as 2 separate packets. 
+  char received[6 * kPieceSize];
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[3];
+    for (int j = 0; j < 3; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send 2 packets from t_ to s_, where each packet's data consists of 2
+  // discontiguous iovecs.
+  constexpr size_t kPieceSize = 100;
+  char buf[4 * kPieceSize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[2];
+    for (int j = 0; j < 2; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    struct msghdr msg = {};
+    msg.msg_name = addr_[0];
+    msg.msg_namelen = addrlen_;
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 2;
+    ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+
+  // Receive the data as 2 separate packets.
+  char received[6 * kPieceSize];
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[3];
+    for (int j = 0; j < 3; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    struct msghdr msg = {};
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 3;
+    ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  const char str[] = "abc";
+  ASSERT_THAT(send(s_, str, sizeof(str), 0),
+              SyscallSucceedsWithValue(sizeof(str)));
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, sizeof(str));
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, sizeof(str));
+}
+
+TEST_P(UdpSocketTest, FIONREAD) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that the bound socket with an empty buffer reports an empty first
+  // packet.
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Send 3 packets from t_ to s_.
+ constexpr int psize = 100; + char buf[3 * psize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 3; ++i) { + ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(psize)); + + // Check that regardless of how many packets are in the queue, the size + // reported is that of a single packet. + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, psize); + } +} + +TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Check that the bound socket with an empty buffer reports an empty first + // packet. + int n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + // Send 3 packets from t_ to s_. + constexpr int psize = 100; + char buf[3 * psize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 3; ++i) { + ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(0)); + + // Check that regardless of how many packets are in the queue, the size + // reported is that of a single packet. + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + } +} + +TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) { + int n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // A UDP socket must be connected before it can be shutdown. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + const char str[] = "abc"; + ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0)); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); +} + +TEST_P(UdpSocketTest, ErrorQueue) { + char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))]; + msghdr msg; + memset(&msg, 0, sizeof(msg)); + iovec iov; + memset(&iov, 0, sizeof(iov)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT. + EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(UdpSocketTest, SoTimestamp) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + int v = 1; + EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), + SyscallSucceeds()); + + char buf[3]; + // Send zero length packet from t_ to s_. 
+ ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + + char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; + msghdr msg; + memset(&msg, 0, sizeof(msg)); + iovec iov; + memset(&iov, 0, sizeof(iov)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0)); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval))); + + struct timeval tv = {}; + memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval)); + + ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); +} + +TEST_P(UdpSocketTest, WriteShutdownNotConnected) { + EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); +} + +INSTANTIATE_TEST_CASE_P(AllInetTests, UdpSocketTest, + ::testing::Values(AF_INET, AF_INET6)); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc new file mode 100644 index 000000000..c0c1f2960 --- /dev/null +++ b/test/syscalls/linux/uidgid.cc @@ -0,0 +1,277 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "test/util/capability_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +DEFINE_int32(scratch_uid1, 65534, "first scratch UID"); +DEFINE_int32(scratch_uid2, 65533, "second scratch UID"); +DEFINE_int32(scratch_gid1, 65534, "first scratch GID"); +DEFINE_int32(scratch_gid2, 65533, "second scratch GID"); + +using ::testing::UnorderedElementsAreArray; + +namespace gvisor { +namespace testing { + +namespace { + +TEST(UidGidTest, Getuid) { + uid_t ruid, euid, suid; + EXPECT_THAT(getresuid(&ruid, &euid, &suid), SyscallSucceeds()); + EXPECT_THAT(getuid(), SyscallSucceedsWithValue(ruid)); + EXPECT_THAT(geteuid(), SyscallSucceedsWithValue(euid)); +} + +TEST(UidGidTest, Getgid) { + gid_t rgid, egid, sgid; + EXPECT_THAT(getresgid(&rgid, &egid, &sgid), SyscallSucceeds()); + EXPECT_THAT(getgid(), SyscallSucceedsWithValue(rgid)); + EXPECT_THAT(getegid(), SyscallSucceedsWithValue(egid)); +} + +TEST(UidGidTest, Getgroups) { + // "If size is zero, list is not modified, but the total number of + // supplementary group IDs for the process is returned." - getgroups(2) + int nr_groups; + ASSERT_THAT(nr_groups = getgroups(0, nullptr), SyscallSucceeds()); + std::vector list(nr_groups); + EXPECT_THAT(getgroups(list.size(), list.data()), SyscallSucceeds()); + + // "EINVAL: size is less than the number of supplementary group IDs, but is + // not zero." 
+ EXPECT_THAT(getgroups(-1, nullptr), SyscallFailsWithErrno(EINVAL)); + + // Testing for EFAULT requires actually having groups, which isn't guaranteed + // here; see the setgroups test below. +} + +// If the caller's real/effective/saved user/group IDs are all 0, IsRoot returns +// true. Otherwise IsRoot logs an explanatory message and returns false. +PosixErrorOr IsRoot() { + uid_t ruid, euid, suid; + int rc = getresuid(&ruid, &euid, &suid); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "getresuid"); + } + if (ruid != 0 || euid != 0 || suid != 0) { + return false; + } + gid_t rgid, egid, sgid; + rc = getresgid(&rgid, &egid, &sgid); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "getresgid"); + } + if (rgid != 0 || egid != 0 || sgid != 0) { + return false; + } + return true; +} + +// Checks that the calling process' real/effective/saved user IDs are +// ruid/euid/suid respectively. +PosixError CheckUIDs(uid_t ruid, uid_t euid, uid_t suid) { + uid_t actual_ruid, actual_euid, actual_suid; + int rc = getresuid(&actual_ruid, &actual_euid, &actual_suid); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "getresuid"); + } + if (ruid != actual_ruid || euid != actual_euid || suid != actual_suid) { + return PosixError( + EPERM, absl::StrCat( + "incorrect user IDs: got (", + absl::StrJoin({actual_ruid, actual_euid, actual_suid}, ", "), + ", wanted (", absl::StrJoin({ruid, euid, suid}, ", "), ")")); + } + return NoError(); +} + +PosixError CheckGIDs(gid_t rgid, gid_t egid, gid_t sgid) { + gid_t actual_rgid, actual_egid, actual_sgid; + int rc = getresgid(&actual_rgid, &actual_egid, &actual_sgid); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "getresgid"); + } + if (rgid != actual_rgid || egid != actual_egid || sgid != actual_sgid) { + return PosixError( + EPERM, absl::StrCat( + "incorrect group IDs: got (", + absl::StrJoin({actual_rgid, actual_egid, actual_sgid}, ", "), + ", wanted (", absl::StrJoin({rgid, egid, sgid}, ", "), ")")); + } + return NoError(); +} + +// N.B. These tests may break horribly unless run via a gVisor test runner, +// because changing UID in one test may forfeit permissions required by other +// tests. (The test runner runs each test in a separate process.) + +TEST(UidGidRootTest, Setuid) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); + + // Do setuid in a separate thread so that after finishing this test, the + // process can still open files the test harness created before starting this + // test. Otherwise, the files are created by root (UID before the test), but + // cannot be opened by the `uid` set below after the test. After calling + // setuid(non-zero-UID), there is no way to get root privileges back. + ScopedThread([&] { + // Use syscall instead of glibc setuid wrapper because we want this setuid + // call to only apply to this task. POSIX threads, however, require that all + // threads have the same UIDs, so using the setuid wrapper sets all threads' + // real UID. + EXPECT_THAT(syscall(SYS_setuid, -1), SyscallFailsWithErrno(EINVAL)); + + const uid_t uid = FLAGS_scratch_uid1; + EXPECT_THAT(syscall(SYS_setuid, uid), SyscallSucceeds()); + // "If the effective UID of the caller is root (more precisely: if the + // caller has the CAP_SETUID capability), the real UID and saved set-user-ID + // are also set." 
- setuid(2) + EXPECT_NO_ERRNO(CheckUIDs(uid, uid, uid)); + }); +} + +TEST(UidGidRootTest, Setgid) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); + + EXPECT_THAT(setgid(-1), SyscallFailsWithErrno(EINVAL)); + + const gid_t gid = FLAGS_scratch_gid1; + ASSERT_THAT(setgid(gid), SyscallSucceeds()); + EXPECT_NO_ERRNO(CheckGIDs(gid, gid, gid)); +} + +TEST(UidGidRootTest, SetgidNotFromThreadGroupLeader) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); + + const gid_t gid = FLAGS_scratch_gid1; + // NOTE: Do setgid in a separate thread so that we can test if + // info.si_pid is set correctly. + ScopedThread([gid] { ASSERT_THAT(setgid(gid), SyscallSucceeds()); }); + EXPECT_NO_ERRNO(CheckGIDs(gid, gid, gid)); +} + +TEST(UidGidRootTest, Setreuid) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); + + // "Supplying a value of -1 for either the real or effective user ID forces + // the system to leave that ID unchanged." - setreuid(2) + EXPECT_THAT(setreuid(-1, -1), SyscallSucceeds()); + EXPECT_NO_ERRNO(CheckUIDs(0, 0, 0)); + + // Do setuid in a separate thread so that after finishing this test, the + // process can still open files the test harness created before starting this + // test. Otherwise, the files are created by root (UID before the test), but + // cannot be opened by the `uid` set below after the test. After calling + // setuid(non-zero-UID), there is no way to get root privileges back. + ScopedThread([&] { + const uid_t ruid = FLAGS_scratch_uid1; + const uid_t euid = FLAGS_scratch_uid2; + + // Use syscall instead of glibc setuid wrapper because we want this setuid + // call to only apply to this task. posix threads, however, require that all + // threads have the same UIDs, so using the setuid wrapper sets all threads' + // real UID. + EXPECT_THAT(syscall(SYS_setreuid, ruid, euid), SyscallSucceeds()); + + // "If the real user ID is set or the effective user ID is set to a value + // not equal to the previous real user ID, the saved set-user-ID will be set + // to the new effective user ID." - setreuid(2) + EXPECT_NO_ERRNO(CheckUIDs(ruid, euid, euid)); + }); +} + +TEST(UidGidRootTest, Setregid) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); + + EXPECT_THAT(setregid(-1, -1), SyscallSucceeds()); + EXPECT_NO_ERRNO(CheckGIDs(0, 0, 0)); + + const gid_t rgid = FLAGS_scratch_gid1; + const gid_t egid = FLAGS_scratch_gid2; + ASSERT_THAT(setregid(rgid, egid), SyscallSucceeds()); + EXPECT_NO_ERRNO(CheckGIDs(rgid, egid, egid)); +} + +TEST(UidGidRootTest, Setresuid) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); + + // "If one of the arguments equals -1, the corresponding value is not + // changed." - setresuid(2) + EXPECT_THAT(setresuid(-1, -1, -1), SyscallSucceeds()); + EXPECT_NO_ERRNO(CheckUIDs(0, 0, 0)); + + // Do setuid in a separate thread so that after finishing this test, the + // process can still open files the test harness created before starting this + // test. Otherwise, the files are created by root (UID before the test), but + // cannot be opened by the `uid` set below after the test. After calling + // setuid(non-zero-UID), there is no way to get root privileges back. + ScopedThread([&] { + const uid_t ruid = 12345; + const uid_t euid = 23456; + const uid_t suid = 34567; + + // Use syscall instead of glibc setuid wrapper because we want this setuid + // call to only apply to this task. posix threads, however, require that all + // threads have the same UIDs, so using the setuid wrapper sets all threads' + // real UID. 
+    EXPECT_THAT(syscall(SYS_setresuid, ruid, euid, suid), SyscallSucceeds());
+    EXPECT_NO_ERRNO(CheckUIDs(ruid, euid, suid));
+  });
+}
+
+TEST(UidGidRootTest, Setresgid) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+  EXPECT_THAT(setresgid(-1, -1, -1), SyscallSucceeds());
+  EXPECT_NO_ERRNO(CheckGIDs(0, 0, 0));
+
+  const gid_t rgid = 12345;
+  const gid_t egid = 23456;
+  const gid_t sgid = 34567;
+  ASSERT_THAT(setresgid(rgid, egid, sgid), SyscallSucceeds());
+  EXPECT_NO_ERRNO(CheckGIDs(rgid, egid, sgid));
+}
+
+TEST(UidGidRootTest, Setgroups) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+  std::vector<gid_t> list = {123, 500};
+  ASSERT_THAT(setgroups(list.size(), list.data()), SyscallSucceeds());
+  std::vector<gid_t> list2(list.size());
+  ASSERT_THAT(getgroups(list2.size(), list2.data()), SyscallSucceeds());
+  EXPECT_THAT(list, UnorderedElementsAreArray(list2));
+
+  // "EFAULT: list has an invalid address."
+  EXPECT_THAT(getgroups(100, reinterpret_cast<gid_t*>(-1)),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/uname.cc b/test/syscalls/linux/uname.cc
new file mode 100644
index 000000000..d22a34bd7
--- /dev/null
+++ b/test/syscalls/linux/uname.cc
@@ -0,0 +1,99 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(UnameTest, Sanity) {
+  struct utsname buf;
+  ASSERT_THAT(uname(&buf), SyscallSucceeds());
+  EXPECT_NE(strlen(buf.release), 0);
+  EXPECT_NE(strlen(buf.version), 0);
+  EXPECT_NE(strlen(buf.machine), 0);
+  EXPECT_NE(strlen(buf.sysname), 0);
+  EXPECT_NE(strlen(buf.nodename), 0);
+  EXPECT_NE(strlen(buf.domainname), 0);
+}
+
+TEST(UnameTest, SetNames) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  constexpr char kHostname[] = "wubbalubba";
+  ASSERT_THAT(sethostname(kHostname, sizeof(kHostname)), SyscallSucceeds());
+
+  constexpr char kDomainname[] = "dubdub.com";
+  ASSERT_THAT(setdomainname(kDomainname, sizeof(kDomainname)),
+              SyscallSucceeds());
+
+  struct utsname buf;
+  EXPECT_THAT(uname(&buf), SyscallSucceeds());
+  EXPECT_EQ(absl::string_view(buf.nodename), kHostname);
+  EXPECT_EQ(absl::string_view(buf.domainname), kDomainname);
+
+  // These should just be glibc wrappers that also call uname(2).
+ char hostname[65]; + EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds()); + EXPECT_EQ(absl::string_view(hostname), kHostname); + + char domainname[65]; + EXPECT_THAT(getdomainname(domainname, sizeof(domainname)), SyscallSucceeds()); + EXPECT_EQ(absl::string_view(domainname), kDomainname); +} + +TEST(UnameTest, UnprivilegedSetNames) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))) { + EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false)); + } + + EXPECT_THAT(sethostname("", 0), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(setdomainname("", 0), SyscallFailsWithErrno(EPERM)); +} + +TEST(UnameTest, UnshareUTS) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + struct utsname init; + ASSERT_THAT(uname(&init), SyscallSucceeds()); + + ScopedThread([&]() { + EXPECT_THAT(unshare(CLONE_NEWUTS), SyscallSucceeds()); + + constexpr char kHostname[] = "wubbalubba"; + EXPECT_THAT(sethostname(kHostname, sizeof(kHostname)), SyscallSucceeds()); + + char hostname[65]; + EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds()); + }); + + struct utsname after; + EXPECT_THAT(uname(&after), SyscallSucceeds()); + EXPECT_EQ(absl::string_view(after.nodename), init.nodename); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc new file mode 100644 index 000000000..2d7a530b9 --- /dev/null +++ b/test/syscalls/linux/unix_domain_socket_test_util.cc @@ -0,0 +1,346 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/unix_domain_socket_test_util.h" + +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::string DescribeUnixDomainSocketType(int type) { + const char* type_str = nullptr; + switch (type & ~(SOCK_NONBLOCK | SOCK_CLOEXEC)) { + case SOCK_STREAM: + type_str = "SOCK_STREAM"; + break; + case SOCK_DGRAM: + type_str = "SOCK_DGRAM"; + break; + case SOCK_SEQPACKET: + type_str = "SOCK_SEQPACKET"; + break; + } + if (!type_str) { + return absl::StrCat("Unix domain socket with unknown type ", type); + } else { + return absl::StrCat(((type & SOCK_NONBLOCK) != 0) ? "non-blocking " : "", + ((type & SOCK_CLOEXEC) != 0) ? 
"close-on-exec " : "", + type_str, " Unix domain socket"); + } +} + +SocketPairKind UnixDomainSocketPair(int type) { + return SocketPairKind{DescribeUnixDomainSocketType(type), + SyscallSocketPairCreator(AF_UNIX, type, 0)}; +} + +SocketPairKind FilesystemBoundUnixDomainSocketPair(int type) { + std::string description = absl::StrCat(DescribeUnixDomainSocketType(type), + " created with filesystem binding"); + if ((type & SOCK_DGRAM) == SOCK_DGRAM) { + return SocketPairKind{ + description, + FilesystemBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)}; + } + return SocketPairKind{ + description, FilesystemAcceptBindSocketPairCreator(AF_UNIX, type, 0)}; +} + +SocketPairKind AbstractBoundUnixDomainSocketPair(int type) { + std::string description = absl::StrCat(DescribeUnixDomainSocketType(type), + " created with abstract namespace binding"); + if ((type & SOCK_DGRAM) == SOCK_DGRAM) { + return SocketPairKind{ + description, + AbstractBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)}; + } + return SocketPairKind{description, + AbstractAcceptBindSocketPairCreator(AF_UNIX, type, 0)}; +} + +SocketPairKind SocketpairGoferUnixDomainSocketPair(int type) { + std::string description = absl::StrCat(DescribeUnixDomainSocketType(type), + " created with the socketpair gofer"); + return SocketPairKind{description, + SocketpairGoferSocketPairCreator(AF_UNIX, type, 0)}; +} + +SocketPairKind SocketpairGoferFileSocketPair(int type) { + std::string description = + absl::StrCat(((type & O_NONBLOCK) != 0) ? "non-blocking " : "", + ((type & O_CLOEXEC) != 0) ? "close-on-exec " : "", + "file socket created with the socketpair gofer"); + return SocketPairKind{description, + SocketpairGoferFileSocketPairCreator(type)}; +} + +SocketPairKind FilesystemUnboundUnixDomainSocketPair(int type) { + return SocketPairKind{absl::StrCat(DescribeUnixDomainSocketType(type), + " unbound with a filesystem address"), + FilesystemUnboundSocketPairCreator(AF_UNIX, type, 0)}; +} + +SocketPairKind AbstractUnboundUnixDomainSocketPair(int type) { + return SocketPairKind{ + absl::StrCat(DescribeUnixDomainSocketType(type), + " unbound with an abstract namespace address"), + AbstractUnboundSocketPairCreator(AF_UNIX, type, 0)}; +} + +void SendSingleFD(int sock, int fd, char buf[], int buf_size) { + ASSERT_NO_FATAL_FAILURE(SendFDs(sock, &fd, 1, buf, buf_size)); +} + +void SendFDs(int sock, int fds[], int fds_size, char buf[], int buf_size) { + struct msghdr msg = {}; + std::vector control(CMSG_SPACE(fds_size * sizeof(int))); + msg.msg_control = &control[0]; + msg.msg_controllen = control.size(); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fds_size * sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + for (int i = 0; i < fds_size; i++) { + memcpy(CMSG_DATA(cmsg) + i * sizeof(int), &fds[i], sizeof(int)); + } + + ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size), + IsPosixErrorOkAndHolds(buf_size)); +} + +void RecvSingleFD(int sock, int* fd, char buf[], int buf_size) { + ASSERT_NO_FATAL_FAILURE(RecvFDs(sock, fd, 1, buf, buf_size, buf_size)); +} + +void RecvSingleFD(int sock, int* fd, char buf[], int buf_size, + int expected_size) { + ASSERT_NO_FATAL_FAILURE(RecvFDs(sock, fd, 1, buf, buf_size, expected_size)); +} + +void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size) { + ASSERT_NO_FATAL_FAILURE( + RecvFDs(sock, fds, fds_size, buf, buf_size, buf_size)); +} + +void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size, + int expected_size, bool peek) { + 
struct msghdr msg = {}; + std::vector control(CMSG_SPACE(fds_size * sizeof(int))); + msg.msg_control = &control[0]; + msg.msg_controllen = control.size(); + + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = buf_size; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + int flags = 0; + if (peek) { + flags |= MSG_PEEK; + } + + ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, flags), + SyscallSucceedsWithValue(expected_size)); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(fds_size * sizeof(int))); + ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + for (int i = 0; i < fds_size; i++) { + memcpy(&fds[i], CMSG_DATA(cmsg) + i * sizeof(int), sizeof(int)); + } +} + +void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size, + int expected_size) { + ASSERT_NO_FATAL_FAILURE( + RecvFDs(sock, fds, fds_size, buf, buf_size, expected_size, false)); +} + +void PeekSingleFD(int sock, int* fd, char buf[], int buf_size) { + ASSERT_NO_FATAL_FAILURE(RecvFDs(sock, fd, 1, buf, buf_size, buf_size, true)); +} + +void RecvNoCmsg(int sock, char buf[], int buf_size, int expected_size) { + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(int)) + CMSG_SPACE(sizeof(struct ucred))]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = buf_size; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0), + SyscallSucceedsWithValue(expected_size)); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + EXPECT_EQ(cmsg, nullptr); +} + +void SendNullCmsg(int sock, char buf[], int buf_size) { + struct msghdr msg = {}; + msg.msg_control = nullptr; + msg.msg_controllen = 0; + + ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size), + IsPosixErrorOkAndHolds(buf_size)); +} + +void SendCreds(int sock, ucred creds, char buf[], int buf_size) { + struct msghdr msg = {}; + + char control[CMSG_SPACE(sizeof(struct ucred))]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + memcpy(CMSG_DATA(cmsg), &creds, sizeof(struct ucred)); + + ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size), + IsPosixErrorOkAndHolds(buf_size)); +} + +void SendCredsAndFD(int sock, ucred creds, int fd, char buf[], int buf_size) { + struct msghdr msg = {}; + + char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))] = {}; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct cmsghdr* cmsg1 = CMSG_FIRSTHDR(&msg); + cmsg1->cmsg_level = SOL_SOCKET; + cmsg1->cmsg_type = SCM_CREDENTIALS; + cmsg1->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + memcpy(CMSG_DATA(cmsg1), &creds, sizeof(struct ucred)); + + struct cmsghdr* cmsg2 = CMSG_NXTHDR(&msg, cmsg1); + cmsg2->cmsg_level = SOL_SOCKET; + cmsg2->cmsg_type = SCM_RIGHTS; + cmsg2->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg2), &fd, sizeof(int)); + + ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size), + IsPosixErrorOkAndHolds(buf_size)); +} + +void RecvCreds(int sock, ucred* creds, char buf[], int buf_size) { + ASSERT_NO_FATAL_FAILURE(RecvCreds(sock, creds, buf, buf_size, buf_size)); +} + +void RecvCreds(int sock, ucred* creds, char buf[], int buf_size, + int expected_size) { + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(struct ucred))]; + msg.msg_control = control; + 
msg.msg_controllen = sizeof(control); + + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = buf_size; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0), + SyscallSucceedsWithValue(expected_size)); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct ucred))); + ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS); + + memcpy(creds, CMSG_DATA(cmsg), sizeof(struct ucred)); +} + +void RecvCredsAndFD(int sock, ucred* creds, int* fd, char buf[], int buf_size) { + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = buf_size; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0), + SyscallSucceedsWithValue(buf_size)); + + struct cmsghdr* cmsg1 = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg1, nullptr); + ASSERT_EQ(cmsg1->cmsg_len, CMSG_LEN(sizeof(struct ucred))); + ASSERT_EQ(cmsg1->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg1->cmsg_type, SCM_CREDENTIALS); + memcpy(creds, CMSG_DATA(cmsg1), sizeof(struct ucred)); + + struct cmsghdr* cmsg2 = CMSG_NXTHDR(&msg, cmsg1); + ASSERT_NE(cmsg2, nullptr); + ASSERT_EQ(cmsg2->cmsg_len, CMSG_LEN(sizeof(int))); + ASSERT_EQ(cmsg2->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg2->cmsg_type, SCM_RIGHTS); + memcpy(fd, CMSG_DATA(cmsg2), sizeof(int)); +} + +void RecvSingleFDUnaligned(int sock, int* fd, char buf[], int buf_size) { + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(int)) - sizeof(int)]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = buf_size; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0), + SyscallSucceedsWithValue(buf_size)); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int))); + ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + memcpy(fd, CMSG_DATA(cmsg), sizeof(int)); +} + +void SetSoPassCred(int sock) { + int one = 1; + EXPECT_THAT(setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)), + SyscallSucceeds()); +} + +void UnsetSoPassCred(int sock) { + int zero = 0; + EXPECT_THAT(setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &zero, sizeof(zero)), + SyscallSucceeds()); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h new file mode 100644 index 000000000..1b09aeae7 --- /dev/null +++ b/test/syscalls/linux/unix_domain_socket_test_util.h @@ -0,0 +1,161 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
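// Illustrative sketch (not part of the change itself): the helpers declared in
// this header wrap SCM_RIGHTS control messages, whose sendmsg() layout is built
// in the .cc file above. The standalone function below shows that layout in
// isolation for readers unfamiliar with the mechanism. It assumes <string.h>,
// <sys/socket.h>, and <sys/uio.h>; the name SendFDOverUnixSocket is
// hypothetical and is not used anywhere in the tests.
static inline ssize_t SendFDOverUnixSocket(int sock, int fd_to_pass) {
  // At least one byte of ordinary data must accompany the control message.
  char payload = 'x';
  struct iovec iov = {};
  iov.iov_base = &payload;
  iov.iov_len = sizeof(payload);

  // Control buffer sized for exactly one int-sized file descriptor.
  char control[CMSG_SPACE(sizeof(int))] = {};
  struct msghdr msg = {};
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = control;
  msg.msg_controllen = sizeof(control);

  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;  // Marks the control payload as descriptors.
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

  // The receiver is handed a new descriptor number referring to the same
  // open file description; returns bytes of ordinary data sent, or -1.
  return sendmsg(sock, &msg, 0);
}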
+ +#ifndef GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_ +#define GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_ + +#include +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// DescribeUnixDomainSocketType returns a human-readable std::string explaining the +// given Unix domain socket type. +std::string DescribeUnixDomainSocketType(int type); + +// UnixDomainSocketPair returns a SocketPairKind that represents SocketPairs +// created by invoking the socketpair() syscall with AF_UNIX and the given type. +SocketPairKind UnixDomainSocketPair(int type); + +// FilesystemBoundUnixDomainSocketPair returns a SocketPairKind that represents +// SocketPairs created with bind() and accept() syscalls with a temp file path, +// AF_UNIX and the given type. +SocketPairKind FilesystemBoundUnixDomainSocketPair(int type); + +// AbstractBoundUnixDomainSocketPair returns a SocketPairKind that represents +// SocketPairs created with bind() and accept() syscalls with a temp abstract +// path, AF_UNIX and the given type. +SocketPairKind AbstractBoundUnixDomainSocketPair(int type); + +// SocketpairGoferUnixDomainSocketPair returns a SocketPairKind that was created +// with two sockets conected to the socketpair gofer. +SocketPairKind SocketpairGoferUnixDomainSocketPair(int type); + +// SocketpairGoferFileSocketPair returns a SocketPairKind that was created with +// two open() calls on paths backed by the socketpair gofer. +SocketPairKind SocketpairGoferFileSocketPair(int type); + +// FilesystemUnboundUnixDomainSocketPair returns a SocketPairKind that +// represents two unbound sockets and a filesystem path for binding. +SocketPairKind FilesystemUnboundUnixDomainSocketPair(int type); + +// AbstractUnboundUnixDomainSocketPair returns a SocketPairKind that represents +// two unbound sockets and an abstract namespace path for binding. +SocketPairKind AbstractUnboundUnixDomainSocketPair(int type); + +// SendSingleFD sends both a single FD and some data over a unix domain socket +// specified by an FD. Note that calls to this function must be wrapped in +// ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test. +void SendSingleFD(int sock, int fd, char buf[], int buf_size); + +// SendFDs sends an arbitrary number of FDs and some data over a unix domain +// socket specified by an FD. Note that calls to this function must be wrapped +// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test. +void SendFDs(int sock, int fds[], int fds_size, char buf[], int buf_size); + +// RecvSingleFD receives both a single FD and some data over a unix domain +// socket specified by an FD. Note that calls to this function must be wrapped +// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test. +void RecvSingleFD(int sock, int* fd, char buf[], int buf_size); + +// RecvSingleFD receives both a single FD and some data over a unix domain +// socket specified by an FD. This version allows the expected amount of data +// received to be different than the buffer size. Note that calls to this +// function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions +// to halt the test. +void RecvSingleFD(int sock, int* fd, char buf[], int buf_size, + int expected_size); + +// PeekSingleFD peeks at both a single FD and some data over a unix domain +// socket specified by an FD. Note that calls to this function must be wrapped +// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test. 
+void PeekSingleFD(int sock, int* fd, char buf[], int buf_size);
+
+// RecvFDs receives both an arbitrary number of FDs and some data over a unix
+// domain socket specified by an FD. Note that calls to this function must be
+// wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size);
+
+// RecvFDs receives both an arbitrary number of FDs and some data over a unix
+// domain socket specified by an FD. This version allows the expected amount of
+// data received to be different than the buffer size. Note that calls to this
+// function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions
+// to halt the test.
+void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size,
+             int expected_size);
+
+// RecvNoCmsg receives some data over a unix domain socket specified by an FD
+// and asserts that no control messages are available for receiving. Note that
+// calls to this function must be wrapped in ASSERT_NO_FATAL_FAILURE for
+// internal assertions to halt the test.
+void RecvNoCmsg(int sock, char buf[], int buf_size, int expected_size);
+
+inline void RecvNoCmsg(int sock, char buf[], int buf_size) {
+  RecvNoCmsg(sock, buf, buf_size, buf_size);
+}
+
+// SendCreds sends the credentials of the current process and some data over a
+// unix domain socket specified by an FD. Note that calls to this function must
+// be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the
+// test.
+void SendCreds(int sock, ucred creds, char buf[], int buf_size);
+
+// SendCredsAndFD sends the credentials of the current process, a single FD, and
+// some data over a unix domain socket specified by an FD. Note that calls to
+// this function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal
+// assertions to halt the test.
+void SendCredsAndFD(int sock, ucred creds, int fd, char buf[], int buf_size);
+
+// RecvCreds receives some credentials and some data over a unix domain socket
+// specified by an FD. Note that calls to this function must be wrapped in
+// ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void RecvCreds(int sock, ucred* creds, char buf[], int buf_size);
+
+// RecvCreds receives some credentials and some data over a unix domain socket
+// specified by an FD. This version allows the expected amount of data received
+// to be different than the buffer size. Note that calls to this function must
+// be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the
+// test.
+void RecvCreds(int sock, ucred* creds, char buf[], int buf_size,
+               int expected_size);
+
+// RecvCredsAndFD receives some credentials, a single FD, and some data over a
+// unix domain socket specified by an FD. Note that calls to this function must
+// be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the
+// test.
+void RecvCredsAndFD(int sock, ucred* creds, int* fd, char buf[], int buf_size);
+
+// SendNullCmsg sends a null control message and some data over a unix domain
+// socket specified by an FD. Note that calls to this function must be wrapped
+// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void SendNullCmsg(int sock, char buf[], int buf_size);
+
+// RecvSingleFDUnaligned receives both a single FD and some data over a unix
+// domain socket specified by an FD. This function does not obey the spec, but
+// Linux allows it and the apphosting code depends on this quirk.
Note that calls to +// this function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal +// assertions to halt the test. +void RecvSingleFDUnaligned(int sock, int* fd, char buf[], int buf_size); + +// SetSoPassCred sets the SO_PASSCRED option on the specified socket. +void SetSoPassCred(int sock); + +// UnsetSoPassCred clears the SO_PASSCRED option on the specified socket. +void UnsetSoPassCred(int sock); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_ diff --git a/test/syscalls/linux/unlink.cc b/test/syscalls/linux/unlink.cc new file mode 100644 index 000000000..4d5e0c6b6 --- /dev/null +++ b/test/syscalls/linux/unlink.cc @@ -0,0 +1,211 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(UnlinkTest, IsDir) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + EXPECT_THAT(unlink(dir.path().c_str()), SyscallFailsWithErrno(EISDIR)); +} + +TEST(UnlinkTest, DirNotEmpty) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + int fd; + std::string path = JoinPath(dir.path(), "ExistingFile"); + EXPECT_THAT(fd = open(path.c_str(), O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + EXPECT_THAT(rmdir(dir.path().c_str()), SyscallFailsWithErrno(ENOTEMPTY)); +} + +TEST(UnlinkTest, Rmdir) { + std::string path = JoinPath(GetAbsoluteTestTmpdir(), "NewDir"); + ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds()); + EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds()); +} + +TEST(UnlinkTest, AtDir) { + int dirfd; + EXPECT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_DIRECTORY, 0), + SyscallSucceeds()); + + std::string path = JoinPath(GetAbsoluteTestTmpdir(), "NewDir"); + EXPECT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds()); + EXPECT_THAT(unlinkat(dirfd, "NewDir", AT_REMOVEDIR), SyscallSucceeds()); + ASSERT_THAT(close(dirfd), SyscallSucceeds()); +} + +TEST(UnlinkTest, AtDirDegradedPermissions_NoRandomSave) { + // Drop capabilities that allow us to override file and directory permissions. 
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + int dirfd; + ASSERT_THAT(dirfd = open(dir.path().c_str(), O_DIRECTORY, 0), + SyscallSucceeds()); + + std::string sub_dir = JoinPath(dir.path(), "NewDir"); + EXPECT_THAT(mkdir(sub_dir.c_str(), 0755), SyscallSucceeds()); + EXPECT_THAT(fchmod(dirfd, 0444), SyscallSucceeds()); + EXPECT_THAT(unlinkat(dirfd, "NewDir", AT_REMOVEDIR), + SyscallFailsWithErrno(EACCES)); + ASSERT_THAT(close(dirfd), SyscallSucceeds()); +} + +// Files cannot be unlinked if the parent is not writable and executable. +TEST(UnlinkTest, ParentDegradedPermissions) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + ASSERT_THAT(chmod(dir.path().c_str(), 0000), SyscallSucceeds()); + + struct stat st; + ASSERT_THAT(stat(file.path().c_str(), &st), SyscallFailsWithErrno(EACCES)); + ASSERT_THAT(unlinkat(AT_FDCWD, file.path().c_str(), 0), + SyscallFailsWithErrno(EACCES)); + + // Non-existent files also return EACCES. + const std::string nonexist = JoinPath(dir.path(), "doesnotexist"); + ASSERT_THAT(stat(nonexist.c_str(), &st), SyscallFailsWithErrno(EACCES)); + ASSERT_THAT(unlinkat(AT_FDCWD, nonexist.c_str(), 0), + SyscallFailsWithErrno(EACCES)); +} + +TEST(UnlinkTest, AtBad) { + int dirfd; + EXPECT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_DIRECTORY, 0), + SyscallSucceeds()); + + // Try removing a directory as a file. + std::string path = JoinPath(GetAbsoluteTestTmpdir(), "NewDir"); + EXPECT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds()); + EXPECT_THAT(unlinkat(dirfd, "NewDir", 0), SyscallFailsWithErrno(EISDIR)); + EXPECT_THAT(unlinkat(dirfd, "NewDir", AT_REMOVEDIR), SyscallSucceeds()); + + // Try removing a file as a directory. + int fd; + EXPECT_THAT(fd = openat(dirfd, "UnlinkAtFile", O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", AT_REMOVEDIR), + SyscallFailsWithErrno(ENOTDIR)); + ASSERT_THAT(close(fd), SyscallSucceeds()); + EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", 0), SyscallSucceeds()); + + // Cleanup. + ASSERT_THAT(close(dirfd), SyscallSucceeds()); +} + +TEST(UnlinkTest, AbsTmpFile) { + int fd; + std::string path = JoinPath(GetAbsoluteTestTmpdir(), "ExistingFile"); + EXPECT_THAT(fd = open(path.c_str(), O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + EXPECT_THAT(unlink(path.c_str()), SyscallSucceeds()); +} + +TEST(UnlinkTest, TooLongName) { + EXPECT_THAT(unlink(std::vector(16384, '0').data()), + SyscallFailsWithErrno(ENAMETOOLONG)); +} + +TEST(UnlinkTest, BadNamePtr) { + EXPECT_THAT(unlink(reinterpret_cast(1)), + SyscallFailsWithErrno(EFAULT)); +} + +TEST(UnlinkTest, AtFile) { + int dirfd; + EXPECT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_DIRECTORY, 0666), + SyscallSucceeds()); + int fd; + EXPECT_THAT(fd = openat(dirfd, "UnlinkAtFile", O_RDWR | O_CREAT, 0666), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); + EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", 0), SyscallSucceeds()); +} + +TEST(UnlinkTest, OpenFile) { + // We can't save unlinked file unless they are on tmpfs. 
+ const DisableSave ds; + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + int fd; + EXPECT_THAT(fd = open(file.path().c_str(), O_RDWR, 0666), SyscallSucceeds()); + EXPECT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST(UnlinkTest, CannotRemoveDots) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string self = JoinPath(file.path(), "."); + ASSERT_THAT(unlink(self.c_str()), SyscallFailsWithErrno(ENOTDIR)); + const std::string parent = JoinPath(file.path(), ".."); + ASSERT_THAT(unlink(parent.c_str()), SyscallFailsWithErrno(ENOTDIR)); +} + +TEST(UnlinkTest, CannotRemoveRoot) { + ASSERT_THAT(unlinkat(-1, "/", AT_REMOVEDIR), SyscallFailsWithErrno(EBUSY)); +} + +TEST(UnlinkTest, CannotRemoveRootWithAtDir) { + const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE( + Open(GetAbsoluteTestTmpdir(), O_DIRECTORY, 0666)); + ASSERT_THAT(unlinkat(dirfd.get(), "/", AT_REMOVEDIR), + SyscallFailsWithErrno(EBUSY)); +} + +TEST(RmdirTest, CannotRemoveDots) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string self = JoinPath(dir.path(), "."); + ASSERT_THAT(rmdir(self.c_str()), SyscallFailsWithErrno(EINVAL)); + const std::string parent = JoinPath(dir.path(), ".."); + ASSERT_THAT(rmdir(parent.c_str()), SyscallFailsWithErrno(ENOTEMPTY)); +} + +TEST(RmdirTest, CanRemoveWithTrailingSlashes) { + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string slash = absl::StrCat(dir1.path(), "/"); + ASSERT_THAT(rmdir(slash.c_str()), SyscallSucceeds()); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string slashslash = absl::StrCat(dir2.path(), "//"); + ASSERT_THAT(rmdir(slashslash.c_str()), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/unshare.cc b/test/syscalls/linux/unshare.cc new file mode 100644 index 000000000..9dd6ec4b6 --- /dev/null +++ b/test/syscalls/linux/unshare.cc @@ -0,0 +1,50 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
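// Illustrative sketch (not part of the change itself): the unlink tests above
// exercise unlinkat(2) both with and without AT_REMOVEDIR. A minimal,
// hypothetical wrapper makes the distinction explicit: with the flag the call
// behaves like rmdir(2), without it like unlink(2), and mixing them up yields
// EISDIR or ENOTDIR as UnlinkTest.AtBad demonstrates. Assumes <fcntl.h> and
// <unistd.h>; the helper is illustrative only and is not used by the tests.
static inline int RemoveAt(int dirfd, const char* name, bool is_dir) {
  // AT_REMOVEDIR removes an (empty) directory entry; omitting it removes a
  // non-directory entry, both resolved relative to dirfd (or the working
  // directory when dirfd is AT_FDCWD).
  return unlinkat(dirfd, name, is_dir ? AT_REMOVEDIR : 0);
}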
+ +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/synchronization/mutex.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(UnshareTest, AllowsZeroFlags) { + ASSERT_THAT(unshare(0), SyscallSucceeds()); +} + +TEST(UnshareTest, ThreadFlagFailsIfMultithreaded) { + absl::Mutex mu; + bool finished = false; + ScopedThread t([&] { + mu.Lock(); + mu.Await(absl::Condition(&finished)); + mu.Unlock(); + }); + ASSERT_THAT(unshare(CLONE_THREAD), SyscallFailsWithErrno(EINVAL)); + mu.Lock(); + finished = true; + mu.Unlock(); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc new file mode 100644 index 000000000..d95ee74ec --- /dev/null +++ b/test/syscalls/linux/utimes.cc @@ -0,0 +1,330 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/time/time.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// TODO: utimes(nullptr) does not pick the "now" time in the +// application's time domain, so when asserting that times are within a window, +// we expand the window to allow for differences between the time domains. +constexpr absl::Duration kClockSlack = absl::Milliseconds(100); + +// TimeBoxed runs fn, setting before and after to (coarse realtime) times +// guaranteed* to come before and after fn started and completed, respectively. +// +// fn may be called more than once if the clock is adjusted. +// +// * See the comment on kClockSlack. gVisor breaks this guarantee. +void TimeBoxed(absl::Time* before, absl::Time* after, + std::function const& fn) { + do { + // N.B. utimes and friends use CLOCK_REALTIME_COARSE for setting time (i.e., + // current_kernel_time()). See fs/attr.c:notify_change. + // + // notify_change truncates the time to a multiple of s_time_gran, but most + // filesystems set it to 1, so we don't do any truncation. + struct timespec ts; + EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds()); + *before = absl::TimeFromTimespec(ts); + + fn(); + + EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds()); + *after = absl::TimeFromTimespec(ts); + + if (*after < *before) { + // Clock jumped backwards; retry. + // + // Technically this misses jumps small enough to keep after > before, + // which could lead to test failures, but that is very unlikely to happen. + continue; + } + + if (IsRunningOnGvisor()) { + // See comment on kClockSlack. 
+ *before -= kClockSlack; + *after += kClockSlack; + } + } while (*after < *before); +} + +void TestUtimesOnPath(std::string const& path) { + struct stat statbuf; + + struct timeval times[2] = {{1, 0}, {2, 0}}; + EXPECT_THAT(utimes(path.c_str(), times), SyscallSucceeds()); + EXPECT_THAT(stat(path.c_str(), &statbuf), SyscallSucceeds()); + EXPECT_EQ(1, statbuf.st_atime); + EXPECT_EQ(2, statbuf.st_mtime); + + absl::Time before; + absl::Time after; + TimeBoxed(&before, &after, [&] { + EXPECT_THAT(utimes(path.c_str(), nullptr), SyscallSucceeds()); + }); + + EXPECT_THAT(stat(path.c_str(), &statbuf), SyscallSucceeds()); + + absl::Time atime = absl::TimeFromTimespec(statbuf.st_atim); + EXPECT_GE(atime, before); + EXPECT_LE(atime, after); + + absl::Time mtime = absl::TimeFromTimespec(statbuf.st_mtim); + EXPECT_GE(mtime, before); + EXPECT_LE(mtime, after); +} + +TEST(UtimesTest, OnFile) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + TestUtimesOnPath(f.path()); +} + +TEST(UtimesTest, OnDir) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TestUtimesOnPath(dir.path()); +} + +TEST(UtimesTest, MissingPath) { + auto path = NewTempAbsPath(); + struct timeval times[2] = {{1, 0}, {2, 0}}; + EXPECT_THAT(utimes(path.c_str(), times), SyscallFailsWithErrno(ENOENT)); +} + +void TestFutimesat(int dirFd, std::string const& path) { + struct stat statbuf; + + struct timeval times[2] = {{1, 0}, {2, 0}}; + EXPECT_THAT(futimesat(dirFd, path.c_str(), times), SyscallSucceeds()); + EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds()); + EXPECT_EQ(1, statbuf.st_atime); + EXPECT_EQ(2, statbuf.st_mtime); + + absl::Time before; + absl::Time after; + TimeBoxed(&before, &after, [&] { + EXPECT_THAT(futimesat(dirFd, path.c_str(), nullptr), SyscallSucceeds()); + }); + + EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds()); + + absl::Time atime = absl::TimeFromTimespec(statbuf.st_atim); + EXPECT_GE(atime, before); + EXPECT_LE(atime, after); + + absl::Time mtime = absl::TimeFromTimespec(statbuf.st_mtim); + EXPECT_GE(mtime, before); + EXPECT_LE(mtime, after); +} + +TEST(FutimesatTest, OnAbsPath) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + TestFutimesat(0, f.path()); +} + +TEST(FutimesatTest, OnRelPath) { + auto d = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(d.path())); + auto basename = std::string(Basename(f.path())); + const FileDescriptor dirFd = + ASSERT_NO_ERRNO_AND_VALUE(Open(d.path(), O_RDONLY | O_DIRECTORY)); + TestFutimesat(dirFd.get(), basename); +} + +TEST(FutimesatTest, InvalidNsec) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + struct timeval times[4][2] = {{ + {0, 1}, // Valid + {1, static_cast(1e7)} // Invalid + }, + { + {1, static_cast(1e7)}, // Invalid + {0, 1} // Valid + }, + { + {0, 1}, // Valid + {1, -1} // Invalid + }, + { + {1, -1}, // Invalid + {0, 1} // Valid + }}; + + for (unsigned int i = 0; i < sizeof(times) / sizeof(times[0]); i++) { + std::cout << "test:" << i << "\n"; + EXPECT_THAT(futimesat(0, f.path().c_str(), times[i]), + SyscallFailsWithErrno(EINVAL)); + } +} + +void TestUtimensat(int dirFd, std::string const& path) { + struct stat statbuf; + const struct timespec times[2] = {{1, 0}, {2, 0}}; + EXPECT_THAT(utimensat(dirFd, path.c_str(), times, 0), SyscallSucceeds()); + EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds()); + EXPECT_EQ(1, statbuf.st_atime); + EXPECT_EQ(2, statbuf.st_mtime); + + // 
Test setting with UTIME_NOW and UTIME_OMIT. + struct stat statbuf2; + const struct timespec times2[2] = { + {0, UTIME_NOW}, // Should set atime to now. + {0, UTIME_OMIT} // Should not change mtime. + }; + + absl::Time before; + absl::Time after; + TimeBoxed(&before, &after, [&] { + EXPECT_THAT(utimensat(dirFd, path.c_str(), times2, 0), SyscallSucceeds()); + }); + + EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf2, 0), SyscallSucceeds()); + + absl::Time atime2 = absl::TimeFromTimespec(statbuf2.st_atim); + EXPECT_GE(atime2, before); + EXPECT_LE(atime2, after); + + absl::Time mtime = absl::TimeFromTimespec(statbuf.st_mtim); + absl::Time mtime2 = absl::TimeFromTimespec(statbuf2.st_mtim); + // mtime should not be changed. + EXPECT_EQ(mtime, mtime2); + + // Test setting with times = NULL. Should set both atime and mtime to the + // current system time. + struct stat statbuf3; + TimeBoxed(&before, &after, [&] { + EXPECT_THAT(utimensat(dirFd, path.c_str(), nullptr, 0), SyscallSucceeds()); + }); + + EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf3, 0), SyscallSucceeds()); + + absl::Time atime3 = absl::TimeFromTimespec(statbuf3.st_atim); + EXPECT_GE(atime3, before); + EXPECT_LE(atime3, after); + + absl::Time mtime3 = absl::TimeFromTimespec(statbuf3.st_mtim); + EXPECT_GE(mtime3, before); + EXPECT_LE(mtime3, after); + + if (!IsRunningOnGvisor()) { + // FIXME: Gofers set atime and mtime to different "now" times. + EXPECT_EQ(atime3, mtime3); + } +} + +TEST(UtimensatTest, OnAbsPath) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + TestUtimensat(0, f.path()); +} + +TEST(UtimensatTest, OnRelPath) { + auto d = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(d.path())); + auto basename = std::string(Basename(f.path())); + const FileDescriptor dirFd = + ASSERT_NO_ERRNO_AND_VALUE(Open(d.path(), O_RDONLY | O_DIRECTORY)); + TestUtimensat(dirFd.get(), basename); +} + +TEST(UtimensatTest, OmitNoop) { + // Setting both timespecs to UTIME_OMIT on a nonexistant path should succeed. + auto path = NewTempAbsPath(); + const struct timespec times[2] = {{0, UTIME_OMIT}, {0, UTIME_OMIT}}; + EXPECT_THAT(utimensat(0, path.c_str(), times, 0), SyscallSucceeds()); +} + +// Verify that we can actually set atime and mtime to 0. +TEST(UtimeTest, ZeroAtimeandMtime) { + const auto tmp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const auto tmp_file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(tmp_dir.path())); + + // Stat the file before and after updating atime and mtime. + struct stat stat_before = {}; + EXPECT_THAT(stat(tmp_file.path().c_str(), &stat_before), SyscallSucceeds()); + + ASSERT_NE(stat_before.st_atime, 0); + ASSERT_NE(stat_before.st_mtime, 0); + + const struct utimbuf times = {}; // Zero for both atime and mtime. + EXPECT_THAT(utime(tmp_file.path().c_str(), ×), SyscallSucceeds()); + + struct stat stat_after = {}; + EXPECT_THAT(stat(tmp_file.path().c_str(), &stat_after), SyscallSucceeds()); + + // We should see the atime and mtime changed when we set them to 0. 
+ ASSERT_EQ(stat_after.st_atime, 0); + ASSERT_EQ(stat_after.st_mtime, 0); +} + +TEST(UtimensatTest, InvalidNsec) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + struct timespec times[2][2] = {{ + {0, UTIME_OMIT}, // Valid + {2, static_cast(1e10)} // Invalid + }, + { + {2, static_cast(1e10)}, // Invalid + {0, UTIME_OMIT} // Valid + }}; + + for (unsigned int i = 0; i < sizeof(times) / sizeof(times[0]); i++) { + std::cout << "test:" << i << "\n"; + EXPECT_THAT(utimensat(0, f.path().c_str(), times[i], 0), + SyscallFailsWithErrno(EINVAL)); + } +} + +TEST(Utimensat, NullPath) { + // From man utimensat(2): + // "the Linux utimensat() system call implements a nonstandard feature: if + // pathname is NULL, then the call modifies the timestamps of the file + // referred to by the file descriptor dirfd (which may refer to any type of + // file). + // Note, however, that the glibc wrapper for utimensat() disallows + // passing NULL as the value for file: the wrapper function returns the error + // EINVAL in this case." + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); + struct stat statbuf; + const struct timespec times[2] = {{1, 0}, {2, 0}}; + // Call syscall directly. + EXPECT_THAT(syscall(SYS_utimensat, fd.get(), NULL, times, 0), + SyscallSucceeds()); + EXPECT_THAT(fstatat(0, f.path().c_str(), &statbuf, 0), SyscallSucceeds()); + EXPECT_EQ(1, statbuf.st_atime); + EXPECT_EQ(2, statbuf.st_mtime); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/vdso.cc b/test/syscalls/linux/vdso.cc new file mode 100644 index 000000000..0f6e1c7c6 --- /dev/null +++ b/test/syscalls/linux/vdso.cc @@ -0,0 +1,48 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/proc_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Ensure that the vvar page cannot be made writable. 
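+// The [vvar] mapping holds the kernel data that the vDSO reads (clock state
+// and friends) and is exported read-only; adding PROT_WRITE with mprotect(2)
+// is expected to fail with EACCES rather than silently granting access.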
+TEST(VvarTest, WriteVvar) { + auto contents = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + auto maps = ASSERT_NO_ERRNO_AND_VALUE(ParseProcMaps(contents)); + auto it = std::find_if(maps.begin(), maps.end(), [](const ProcMapsEntry& e) { + return e.filename == "[vvar]"; + }); + + SKIP_IF(it == maps.end()); + EXPECT_THAT(mprotect(reinterpret_cast(it->start), kPageSize, + PROT_READ | PROT_WRITE), + SyscallFailsWithErrno(EACCES)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc new file mode 100644 index 000000000..59dd78833 --- /dev/null +++ b/test/syscalls/linux/vdso_clock_gettime.cc @@ -0,0 +1,104 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/numbers.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +std::string PrintClockId(::testing::TestParamInfo info) { + switch (info.param) { + case CLOCK_MONOTONIC: + return "CLOCK_MONOTONIC"; + case CLOCK_REALTIME: + return "CLOCK_REALTIME"; + default: + return absl::StrCat(info.param); + } +} + +class CorrectVDSOClockTest : public ::testing::TestWithParam {}; + +TEST_P(CorrectVDSOClockTest, IsCorrect) { + struct timespec tvdso, tsys; + absl::Time vdso_time, sys_time; + uint64_t total_calls = 0; + + // It is expected that 82.5% of clock_gettime calls will be less than 100us + // skewed from the system time. + // Unfortunately this is not only influenced by the VDSO clock skew, but also + // by arbitrary scheduling delays and the like. The test is therefore + // regularly disabled. 
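+ //
+ // Each confidence entry maps a skew bound to a tuple of (required fraction
+ // of calls, calls within the bound measured as sys - vdso, calls within the
+ // bound measured as vdso - sys); the loop below fills in the two counters
+ // and the checks at the end compare them against the required fraction.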
+ std::map> confidence = + { + {absl::Microseconds(100), std::make_tuple(0.825, 0, 0)}, + {absl::Microseconds(250), std::make_tuple(0.94, 0, 0)}, + {absl::Milliseconds(1), std::make_tuple(0.999, 0, 0)}, + }; + + absl::Time start = absl::Now(); + while (absl::Now() < start + absl::Seconds(30)) { + EXPECT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds()); + EXPECT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys), + SyscallSucceeds()); + + vdso_time = absl::TimeFromTimespec(tvdso); + + for (auto const& conf : confidence) { + std::get<1>(confidence[conf.first]) += + (sys_time - vdso_time) < conf.first; + } + + sys_time = absl::TimeFromTimespec(tsys); + + for (auto const& conf : confidence) { + std::get<2>(confidence[conf.first]) += + (vdso_time - sys_time) < conf.first; + } + + ++total_calls; + } + + for (auto const& conf : confidence) { + EXPECT_GE(std::get<1>(conf.second) / static_cast(total_calls), + std::get<0>(conf.second)); + EXPECT_GE(std::get<2>(conf.second) / static_cast(total_calls), + std::get<0>(conf.second)); + } +} + +INSTANTIATE_TEST_CASE_P(ClockGettime, CorrectVDSOClockTest, + ::testing::Values(CLOCK_MONOTONIC, CLOCK_REALTIME), + PrintClockId); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc new file mode 100644 index 000000000..9999a909e --- /dev/null +++ b/test/syscalls/linux/vfork.cc @@ -0,0 +1,193 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/time/time.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/test_util.h" + +DEFINE_bool(vfork_test_child, false, + "If true, run the VforkTest child workload."); + +namespace gvisor { +namespace testing { + +namespace { + +// We don't test with raw CLONE_VFORK to avoid interacting with glibc's use of +// TLS. +// +// Even with vfork(2), we must be careful to do little more in the child than +// call execve(2). We use the simplest sleep function possible, though this is +// still precarious, as we're officially only allowed to call execve(2) and +// _exit(2). +constexpr absl::Duration kChildDelay = absl::Seconds(10); + +// Exit code for successful child subprocesses. We don't want to use 0 since +// it's too common, and an execve(2) failure causes the child to exit with the +// errno, so kChildExitCode is chosen to be an unlikely errno: +constexpr int kChildExitCode = 118; // ENOTNAM: Not a XENIX named type file + +int64_t MonotonicNow() { + struct timespec now; + TEST_PCHECK(clock_gettime(CLOCK_MONOTONIC, &now) == 0); + return now.tv_sec * 1000000000ll + now.tv_nsec; +} + +TEST(VforkTest, ParentStopsUntilChildExits) { + const auto test = [] { + // N.B. Run the test in a single-threaded subprocess because + // vfork is not safe in a multi-threaded process. 
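+ //
+ // vfork(2) suspends the caller until the child either calls _exit(2) or
+ // execve(2), so the elapsed time measured below should be at least
+ // kChildDelay if the parent was correctly blocked.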
+ + const int64_t start = MonotonicNow(); + + pid_t pid = vfork(); + if (pid == 0) { + SleepSafe(kChildDelay); + _exit(kChildExitCode); + } + TEST_PCHECK_MSG(pid > 0, "vfork failed"); + MaybeSave(); + + const int64_t end = MonotonicNow(); + + absl::Duration dur = absl::Nanoseconds(end - start); + + TEST_CHECK(dur >= kChildDelay); + + int status = 0; + TEST_PCHECK(RetryEINTR(waitpid)(pid, &status, 0)); + TEST_CHECK(WIFEXITED(status)); + TEST_CHECK(WEXITSTATUS(status) == kChildExitCode); + }; + + EXPECT_THAT(InForkedProcess(test), IsPosixErrorOkAndHolds(0)); +} + +TEST(VforkTest, ParentStopsUntilChildExecves_NoRandomSave) { + ExecveArray const owned_child_argv = {"/proc/self/exe", "--vfork_test_child"}; + char* const* const child_argv = owned_child_argv.get(); + + const auto test = [&] { + const int64_t start = MonotonicNow(); + + pid_t pid = vfork(); + if (pid == 0) { + SleepSafe(kChildDelay); + execve(child_argv[0], child_argv, /* envp = */ nullptr); + _exit(errno); + } + // Don't attempt save/restore until after recording end_time, + // since the test expects an upper bound on the time spent + // stopped. + int saved_errno = errno; + const int64_t end = MonotonicNow(); + errno = saved_errno; + TEST_PCHECK_MSG(pid > 0, "vfork failed"); + MaybeSave(); + + absl::Duration dur = absl::Nanoseconds(end - start); + + // The parent should resume execution after execve, but before + // the post-execve test child exits. + TEST_CHECK(dur >= kChildDelay); + TEST_CHECK(dur <= 2 * kChildDelay); + + int status = 0; + TEST_PCHECK(RetryEINTR(waitpid)(pid, &status, 0)); + TEST_CHECK(WIFEXITED(status)); + TEST_CHECK(WEXITSTATUS(status) == kChildExitCode); + }; + + EXPECT_THAT(InForkedProcess(test), IsPosixErrorOkAndHolds(0)); +} + +// A vfork child does not unstop the parent a second time when it exits after +// exec. +TEST(VforkTest, ExecedChildExitDoesntUnstopParent_NoRandomSave) { + ExecveArray const owned_child_argv = {"/proc/self/exe", "--vfork_test_child"}; + char* const* const child_argv = owned_child_argv.get(); + + const auto test = [&] { + pid_t pid1 = vfork(); + if (pid1 == 0) { + execve(child_argv[0], child_argv, /* envp = */ nullptr); + _exit(errno); + } + TEST_PCHECK_MSG(pid1 > 0, "vfork failed"); + MaybeSave(); + + // pid1 exec'd and is now sleeping. + SleepSafe(kChildDelay / 2); + + const int64_t start = MonotonicNow(); + + pid_t pid2 = vfork(); + if (pid2 == 0) { + SleepSafe(kChildDelay); + _exit(kChildExitCode); + } + TEST_PCHECK_MSG(pid2 > 0, "vfork failed"); + MaybeSave(); + + const int64_t end = MonotonicNow(); + + absl::Duration dur = absl::Nanoseconds(end - start); + + // The parent should resume execution only after pid2 exits, not + // when pid1 exits. 
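+ // If pid1's exit erroneously unstopped the parent, dur would be roughly
+ // kChildDelay / 2 (the remainder of pid1's post-exec sleep) rather than
+ // the full kChildDelay spent waiting for pid2.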
+ TEST_CHECK(dur >= kChildDelay); + + int status = 0; + TEST_PCHECK(RetryEINTR(waitpid)(pid1, &status, 0)); + TEST_CHECK(WIFEXITED(status)); + TEST_CHECK(WEXITSTATUS(status) == kChildExitCode); + + TEST_PCHECK(RetryEINTR(waitpid)(pid2, &status, 0)); + TEST_CHECK(WIFEXITED(status)); + TEST_CHECK(WEXITSTATUS(status) == kChildExitCode); + }; + + EXPECT_THAT(InForkedProcess(test), IsPosixErrorOkAndHolds(0)); +} + +int RunChild() { + SleepSafe(kChildDelay); + return kChildExitCode; +} + +} // namespace + +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); + + if (FLAGS_vfork_test_child) { + return gvisor::testing::RunChild(); + } + + return RUN_ALL_TESTS(); +} diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc new file mode 100644 index 000000000..cb6840cc6 --- /dev/null +++ b/test/syscalls/linux/vsyscall.cc @@ -0,0 +1,44 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "test/util/proc_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +time_t vsyscall_time(time_t* t) { + constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400; + return reinterpret_cast(kVsyscallTimeEntry)(t); +} + +TEST(VsyscallTest, VsyscallAlwaysAvailableOnGvisor) { + SKIP_IF(!IsRunningOnGvisor()); + // Vsyscall is always advertised by gvisor. + EXPECT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); + // Vsyscall should always works on gvisor. + time_t t; + EXPECT_THAT(vsyscall_time(&t), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc new file mode 100644 index 000000000..0a4ec7c6a --- /dev/null +++ b/test/syscalls/linux/wait.cc @@ -0,0 +1,748 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/cleanup.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/signal_util.h" +#include "test/util/test_util.h" + +using ::testing::UnorderedElementsAre; + +// These unit tests focus on the wait4(2) system call, but include a basic +// checks for the i386 waitpid(2) syscall, which is a subset of wait4(2). +// +// NOTE: Some functionality is not tested as +// it is not currently supported by gVisor: +// * UID in waitid(2) siginfo. +// * Process groups. +// * Core dump status (WCOREDUMP). +// * Linux only option __WNOTHREAD. +// +// Tests for waiting on stopped/continued children are in sigstop.cc. + +namespace gvisor { +namespace testing { + +namespace { + +// The CloneChild function seems to need more than one page of stack space. +static const size_t kStackSize = 2 * kPageSize; + +// The child thread created in CloneAndExit runs this function. +// This child does not have the TLS setup, so it must not use glibc functions. +int CloneChild(void* priv) { + int64_t sleep = reinterpret_cast(priv); + SleepSafe(absl::Seconds(sleep)); + + // glibc's _exit(2) function wrapper will helpfully call exit_group(2), + // exiting the entire process. + syscall(__NR_exit, 0); + return 1; +} + +// ForkAndExit forks a child process which exits with exit_code, after +// sleeping for the specified duration (seconds). +pid_t ForkAndExit(int exit_code, int64_t sleep) { + pid_t child = fork(); + if (child == 0) { + SleepSafe(absl::Seconds(sleep)); + _exit(exit_code); + } + return child; +} + +int64_t clock_gettime_nsecs(clockid_t id) { + struct timespec ts; + TEST_PCHECK(clock_gettime(id, &ts) == 0); + return (ts.tv_sec * 1000000000 + ts.tv_nsec); +} + +void spin(int64_t sec) { + int64_t ns = sec * 1000000000; + int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID); + int64_t end = start + ns; + + do { + constexpr int kLoopCount = 1000000; // large and arbitrary + // volatile to prevent the compiler from skipping this loop. + for (volatile int i = 0; i < kLoopCount; i++) { + } + } while (clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID) < end); +} + +// ForkSpinAndExit forks a child process which exits with exit_code, after +// spinning for the specified duration (seconds). +pid_t ForkSpinAndExit(int exit_code, int64_t spintime) { + pid_t child = fork(); + if (child == 0) { + spin(spintime); + _exit(exit_code); + } + return child; +} + +absl::Duration RusageCpuTime(const struct rusage& ru) { + return absl::DurationFromTimeval(ru.ru_utime) + + absl::DurationFromTimeval(ru.ru_stime); +} + +// Returns the address of the top of the stack. +// Free with FreeStack. +uintptr_t AllocStack() { + void* addr = mmap(nullptr, kStackSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (addr == MAP_FAILED) { + return reinterpret_cast(MAP_FAILED); + } + + return reinterpret_cast(addr) + kStackSize; +} + +// Frees a stack page allocated with AllocStack. +int FreeStack(uintptr_t addr) { + addr -= kStackSize; + return munmap(reinterpret_cast(addr), kPageSize); +} + +// CloneAndExit clones a child thread, which exits with 0 after sleeping for +// the specified duration (must be in seconds). extra_flags are ORed against +// the standard clone(2) flags. 
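+// The base flag set (CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_VM)
+// shares the address space with the parent but does not set up TLS, which is
+// why CloneChild above restricts itself to raw syscalls.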
+int CloneAndExit(int64_t sleep, uintptr_t stack, int extra_flags) { + return clone(CloneChild, reinterpret_cast(stack), + CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_VM | extra_flags, + reinterpret_cast(sleep)); +} + +// Simple wrappers around wait4(2) and waitid(2) that ignore interrupts. +constexpr auto Wait4 = RetryEINTR(wait4); +constexpr auto Waitid = RetryEINTR(waitid); + +// Fixture for tests parameterized by a function that waits for any child to +// exit with the given options, checks that it exited with the given code, and +// then returns its PID. +// +// N.B. These tests run in a multi-threaded environment. We assume that +// background threads do not create child processes and are not themselves +// created with clone(... | SIGCHLD). Either may cause these tests to +// erroneously wait on child processes/threads. +class WaitAnyChildTest : public ::testing::TestWithParam< + std::function(int, int)>> { + protected: + PosixErrorOr WaitAny(int code) { return WaitAnyWithOptions(code, 0); } + + PosixErrorOr WaitAnyWithOptions(int code, int options) { + return GetParam()(code, options); + } +}; + +// Wait for any child to exit. +TEST_P(WaitAnyChildTest, Fork) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + + EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child)); +} + +// Call wait4 for any process after the child has already exited. +TEST_P(WaitAnyChildTest, AfterExit) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + + absl::SleepFor(absl::Seconds(5)); + + EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child)); +} + +// Wait for multiple children to exit, waiting for either at a time. +TEST_P(WaitAnyChildTest, MultipleFork) { + pid_t child1, child2; + ASSERT_THAT(child1 = ForkAndExit(0, 0), SyscallSucceeds()); + ASSERT_THAT(child2 = ForkAndExit(0, 0), SyscallSucceeds()); + + std::vector pids; + pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0))); + pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0))); + EXPECT_THAT(pids, UnorderedElementsAre(child1, child2)); +} + +// Wait for any child to exit. +// A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like +// a forked process. +TEST_P(WaitAnyChildTest, CloneSIGCHLD) { + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int child; + ASSERT_THAT(child = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds()); + + EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child)); +} + +// Wait for a child thread and process. +TEST_P(WaitAnyChildTest, ForkAndClone) { + pid_t process; + ASSERT_THAT(process = ForkAndExit(0, 0), SyscallSucceeds()); + + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int thread; + // Send SIGCHLD for normal wait semantics. + ASSERT_THAT(thread = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds()); + + std::vector pids; + pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0))); + pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0))); + EXPECT_THAT(pids, UnorderedElementsAre(process, thread)); +} + +// Return immediately if no child has exited. 
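+// With no children at all, WNOHANG does not return 0: wait4(2) and waitid(2)
+// fail with ECHILD, which the parameterized waiters surface as a PosixError.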
+TEST_P(WaitAnyChildTest, WaitWNOHANG) { + EXPECT_THAT( + WaitAnyWithOptions(0, WNOHANG), + PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); +} + +// Bad options passed +TEST_P(WaitAnyChildTest, BadOption) { + EXPECT_THAT( + WaitAnyWithOptions(0, 123456), + PosixErrorIs(EINVAL, ::testing::AnyOf(::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); +} + +TEST_P(WaitAnyChildTest, WaitedChildRusage) { + struct rusage before; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &before), SyscallSucceeds()); + + pid_t child; + constexpr absl::Duration kSpin = absl::Seconds(3); + ASSERT_THAT(child = ForkSpinAndExit(0, absl::ToInt64Seconds(kSpin)), + SyscallSucceeds()); + ASSERT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child)); + + struct rusage after; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &after), SyscallSucceeds()); + + EXPECT_GE(RusageCpuTime(after) - RusageCpuTime(before), kSpin); +} + +TEST_P(WaitAnyChildTest, IgnoredChildRusage) { + // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is + // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see + // sigaction(2)), then children that terminate do not become zombies and a + // call to wait() or waitpid() will block until all children have terminated, + // and then fail with errno set to ECHILD." - waitpid(2) + // + // "RUSAGE_CHILDREN: Return resource usage statistics for all children of the + // calling process that have terminated *and been waited for*." - + // getrusage(2), emphasis added + + struct sigaction sa; + sa.sa_handler = SIG_IGN; + const auto cleanup_sigact = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGCHLD, sa)); + + struct rusage before; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &before), SyscallSucceeds()); + + const absl::Duration start = + absl::Nanoseconds(clock_gettime_nsecs(CLOCK_MONOTONIC)); + + constexpr absl::Duration kSpin = absl::Seconds(3); + + // ForkAndSpin uses CLOCK_THREAD_CPUTIME_ID, which is lower resolution than, + // and may diverge from, CLOCK_MONOTONIC, so we allow a small grace period but + // still check that we blocked for a while. 
+ constexpr absl::Duration kSpinGrace = absl::Milliseconds(100); + + pid_t child; + ASSERT_THAT(child = ForkSpinAndExit(0, absl::ToInt64Seconds(kSpin)), + SyscallSucceeds()); + ASSERT_THAT(WaitAny(0), PosixErrorIs(ECHILD, ::testing::AnyOf( + ::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); + const absl::Duration end = + absl::Nanoseconds(clock_gettime_nsecs(CLOCK_MONOTONIC)); + EXPECT_GE(end - start, kSpin - kSpinGrace); + + struct rusage after; + ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &after), SyscallSucceeds()); + EXPECT_EQ(before.ru_utime.tv_sec, after.ru_utime.tv_sec); + EXPECT_EQ(before.ru_utime.tv_usec, after.ru_utime.tv_usec); + EXPECT_EQ(before.ru_stime.tv_sec, after.ru_stime.tv_sec); + EXPECT_EQ(before.ru_stime.tv_usec, after.ru_stime.tv_usec); +} + +INSTANTIATE_TEST_CASE_P( + Waiters, WaitAnyChildTest, + ::testing::Values( + [](int code, int options) -> PosixErrorOr { + int status; + auto const pid = Wait4(-1, &status, options, nullptr); + MaybeSave(); + if (pid < 0) { + return PosixError(errno, "wait4"); + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != code) { + return PosixError( + EINVAL, absl::StrCat("unexpected wait status: got ", status, + ", wanted ", code)); + } + return static_cast(pid); + }, + [](int code, int options) -> PosixErrorOr { + siginfo_t si; + auto const rv = Waitid(P_ALL, 0, &si, WEXITED | options); + MaybeSave(); + if (rv < 0) { + return PosixError(errno, "waitid"); + } + if (si.si_signo != SIGCHLD) { + return PosixError( + EINVAL, absl::StrCat("unexpected signo: got ", si.si_signo, + ", wanted ", SIGCHLD)); + } + if (si.si_status != code) { + return PosixError( + EINVAL, absl::StrCat("unexpected status: got ", si.si_status, + ", wanted ", code)); + } + if (si.si_code != CLD_EXITED) { + return PosixError(EINVAL, + absl::StrCat("unexpected code: got ", si.si_code, + ", wanted ", CLD_EXITED)); + } + auto const uid = getuid(); + if (si.si_uid != uid) { + return PosixError(EINVAL, + absl::StrCat("unexpected uid: got ", si.si_uid, + ", wanted ", uid)); + } + return static_cast(si.si_pid); + })); + +// Fixture for tests parameterized by a function that takes the PID of a +// specific child to wait for, waits for it to exit, and checks that it exits +// with the given code. +class WaitSpecificChildTest + : public ::testing::TestWithParam> { + protected: + PosixError WaitFor(pid_t pid, int code) { return GetParam()(pid, code); } +}; + +// Wait for specific child to exit. +TEST_P(WaitSpecificChildTest, Fork) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); +} + +// Non-zero exit codes are correctly propagated. +TEST_P(WaitSpecificChildTest, NormalExit) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitFor(child, 42)); +} + +// Wait for multiple children to exit. +TEST_P(WaitSpecificChildTest, MultipleFork) { + pid_t child1, child2; + ASSERT_THAT(child1 = ForkAndExit(0, 0), SyscallSucceeds()); + ASSERT_THAT(child2 = ForkAndExit(0, 0), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitFor(child1, 0)); + EXPECT_NO_ERRNO(WaitFor(child2, 0)); +} + +// Wait for multiple children to exit, out of the order they were created. 
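+// Waiting on an explicit pid (wait4(pid, ...) / waitid(P_PID, ...)) allows
+// children to be reaped in any order, unlike waiting for -1/P_ALL, which
+// returns whichever child exits first.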
+TEST_P(WaitSpecificChildTest, MultipleForkOutOfOrder) { + pid_t child1, child2; + ASSERT_THAT(child1 = ForkAndExit(0, 0), SyscallSucceeds()); + ASSERT_THAT(child2 = ForkAndExit(0, 0), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitFor(child2, 0)); + EXPECT_NO_ERRNO(WaitFor(child1, 0)); +} + +// Wait for specific child to exit, entering wait4 before the exit occurs. +TEST_P(WaitSpecificChildTest, ForkSleep) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 5), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); +} + +// Wait should block until the child exits. +TEST_P(WaitSpecificChildTest, ForkBlock) { + pid_t child; + + auto start = absl::Now(); + ASSERT_THAT(child = ForkAndExit(0, 5), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); + + EXPECT_GE(absl::Now() - start, absl::Seconds(5)); +} + +// Waiting after the child has already exited returns immediately. +TEST_P(WaitSpecificChildTest, AfterExit) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + + absl::SleepFor(absl::Seconds(5)); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); +} + +// Wait for specific child to exit. +// A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like +// a forked process. +TEST_P(WaitSpecificChildTest, CloneSIGCHLD) { + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int child; + ASSERT_THAT(child = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); +} + +// Wait for specific child to exit. +// A non-CLONE_THREAD child which does not send SIGCHLD upon exit can be waited +// on, but returns ECHILD. +TEST_P(WaitSpecificChildTest, CloneNoSIGCHLD) { + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int child; + ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds()); + + EXPECT_THAT( + WaitFor(child, 0), + PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); +} + +// Waiting after the child has already exited returns immediately. +TEST_P(WaitSpecificChildTest, CloneAfterExit) { + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int child; + // Send SIGCHLD for normal wait semantics. + ASSERT_THAT(child = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds()); + + absl::SleepFor(absl::Seconds(5)); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); +} + +// A CLONE_THREAD child cannot be waited on. +TEST_P(WaitSpecificChildTest, CloneThread) { + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int child; + ASSERT_THAT(child = CloneAndExit(15, stack, CLONE_THREAD), SyscallSucceeds()); + auto start = absl::Now(); + + EXPECT_THAT( + WaitFor(child, 0), + PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); + + // Ensure wait4 didn't block. + EXPECT_LE(absl::Now() - start, absl::Seconds(10)); + + // Since we can't wait on the child, we sleep to try to avoid freeing its + // stack before it exits. + absl::SleepFor(absl::Seconds(5)); +} + +// Return ECHILD for bad child. 
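+// A pid that does not exist, or that exists but is not a child of the
+// caller, yields ECHILD from both wait4(2) and waitid(2).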
+TEST_P(WaitSpecificChildTest, BadChild) { + EXPECT_THAT( + WaitFor(42, 0), + PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); +} + +// Wait for a child process that only exits after calling execve(2) from a +// non-leader thread. +TEST_P(WaitSpecificChildTest, AfterChildExecve) { + ExecveArray const owned_child_argv = {"/bin/true"}; + char* const* const child_argv = owned_child_argv.get(); + + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + pid_t const child = fork(); + if (child == 0) { + // Give the parent some time to start waiting. + SleepSafe(absl::Seconds(5)); + // Pass CLONE_VFORK to block the original thread in the child process until + // the clone thread calls execve, annihilating them both. (This means that + // if clone returns at all, something went wrong.) + // + // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's + // x86_64 implementation is safe. See glibc + // sysdeps/unix/sysv/linux/x86_64/clone.S. + clone( + +[](void* arg) { + auto child_argv = static_cast(arg); + execve(child_argv[0], child_argv, /* envp = */ nullptr); + return errno; + }, + reinterpret_cast(stack), + CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM | + CLONE_VFORK, + const_cast(child_argv)); + _exit(errno); + } + EXPECT_NO_ERRNO(WaitFor(child, 0)); +} + +INSTANTIATE_TEST_CASE_P( + Waiters, WaitSpecificChildTest, + ::testing::Values( + [](pid_t pid, int code) -> PosixError { + int status; + auto const rv = Wait4(pid, &status, 0, nullptr); + MaybeSave(); + if (rv < 0) { + return PosixError(errno, "wait4"); + } else if (rv != pid) { + return PosixError(EINVAL, absl::StrCat("unexpected pid: got ", rv, + ", wanted ", pid)); + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != code) { + return PosixError( + EINVAL, absl::StrCat("unexpected wait status: got ", status, + ", wanted ", code)); + } + return NoError(); + }, + [](pid_t pid, int code) -> PosixError { + siginfo_t si; + auto const rv = Waitid(P_PID, pid, &si, WEXITED); + MaybeSave(); + if (rv < 0) { + return PosixError(errno, "waitid"); + } + if (si.si_pid != pid) { + return PosixError(EINVAL, + absl::StrCat("unexpected pid: got ", si.si_pid, + ", wanted ", pid)); + } + if (si.si_signo != SIGCHLD) { + return PosixError( + EINVAL, absl::StrCat("unexpected signo: got ", si.si_signo, + ", wanted ", SIGCHLD)); + } + if (si.si_status != code) { + return PosixError( + EINVAL, absl::StrCat("unexpected status: got ", si.si_status, + ", wanted ", code)); + } + if (si.si_code != CLD_EXITED) { + return PosixError(EINVAL, + absl::StrCat("unexpected code: got ", si.si_code, + ", wanted ", CLD_EXITED)); + } + return NoError(); + })); + +// WIFEXITED, WIFSIGNALED, WTERMSIG indicate signal exit. +TEST(WaitTest, SignalExit) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 10), SyscallSucceeds()); + + EXPECT_THAT(kill(child, SIGKILL), SyscallSucceeds()); + + int status; + EXPECT_THAT(Wait4(child, &status, 0, nullptr), + SyscallSucceedsWithValue(child)); + + EXPECT_FALSE(WIFEXITED(status)); + EXPECT_TRUE(WIFSIGNALED(status)); + EXPECT_EQ(SIGKILL, WTERMSIG(status)); +} + +// A child that does not send a SIGCHLD on exit may be waited on with +// the __WCLONE flag. 
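+// Per wait4(2), __WCLONE selects "clone" children: those that deliver no
+// signal, or a signal other than SIGCHLD, to the parent on termination.
+// (__WALL would wait for both kinds.)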
+TEST(WaitTest, CloneWCLONE) { + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = Cleanup( + [this, stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int child; + ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds()); + + EXPECT_THAT(Wait4(child, nullptr, __WCLONE, nullptr), + SyscallSucceedsWithValue(child)); +} + +// waitid requires at least one option. +TEST(WaitTest, WaitidOptions) { + EXPECT_THAT(Waitid(P_ALL, 0, nullptr, 0), SyscallFailsWithErrno(EINVAL)); +} + +// waitid does not wait for a child to exit if not passed WEXITED. +TEST(WaitTest, WaitidNoWEXITED) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + EXPECT_THAT(Waitid(P_ALL, 0, nullptr, WSTOPPED), + SyscallFailsWithErrno(ECHILD)); + EXPECT_THAT(Waitid(P_ALL, 0, nullptr, WEXITED), SyscallSucceeds()); +} + +// WNOWAIT allows the same wait result to be returned again. +TEST(WaitTest, WaitidWNOWAIT) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds()); + + siginfo_t info; + ASSERT_THAT(Waitid(P_PID, child, &info, WEXITED | WNOWAIT), + SyscallSucceeds()); + EXPECT_EQ(child, info.si_pid); + EXPECT_EQ(SIGCHLD, info.si_signo); + EXPECT_EQ(CLD_EXITED, info.si_code); + EXPECT_EQ(42, info.si_status); + + ASSERT_THAT(Waitid(P_PID, child, &info, WEXITED), SyscallSucceeds()); + EXPECT_EQ(child, info.si_pid); + EXPECT_EQ(SIGCHLD, info.si_signo); + EXPECT_EQ(CLD_EXITED, info.si_code); + EXPECT_EQ(42, info.si_status); + + EXPECT_THAT(Waitid(P_PID, child, &info, WEXITED), + SyscallFailsWithErrno(ECHILD)); +} + +// waitpid(pid, status, options) is equivalent to +// wait4(pid, status, options, nullptr). +// This is a dedicated syscall on i386, glibc maps it to wait4 on amd64. +TEST(WaitTest, WaitPid) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds()); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + + EXPECT_TRUE(WIFEXITED(status)); + EXPECT_EQ(42, WEXITSTATUS(status)); +} + +// Test that signaling a zombie succeeds. This is a signals test that is in this +// file for some reason. +TEST(WaitTest, KillZombie) { + pid_t child; + ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds()); + + // Sleep for three seconds to ensure the child has exited. + absl::SleepFor(absl::Seconds(3)); + + // The child is now a zombie. Check that killing it returns 0. + EXPECT_THAT(kill(child, SIGTERM), SyscallSucceeds()); + EXPECT_THAT(kill(child, 0), SyscallSucceeds()); + + EXPECT_THAT(Wait4(child, nullptr, 0, nullptr), + SyscallSucceedsWithValue(child)); +} + +TEST(WaitTest, Wait4Rusage) { + pid_t child; + constexpr absl::Duration kSpin = absl::Seconds(3); + ASSERT_THAT(child = ForkSpinAndExit(21, absl::ToInt64Seconds(kSpin)), + SyscallSucceeds()); + + int status; + struct rusage rusage = {}; + ASSERT_THAT(Wait4(child, &status, 0, &rusage), + SyscallSucceedsWithValue(child)); + + EXPECT_TRUE(WIFEXITED(status)); + EXPECT_EQ(21, WEXITSTATUS(status)); + + EXPECT_GE(RusageCpuTime(rusage), kSpin); +} + +TEST(WaitTest, WaitidRusage) { + pid_t child; + constexpr absl::Duration kSpin = absl::Seconds(3); + ASSERT_THAT(child = ForkSpinAndExit(27, absl::ToInt64Seconds(kSpin)), + SyscallSucceeds()); + + siginfo_t si = {}; + struct rusage rusage = {}; + + // From waitid(2): + // The raw waitid() system call takes a fifth argument, of type + // struct rusage *. 
If this argument is non-NULL, then it is used + // to return resource usage information about the child, in the + // same manner as wait4(2). + EXPECT_THAT( + RetryEINTR(syscall)(SYS_waitid, P_PID, child, &si, WEXITED, &rusage), + SyscallSucceeds()); + EXPECT_EQ(si.si_signo, SIGCHLD); + EXPECT_EQ(si.si_code, CLD_EXITED); + EXPECT_EQ(si.si_status, 27); + EXPECT_EQ(si.si_pid, child); + + EXPECT_GE(RusageCpuTime(rusage), kSpin); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc new file mode 100644 index 000000000..ca6aafd18 --- /dev/null +++ b/test/syscalls/linux/write.cc @@ -0,0 +1,134 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/cleanup.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { +// This test is currently very rudimentary. +// +// TODO: +// * bad buffer states (EFAULT). +// * bad fds (wrong permission, wrong type of file, EBADF). +// * check offset is incremented. +// * check for EOF. +// * writing to pipes, symlinks, special files. +class WriteTest : public ::testing::Test { + public: + ssize_t WriteBytes(int fd, int bytes) { + std::vector buf(bytes); + std::fill(buf.begin(), buf.end(), 'a'); + return WriteFd(fd, buf.data(), buf.size()); + } +}; + +TEST_F(WriteTest, WriteNoExceedsRLimit) { + // Get the current rlimit and restore after test run. + struct rlimit initial_lim; + ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + auto cleanup = Cleanup([&initial_lim] { + EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + }); + + int fd; + struct rlimit setlim; + const int target_lim = 1024; + setlim.rlim_cur = target_lim; + setlim.rlim_max = RLIM_INFINITY; + const std::string pathname = NewTempAbsPath(); + ASSERT_THAT(fd = open(pathname.c_str(), O_WRONLY | O_CREAT, S_IRWXU), + SyscallSucceeds()); + ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds()); + + EXPECT_THAT(WriteBytes(fd, target_lim), SyscallSucceedsWithValue(target_lim)); + + std::vector buf(target_lim + 1); + std::fill(buf.begin(), buf.end(), 'a'); + EXPECT_THAT(pwrite(fd, buf.data(), target_lim, 1), SyscallSucceeds()); + EXPECT_THAT(pwrite64(fd, buf.data(), target_lim, 1), SyscallSucceeds()); + + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +TEST_F(WriteTest, WriteExceedsRLimit) { + // Get the current rlimit and restore after test run. 
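+ //
+ // A write that crosses RLIMIT_FSIZE is truncated at the limit, while a
+ // write starting at or beyond the limit fails with EFBIG and raises SIGXFSZ
+ // (see setrlimit(2) and write(2)). The test blocks SIGXFSZ and drains it
+ // with sigtimedwait() so the signal can be asserted instead of terminating
+ // the process.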
+ struct rlimit initial_lim; + ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + auto cleanup = Cleanup([&initial_lim] { + EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + }); + + int fd; + sigset_t filesize_mask; + sigemptyset(&filesize_mask); + sigaddset(&filesize_mask, SIGXFSZ); + + struct rlimit setlim; + const int target_lim = 1024; + setlim.rlim_cur = target_lim; + setlim.rlim_max = RLIM_INFINITY; + + const std::string pathname = NewTempAbsPath(); + ASSERT_THAT(fd = open(pathname.c_str(), O_WRONLY | O_CREAT, S_IRWXU), + SyscallSucceeds()); + ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds()); + ASSERT_THAT(sigprocmask(SIG_BLOCK, &filesize_mask, nullptr), + SyscallSucceeds()); + std::vector buf(target_lim + 2); + std::fill(buf.begin(), buf.end(), 'a'); + + EXPECT_THAT(write(fd, buf.data(), target_lim + 1), + SyscallSucceedsWithValue(target_lim)); + EXPECT_THAT(write(fd, buf.data(), 1), SyscallFailsWithErrno(EFBIG)); + struct timespec timelimit = {0, 0}; + EXPECT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, nullptr, &timelimit), + SyscallSucceedsWithValue(SIGXFSZ)); + + EXPECT_THAT(pwrite(fd, buf.data(), target_lim + 1, 1), + SyscallSucceedsWithValue(target_lim - 1)); + EXPECT_THAT(pwrite(fd, buf.data(), 1, target_lim), + SyscallFailsWithErrno(EFBIG)); + EXPECT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, nullptr, &timelimit), + SyscallSucceedsWithValue(SIGXFSZ)); + + EXPECT_THAT(pwrite64(fd, buf.data(), target_lim + 1, 1), + SyscallSucceedsWithValue(target_lim - 1)); + EXPECT_THAT(pwrite64(fd, buf.data(), 1, target_lim), + SyscallFailsWithErrno(EFBIG)); + EXPECT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, nullptr, &timelimit), + SyscallSucceedsWithValue(SIGXFSZ)); + + ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &filesize_mask, nullptr), + SyscallSucceeds()); + EXPECT_THAT(close(fd), SyscallSucceeds()); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/syscall_test.go b/test/syscalls/syscall_test.go new file mode 100644 index 000000000..8463289fe --- /dev/null +++ b/test/syscalls/syscall_test.go @@ -0,0 +1,245 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package syscall_test runs the syscall test suites in gVisor containers. It +// is meant to be run with "go test", and will panic if run on its own. +package syscall_test + +import ( + "flag" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "syscall" + "testing" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" + "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" + "gvisor.googlesource.com/gvisor/test/syscalls/gtest" +) + +// Location of syscall tests, relative to the repo root. 
+const testDir = "test/syscalls/linux" + +var ( + testName = flag.String("test-name", "", "name of test binary to run") + debug = flag.Bool("debug", false, "enable debug logs") + strace = flag.Bool("strace", false, "enable strace logs") + platform = flag.String("platform", "ptrace", "platform to run on") + parallel = flag.Bool("parallel", false, "run tests in parallel") +) + +func TestSyscalls(t *testing.T) { + if *testName == "" { + t.Fatalf("test-name flag must be provided") + } + + // Get path to test binary. + fullTestName := filepath.Join(testDir, *testName) + testBin, err := testutil.FindFile(fullTestName) + if err != nil { + t.Fatalf("FindFile(%q) failed: %v", fullTestName, err) + } + + // Get all test cases in each binary. + testCases, err := gtest.ParseTestCases(testBin) + if err != nil { + t.Fatalf("ParseTestCases(%q) failed: %v", testBin, err) + } + + // Make sure stdout and stderr are opened with O_APPEND, otherwise logs + // from outside the sandbox can (and will) stomp on logs from inside + // the sandbox. + for _, f := range []*os.File{os.Stdout, os.Stderr} { + flags, err := unix.FcntlInt(f.Fd(), unix.F_GETFL, 0) + if err != nil { + t.Fatalf("error getting file flags for %v: %v", f, err) + } + if flags&unix.O_APPEND == 0 { + flags |= unix.O_APPEND + if _, err := unix.FcntlInt(f.Fd(), unix.F_SETFL, flags); err != nil { + t.Fatalf("error setting file flags for %v: %v", f, err) + } + } + } + + for _, tc := range testCases { + // Capture tc. + tc := tc + + testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name) + t.Run(testName, func(t *testing.T) { + if *parallel { + t.Parallel() + } + + if *platform == "native" { + // Run the test case on host. + runTestCaseNative(testBin, tc, t) + return + } + + // Run the test case in runsc. + runTestCaseRunsc(testBin, tc, t) + }) + } +} + +// runTestCaseNative runs the test case directly on the host machine. +func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) { + // These tests might be running in parallel, so make sure they have a + // unique test temp dir. + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "") + if err != nil { + t.Fatalf("could not create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + // Replace TEST_TMPDIR in the current environment with something + // unique. + env := os.Environ() + newEnvVar := "TEST_TMPDIR=" + tmpDir + var found bool + for i, kv := range env { + if strings.HasPrefix(kv, "TEST_TMPDIR=") { + env[i] = newEnvVar + found = true + break + } + } + if !found { + env = append(env, newEnvVar) + } + // Remove the TEST_PREMATURE_EXIT_FILE variable and XML_OUTPUT_FILE + // from the environment. + env = filterEnv(env, []string{"TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"}) + + cmd := exec.Command(testBin, gtest.FilterTestFlag+"="+tc.FullName()) + cmd.Env = env + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + ws := err.(*exec.ExitError).Sys().(syscall.WaitStatus) + t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus()) + } +} + +// runsTestCaseRunsc runs the test case in runsc. 
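+// It builds a fresh root and bundle directory, constructs an OCI spec that
+// runs the test binary with a gtest filter for the single test case, and
+// executes it in a new sandbox on the configured platform.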
+func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("SetupRootDir failed: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfig() + conf.RootDir = rootDir + conf.Debug = *debug + conf.Strace = *strace + p, err := boot.MakePlatformType(*platform) + if err != nil { + t.Fatalf("error getting platform %q: %v", *platform, err) + } + conf.Platform = p + + // Run a new container with the test executable and filter for the + // given test suite and name. + spec := testutil.NewSpecWithArgs(testBin, gtest.FilterTestFlag+"="+tc.FullName()) + + // Mark the root as writeable, as some tests attempt to + // write to the rootfs, and expect EACCES, not EROFS. + spec.Root.Readonly = false + + // Set environment variable that indicates we are + // running in gVisor and with the given platform. + platformVar := "TEST_ON_GVISOR" + env := append(os.Environ(), platformVar+"="+*platform) + + // Remove the TEST_PREMATURE_EXIT_FILE variable and XML_OUTPUT_FILE + // from the environment. + env = filterEnv(env, []string{"TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"}) + + // Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to + // be backed by tmpfs. + for i, kv := range env { + if strings.HasPrefix(kv, "TEST_TMPDIR=") { + env[i] = "TEST_TMPDIR=/tmp" + break + } + } + + spec.Process.Env = env + + bundleDir, err := testutil.SetupBundleDir(spec) + if err != nil { + t.Fatalf("SetupBundleDir failed: %v", err) + } + defer os.RemoveAll(bundleDir) + + id := testutil.UniqueContainerID() + log.Infof("Running test %q in container %q", tc.FullName(), id) + specutils.LogSpec(spec) + ws, err := container.Run(id, spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("container.Run failed: %v", err) + } + if got := ws.ExitStatus(); got != 0 { + t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus()) + } +} + +// filterEnv returns an environment with the blacklisted variables removed. +func filterEnv(env, blacklist []string) []string { + var out []string + for _, kv := range env { + ok := true + for _, k := range blacklist { + if strings.HasPrefix(kv, k+"=") { + ok = false + break + } + } + if ok { + out = append(out, kv) + } + } + return out +} + +func TestMain(m *testing.M) { + flag.Parse() + + log.SetLevel(log.Warning) + if *debug { + log.SetLevel(log.Debug) + } + if err := testutil.ConfigureExePath(); err != nil { + panic(err.Error()) + } + + if *platform != "native" { + // The native tests don't expect to be running as root, but + // runsc requires it. + testutil.RunAsRoot() + } + + os.Exit(m.Run()) +} diff --git a/test/syscalls/syscall_test_runner.sh b/test/syscalls/syscall_test_runner.sh new file mode 100755 index 000000000..de479f68c --- /dev/null +++ b/test/syscalls/syscall_test_runner.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# syscall_test_runner.sh is a simple wrapper around the go syscall test runner. +# It exists so that we can build the syscall test runner once, and use it for +# all syscall tests, rather than build it for each test run. + +set -euf -o pipefail + +# The syscall test runner binary and arguments have all been passed as arguments +# to this shell script. +exec "$@" diff --git a/test/util/BUILD b/test/util/BUILD new file mode 100644 index 000000000..e4eec4ab9 --- /dev/null +++ b/test/util/BUILD @@ -0,0 +1,239 @@ +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "capability_util", + testonly = 1, + srcs = ["capability_util.cc"], + hdrs = ["capability_util.h"], + deps = [ + ":cleanup", + ":memory_util", + ":posix_error", + ":save_util", + ":test_util", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "file_descriptor", + testonly = 1, + hdrs = ["file_descriptor.h"], + deps = [ + ":logging", + ":posix_error", + ":save_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "proc_util", + testonly = 1, + srcs = ["proc_util.cc"], + hdrs = ["proc_util.h"], + deps = [ + ":fs_util", + ":posix_error", + ":test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "cleanup", + testonly = 1, + hdrs = ["cleanup.h"], +) + +cc_library( + name = "fs_util", + testonly = 1, + srcs = ["fs_util.cc"], + hdrs = ["fs_util.h"], + deps = [ + ":cleanup", + ":file_descriptor", + ":posix_error", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + +cc_test( + name = "fs_util_test", + size = "small", + srcs = ["fs_util_test.cc"], + deps = [ + ":fs_util", + ":posix_error", + ":temp_path", + ":test_util", + ], +) + +cc_library( + name = "logging", + testonly = 1, + srcs = ["logging.cc"], + hdrs = ["logging.h"], +) + +cc_library( + name = "memory_util", + testonly = 1, + hdrs = ["memory_util.h"], + deps = [ + ":logging", + ":posix_error", + ":save_util", + ":test_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_library( + name = "mount_util", + testonly = 1, + hdrs = ["mount_util.h"], + deps = [ + ":cleanup", + ":posix_error", + ":test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "save_util", + testonly = 1, + srcs = ["save_util.cc"], + hdrs = ["save_util.h"], +) + +cc_library( + name = "multiprocess_util", + testonly = 1, + srcs = ["multiprocess_util.cc"], + hdrs = ["multiprocess_util.h"], + deps = [ + ":cleanup", + ":file_descriptor", + ":posix_error", + ":save_util", + ":test_util", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "posix_error", + testonly = 1, + srcs = ["posix_error.cc"], + hdrs = ["posix_error.h"], + deps = [ + ":logging", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:variant", + "@com_google_googletest//:gtest", + ], +) + +cc_test( + name = "posix_error_test", + size = "small", + srcs = ["posix_error_test.cc"], + deps = [":posix_error"], +) + +cc_library( + name = "signal_util", + testonly = 1, + srcs = ["signal_util.cc"], + hdrs = ["signal_util.h"], + deps = [ + ":cleanup", + ":posix_error", + ":test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "temp_path", + testonly = 1, + srcs = ["temp_path.cc"], + hdrs = ["temp_path.h"], + deps = 
[ + ":fs_util", + ":posix_error", + ":test_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "test_util", + testonly = 1, + srcs = ["test_util.cc"], + hdrs = ["test_util.h"], + deps = [ + ":fs_util", + ":logging", + ":posix_error", + ":save_util", + "@com_github_gflags_gflags//:gflags", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/time", + "@com_google_glog//:glog", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "thread_util", + testonly = 1, + hdrs = ["thread_util.h"], + deps = [":logging"], +) + +cc_library( + name = "timer_util", + testonly = 1, + srcs = ["timer_util.cc"], + hdrs = ["timer_util.h"], + deps = [ + ":cleanup", + ":logging", + ":posix_error", + ":test_util", + "@com_google_absl//absl/time", + "@com_google_googletest//:gtest", + ], +) + +cc_test( + name = "test_util_test", + size = "small", + srcs = ["test_util_test.cc"], + deps = [":test_util"], +) + +cc_library( + name = "test_main", + testonly = 1, + srcs = ["test_main.cc"], + deps = [":test_util"], +) diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc new file mode 100644 index 000000000..0656775d6 --- /dev/null +++ b/test/util/capability_util.cc @@ -0,0 +1,79 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/capability_util.h" + +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "test/util/memory_util.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +PosixErrorOr CanCreateUserNamespace() { + // The most reliable way to determine if userns creation is possible is by + // trying to create one; see below. + ASSIGN_OR_RETURN_ERRNO( + auto child_stack, + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + int const child_pid = + clone(+[](void*) { return 0; }, + reinterpret_cast(child_stack.addr() + kPageSize), + CLONE_NEWUSER | SIGCHLD, /* arg = */ nullptr); + if (child_pid > 0) { + int status; + int const ret = waitpid(child_pid, &status, /* options = */ 0); + MaybeSave(); + if (ret < 0) { + return PosixError(errno, "waitpid"); + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + return PosixError( + ESRCH, absl::StrCat("child process exited with status ", status)); + } + return true; + } else if (errno == EPERM) { + // Per clone(2), EPERM can be returned if: + // + // - "CLONE_NEWUSER was specified in flags, but either the effective user ID + // or the effective group ID of the caller does not have a mapping in the + // parent namespace (see user_namespaces(7))." 
+ // + // - "(since Linux 3.9) CLONE_NEWUSER was specified in flags and the caller + // is in a chroot environment (i.e., the caller's root directory does + // not match the root directory of the mount namespace in which it + // resides)." + LOG(INFO) << "clone(CLONE_NEWUSER) failed with EPERM"; + return false; + } else if (errno == EUSERS) { + // "(since Linux 3.11) CLONE_NEWUSER was specified in flags, and the call + // would cause the limit on the number of nested user namespaces to be + // exceeded. See user_namespaces(7)." + LOG(INFO) << "clone(CLONE_NEWUSER) failed with EUSERS"; + return false; + } else { + // Unexpected error code; indicate an actual error. + return PosixError(errno, "clone(CLONE_NEWUSER)"); + } +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/capability_util.h b/test/util/capability_util.h new file mode 100644 index 000000000..8708f5e69 --- /dev/null +++ b/test/util/capability_util.h @@ -0,0 +1,101 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Utilities for testing capabilties. + +#ifndef GVISOR_TEST_UTIL_CAPABILITY_UTIL_H_ +#define GVISOR_TEST_UTIL_CAPABILITY_UTIL_H_ + +#include +#include +#include +#include + +#include "test/util/cleanup.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" +#include "test/util/test_util.h" + +#ifndef _LINUX_CAPABILITY_VERSION_3 +#error Expecting _LINUX_CAPABILITY_VERSION_3 support +#endif + +namespace gvisor { +namespace testing { + +// HaveCapability returns true if the process has the specified EFFECTIVE +// capability. +inline PosixErrorOr HaveCapability(int cap) { + if (!cap_valid(cap)) { + return PosixError(EINVAL, "Invalid capability"); + } + + struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; + struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); + MaybeSave(); + + return (caps[CAP_TO_INDEX(cap)].effective & CAP_TO_MASK(cap)) != 0; +} + +// SetCapability sets the specified EFFECTIVE capability. +inline PosixError SetCapability(int cap, bool set) { + if (!cap_valid(cap)) { + return PosixError(EINVAL, "Invalid capability"); + } + + struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; + struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); + MaybeSave(); + + if (set) { + caps[CAP_TO_INDEX(cap)].effective |= CAP_TO_MASK(cap); + } else { + caps[CAP_TO_INDEX(cap)].effective &= ~CAP_TO_MASK(cap); + } + header = {_LINUX_CAPABILITY_VERSION_3, 0}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capset, &header, &caps)); + MaybeSave(); + + return NoError(); +} + +// DropPermittedCapability drops the specified PERMITTED. The EFFECTIVE +// capabilities must be a subset of PERMITTED, so those are dropped as well. 
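+// Once removed from the PERMITTED set, a capability cannot be reacquired
+// with capset(2); it can only come back via execve(2) of a program that
+// grants it (see capabilities(7)), so this is a one-way drop for the test.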
+inline PosixError DropPermittedCapability(int cap) { + if (!cap_valid(cap)) { + return PosixError(EINVAL, "Invalid capability"); + } + + struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; + struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); + MaybeSave(); + + caps[CAP_TO_INDEX(cap)].effective &= ~CAP_TO_MASK(cap); + caps[CAP_TO_INDEX(cap)].permitted &= ~CAP_TO_MASK(cap); + + header = {_LINUX_CAPABILITY_VERSION_3, 0}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capset, &header, &caps)); + MaybeSave(); + + return NoError(); +} + +PosixErrorOr CanCreateUserNamespace(); + +} // namespace testing +} // namespace gvisor +#endif // GVISOR_TEST_UTIL_CAPABILITY_UTIL_H_ diff --git a/test/util/cleanup.h b/test/util/cleanup.h new file mode 100644 index 000000000..fb4724f97 --- /dev/null +++ b/test/util/cleanup.h @@ -0,0 +1,61 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_CLEANUP_H_ +#define GVISOR_TEST_UTIL_CLEANUP_H_ + +#include +#include + +namespace gvisor { +namespace testing { + +class Cleanup { + public: + Cleanup() : released_(true) {} + explicit Cleanup(std::function&& callback) : cb_(callback) {} + + Cleanup(Cleanup&& other) { + released_ = other.released_; + cb_ = other.Release(); + } + + Cleanup& operator=(Cleanup&& other) { + released_ = other.released_; + cb_ = other.Release(); + return *this; + } + + ~Cleanup() { + if (!released_) { + cb_(); + } + } + + std::function&& Release() { + released_ = true; + return std::move(cb_); + } + + private: + Cleanup(Cleanup const& other) = delete; + + bool released_ = false; + std::function cb_; +}; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_CLEANUP_H_ diff --git a/test/util/file_descriptor.h b/test/util/file_descriptor.h new file mode 100644 index 000000000..be8812d01 --- /dev/null +++ b/test/util/file_descriptor.h @@ -0,0 +1,134 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
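The capability helpers and the scoped Cleanup type checked in above are meant to be combined: a test flips an effective capability and relies on a Cleanup to put it back when the scope ends. The sketch below is illustrative only and not part of this change; the test name, the choice of CAP_NET_RAW, and the availability of the ASSERT_NO_ERRNO* macros from test_util.h inside the gvisor::testing namespace are assumptions.

// Illustrative sketch (not part of this change): temporarily drop an
// effective capability and restore it when the Cleanup goes out of scope.
TEST(CapabilityExample, TemporarilyDropEffectiveCapability) {
  const bool have_cap =
      ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW));
  if (!have_cap) {
    return;  // Nothing to drop in this environment.
  }
  ASSERT_NO_ERRNO(SetCapability(CAP_NET_RAW, false));
  Cleanup restore(
      [] { SetCapability(CAP_NET_RAW, true).IgnoreError(); });
  // ... exercise code paths that must not rely on CAP_NET_RAW ...
}

DropPermittedCapability above follows the same calling pattern but cannot be undone from inside the test process, since a capability removed from the permitted set cannot be re-acquired by that thread.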
+ +#ifndef GVISOR_TEST_UTIL_FILE_DESCRIPTOR_H_ +#define GVISOR_TEST_UTIL_FILE_DESCRIPTOR_H_ + +#include +#include +#include +#include + +#include +#include + +#include "gmock/gmock.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "test/util/logging.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" + +namespace gvisor { +namespace testing { + +// FileDescriptor is an RAII type class which takes ownership of a file +// descriptor. It will close the FD when this object goes out of scope. +class FileDescriptor { + public: + // Constructs an empty FileDescriptor (one that does not own a file + // descriptor). + FileDescriptor() = default; + + // Constructs a FileDescriptor that owns fd. If fd is negative, constructs an + // empty FileDescriptor. + explicit FileDescriptor(int fd) { set_fd(fd); } + + FileDescriptor(FileDescriptor&& orig) : fd_(orig.release()) {} + + FileDescriptor& operator=(FileDescriptor&& orig) { + reset(orig.release()); + return *this; + } + + PosixErrorOr Dup() const { + if (fd_ < 0) { + return PosixError(EINVAL, "Attempting to Dup unset fd"); + } + + int fd = dup(fd_); + if (fd < 0) { + return PosixError(errno, absl::StrCat("dup ", fd_)); + } + MaybeSave(); + return FileDescriptor(fd); + } + + FileDescriptor(FileDescriptor const& other) = delete; + FileDescriptor& operator=(FileDescriptor const& other) = delete; + + ~FileDescriptor() { reset(); } + + // If this object is non-empty, returns the owned file descriptor. (Ownership + // is retained by the FileDescriptor.) Otherwise returns -1. + int get() const { return fd_; } + + // If this object is non-empty, transfers ownership of the file descriptor to + // the caller and returns it. Otherwise returns -1. + int release() { + int const fd = fd_; + fd_ = -1; + return fd; + } + + // If this object is non-empty, closes the owned file descriptor (recording a + // test failure if the close fails). + void reset() { reset(-1); } + + // Like no-arg reset(), but the FileDescriptor takes ownership of fd after + // closing its existing file descriptor. + void reset(int fd) { + if (fd_ >= 0) { + TEST_PCHECK(close(fd_) == 0); + MaybeSave(); + } + set_fd(fd); + } + + private: + // Wrapper that coerces negative fd values other than -1 to -1 so that get() + // etc. return -1. + void set_fd(int fd) { fd_ = std::max(fd, -1); } + + int fd_ = -1; +}; + +// Wrapper around open(2) that returns a FileDescriptor. +inline PosixErrorOr Open(std::string const& path, int flags, + mode_t mode = 0) { + int fd = open(path.c_str(), flags, mode); + if (fd < 0) { + return PosixError(errno, absl::StrFormat("open(%s, %#x, %#o)", path.c_str(), + flags, mode)); + } + MaybeSave(); + return FileDescriptor(fd); +} + +// Wrapper around openat(2) that returns a FileDescriptor. 
+inline PosixErrorOr OpenAt(int dirfd, std::string const& path, + int flags, mode_t mode = 0) { + int fd = openat(dirfd, path.c_str(), flags, mode); + if (fd < 0) { + return PosixError(errno, absl::StrFormat("openat(%d, %s, %#x, %#o)", dirfd, + path, flags, mode)); + } + MaybeSave(); + return FileDescriptor(fd); +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_FILE_DESCRIPTOR_H_ diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc new file mode 100644 index 000000000..e7e8be1d8 --- /dev/null +++ b/test/util/fs_util.cc @@ -0,0 +1,585 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/fs_util.h" + +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +namespace { +PosixError WriteContentsToFD(int fd, absl::string_view contents) { + int written = 0; + while (static_cast(written) < contents.size()) { + int wrote = write(fd, contents.data() + written, contents.size() - written); + if (wrote < 0) { + if (errno == EINTR) { + continue; + } + return PosixError( + errno, absl::StrCat("WriteContentsToFD fd: ", fd, " write failure.")); + } + written += wrote; + } + return NoError(); +} +} // namespace + +namespace internal { + +// Given a collection of file paths, append them all together, +// ensuring that the proper path separators are inserted between them. +std::string JoinPathImpl(std::initializer_list paths) { + std::string result; + + if (paths.size() != 0) { + // This size calculation is worst-case: it assumes one extra "/" for every + // path other than the first. + size_t total_size = paths.size() - 1; + for (const absl::string_view path : paths) total_size += path.size(); + result.resize(total_size); + + auto begin = result.begin(); + auto out = begin; + bool trailing_slash = false; + for (absl::string_view path : paths) { + if (path.empty()) continue; + if (path.front() == '/') { + if (trailing_slash) { + path.remove_prefix(1); + } + } else { + if (!trailing_slash && out != begin) *out++ = '/'; + } + const size_t this_size = path.size(); + memcpy(&*out, path.data(), this_size); + out += this_size; + trailing_slash = out[-1] == '/'; + } + result.erase(out - begin); + } + return result; +} +} // namespace internal + +// Returns a status or the current working directory. 
+PosixErrorOr GetCWD() { + char buffer[PATH_MAX + 1] = {}; + if (getcwd(buffer, PATH_MAX) == nullptr) { + return PosixError(errno, "GetCWD() failed"); + } + + return std::string(buffer); +} + +PosixErrorOr Stat(absl::string_view path) { + struct stat stat_buf; + int res = stat(std::string(path).c_str(), &stat_buf); + if (res < 0) { + return PosixError(errno, absl::StrCat("stat ", path)); + } + return stat_buf; +} + +PosixErrorOr Exists(absl::string_view path) { + struct stat stat_buf; + int res = stat(std::string(path).c_str(), &stat_buf); + if (res < 0) { + if (errno == ENOENT) { + return false; + } + return PosixError(errno, absl::StrCat("stat ", path)); + } + return true; +} + +PosixErrorOr IsDirectory(absl::string_view path) { + ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Stat(path)); + if (S_ISDIR(stat_buf.st_mode)) { + return true; + } + + return false; +} + +PosixError Delete(absl::string_view path) { + int res = unlink(std::string(path).c_str()); + if (res < 0) { + return PosixError(errno, absl::StrCat("unlink ", path)); + } + + return NoError(); +} + +PosixError Truncate(absl::string_view path, int length) { + int res = truncate(std::string(path).c_str(), length); + if (res < 0) { + return PosixError(errno, + absl::StrCat("truncate ", path, " to length ", length)); + } + + return NoError(); +} + +PosixError Chmod(absl::string_view path, int mode) { + int res = chmod(std::string(path).c_str(), mode); + if (res < 0) { + return PosixError(errno, absl::StrCat("chmod ", path)); + } + + return NoError(); +} + +PosixError Mkdir(absl::string_view path, int mode) { + int res = mkdir(std::string(path).c_str(), mode); + if (res < 0) { + return PosixError(errno, absl::StrCat("mkdir ", path, " mode ", mode)); + } + + return NoError(); +} + +PosixError Rmdir(absl::string_view path) { + int res = rmdir(std::string(path).c_str()); + if (res < 0) { + return PosixError(errno, absl::StrCat("rmdir ", path)); + } + + return NoError(); +} + +PosixError SetContents(absl::string_view path, absl::string_view contents) { + ASSIGN_OR_RETURN_ERRNO(bool exists, Exists(path)); + if (!exists) { + return PosixError( + ENOENT, absl::StrCat("SetContents file ", path, " doesn't exist.")); + } + + ASSIGN_OR_RETURN_ERRNO(auto fd, Open(std::string(path), O_WRONLY | O_TRUNC)); + return WriteContentsToFD(fd.get(), contents); +} + +// Create a file with the given contents (if it does not already exist with the +// given mode) and then set the contents. +PosixError CreateWithContents(absl::string_view path, + absl::string_view contents, int mode) { + ASSIGN_OR_RETURN_ERRNO( + auto fd, Open(std::string(path), O_WRONLY | O_CREAT | O_TRUNC, mode)); + return WriteContentsToFD(fd.get(), contents); +} + +PosixError GetContents(absl::string_view path, std::string* output) { + ASSIGN_OR_RETURN_ERRNO(auto fd, Open(std::string(path), O_RDONLY)); + output->clear(); + + // Keep reading until we hit an EOF or an error. + return GetContentsFD(fd.get(), output); +} + +PosixErrorOr GetContents(absl::string_view path) { + std::string ret; + RETURN_IF_ERRNO(GetContents(path, &ret)); + return ret; +} + +PosixErrorOr GetContentsFD(int fd) { + std::string ret; + RETURN_IF_ERRNO(GetContentsFD(fd, &ret)); + return ret; +} + +PosixError GetContentsFD(int fd, std::string* output) { + // Keep reading until we hit an EOF or an error. + while (true) { + char buf[16 * 1024] = {}; // Read in 16KB chunks. 
+ int bytes_read = read(fd, buf, sizeof(buf)); + if (bytes_read < 0) { + if (errno == EINTR) { + continue; + } + return PosixError(errno, "GetContentsFD read failure."); + } + + if (bytes_read == 0) { + break; // EOF. + } + + output->append(buf, bytes_read); + } + return NoError(); +} + +PosixErrorOr ReadLink(absl::string_view path) { + char buf[PATH_MAX + 1] = {}; + int ret = readlink(std::string(path).c_str(), buf, PATH_MAX); + if (ret < 0) { + return PosixError(errno, absl::StrCat("readlink ", path)); + } + + return std::string(buf, ret); +} + +PosixError WalkTree( + absl::string_view path, bool recursive, + const std::function& cb) { + DIR* dir = opendir(std::string(path).c_str()); + if (dir == nullptr) { + return PosixError(errno, absl::StrCat("opendir ", path)); + } + auto dir_closer = Cleanup([&dir]() { closedir(dir); }); + while (true) { + // Readdir(3): If the end of the directory stream is reached, NULL is + // returned and errno is not changed. If an error occurs, NULL is returned + // and errno is set appropriately. To distinguish end of stream and from an + // error, set errno to zero before calling readdir() and then check the + // value of errno if NULL is returned. + errno = 0; + struct dirent* dp = readdir(dir); + if (dp == nullptr) { + if (errno != 0) { + return PosixError(errno, absl::StrCat("readdir ", path)); + } + break; // We're done. + } + + if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0) { + // Skip dots. + continue; + } + + auto full_path = JoinPath(path, dp->d_name); + ASSIGN_OR_RETURN_ERRNO(struct stat s, Stat(full_path)); + if (S_ISDIR(s.st_mode) && recursive) { + RETURN_IF_ERRNO(WalkTree(full_path, recursive, cb)); + } else { + cb(full_path, s); + } + } + // We're done walking so let's invoke our cleanup callback now. + dir_closer.Release()(); + + // And we have to dispatch the callback on the base directory. + ASSIGN_OR_RETURN_ERRNO(struct stat s, Stat(path)); + cb(path, s); + + return NoError(); +} + +PosixErrorOr> ListDir(absl::string_view abspath, + bool skipdots) { + std::vector files; + + DIR* dir = opendir(std::string(abspath).c_str()); + if (dir == nullptr) { + return PosixError(errno, absl::StrCat("opendir ", abspath)); + } + auto dir_closer = Cleanup([&dir]() { closedir(dir); }); + while (true) { + // Readdir(3): If the end of the directory stream is reached, NULL is + // returned and errno is not changed. If an error occurs, NULL is returned + // and errno is set appropriately. To distinguish end of stream and from an + // error, set errno to zero before calling readdir() and then check the + // value of errno if NULL is returned. + errno = 0; + struct dirent* dp = readdir(dir); + if (dp == nullptr) { + if (errno != 0) { + return PosixError(errno, absl::StrCat("readdir ", abspath)); + } + break; // We're done. + } + + if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0) { + if (skipdots) { + continue; + } + } + files.push_back(std::string(dp->d_name)); + } + + return files; +} + +PosixError RecursivelyDelete(absl::string_view path, int* undeleted_dirs, + int* undeleted_files) { + ASSIGN_OR_RETURN_ERRNO(bool exists, Exists(path)); + if (!exists) { + return PosixError(ENOENT, absl::StrCat(path, " does not exist")); + } + + ASSIGN_OR_RETURN_ERRNO(bool dir, IsDirectory(path)); + if (!dir) { + // Nothing recursive needs to happen we can just call Delete. 
+ auto status = Delete(path); + if (!status.ok() && undeleted_files) { + (*undeleted_files)++; + } + return status; + } + + return WalkTree(path, /*recursive=*/true, + [&](absl::string_view absolute_path, const struct stat& s) { + if (S_ISDIR(s.st_mode)) { + auto rm_status = Rmdir(absolute_path); + if (!rm_status.ok() && undeleted_dirs) { + (*undeleted_dirs)++; + } + } else { + auto delete_status = Delete(absolute_path); + if (!delete_status.ok() && undeleted_files) { + (*undeleted_files)++; + } + } + }); +} + +PosixError RecursivelyCreateDir(absl::string_view path) { + if (path.empty() || path == "/") { + return PosixError(EINVAL, "Cannot create root!"); + } + + // Does it already exist, if so we're done. + ASSIGN_OR_RETURN_ERRNO(bool exists, Exists(path)); + if (exists) { + return NoError(); + } + + // Do we need to create directories under us? + auto dirname = Dirname(path); + ASSIGN_OR_RETURN_ERRNO(exists, Exists(dirname)); + if (!exists) { + RETURN_IF_ERRNO(RecursivelyCreateDir(dirname)); + } + + return Mkdir(path); +} + +// Makes a path absolute with respect to an optional base. If no base is +// provided it will use the current working directory. +PosixErrorOr MakeAbsolute(absl::string_view filename, + absl::string_view base) { + if (filename.empty()) { + return PosixError(EINVAL, "filename cannot be empty."); + } + + if (filename[0] == '/') { + // This path is already absolute. + return std::string(filename); + } + + std::string actual_base; + if (!base.empty()) { + actual_base = std::string(base); + } else { + auto cwd_or = GetCWD(); + RETURN_IF_ERRNO(cwd_or.error()); + actual_base = cwd_or.ValueOrDie(); + } + + // Reverse iterate removing trailing slashes, effectively right trim '/'. + for (int i = actual_base.size() - 1; i >= 0 && actual_base[i] == '/'; --i) { + actual_base.erase(i, 1); + } + + if (filename == ".") { + return actual_base.empty() ? "/" : actual_base; + } + + return absl::StrCat(actual_base, "/", filename); +} + +std::string CleanPath(const absl::string_view unclean_path) { + std::string path = std::string(unclean_path); + const char *src = path.c_str(); + std::string::iterator dst = path.begin(); + + // Check for absolute path and determine initial backtrack limit. + const bool is_absolute_path = *src == '/'; + if (is_absolute_path) { + *dst++ = *src++; + while (*src == '/') ++src; + } + std::string::const_iterator backtrack_limit = dst; + + // Process all parts + while (*src) { + bool parsed = false; + + if (src[0] == '.') { + // 1dot ".", check for END or SEP. + if (src[1] == '/' || !src[1]) { + if (*++src) { + ++src; + } + parsed = true; + } else if (src[1] == '.' && (src[2] == '/' || !src[2])) { + // 2dot END or SEP (".." | "../"). + src += 2; + if (dst != backtrack_limit) { + // We can backtrack the previous part + for (--dst; dst != backtrack_limit && dst[-1] != '/'; --dst) { + // Empty. + } + } else if (!is_absolute_path) { + // Failed to backtrack and we can't skip it either. Rewind and copy. + src -= 2; + *dst++ = *src++; + *dst++ = *src++; + if (*src) { + *dst++ = *src; + } + // We can never backtrack over a copied "../" part so set new limit. + backtrack_limit = dst; + } + if (*src) { + ++src; + } + parsed = true; + } + } + + // If not parsed, copy entire part until the next SEP or EOS. + if (!parsed) { + while (*src && *src != '/') { + *dst++ = *src++; + } + if (*src) { + *dst++ = *src++; + } + } + + // Skip consecutive SEP occurrences + while (*src == '/') { + ++src; + } + } + + // Calculate and check the length of the cleaned path. 
+ int path_length = dst - path.begin(); + if (path_length != 0) { + // Remove trailing '/' except if it is root path ("/" ==> path_length := 1) + if (path_length > 1 && path[path_length - 1] == '/') { + --path_length; + } + path.resize(path_length); + } else { + // The cleaned path is empty; assign "." as per the spec. + path.assign(1, '.'); + } + return path; +} + +PosixErrorOr GetRelativePath(absl::string_view source, + absl::string_view dest) { + if (!absl::StartsWith(source, "/") || !absl::StartsWith(dest, "/")) { + // At least one of the inputs is not an absolute path. + return PosixError( + EINVAL, + "GetRelativePath: At least one of the inputs is not an absolute path."); + } + const std::string clean_source = CleanPath(source); + const std::string clean_dest = CleanPath(dest); + auto source_parts = absl::StrSplit(clean_source, '/', absl::SkipEmpty()); + auto dest_parts = absl::StrSplit(clean_dest, '/', absl::SkipEmpty()); + auto source_iter = source_parts.begin(); + auto dest_iter = dest_parts.begin(); + + // Advance past common prefix. + while (source_iter != source_parts.end() && dest_iter != dest_parts.end() && + *source_iter == *dest_iter) { + ++source_iter; + ++dest_iter; + } + + // Build result backtracking. + std::string result = ""; + while (source_iter != source_parts.end()) { + absl::StrAppend(&result, "../"); + ++source_iter; + } + + // Add remaining path to dest. + while (dest_iter != dest_parts.end()) { + absl::StrAppend(&result, *dest_iter, "/"); + ++dest_iter; + } + + if (result.empty()) { + return std::string("."); + } + + // Remove trailing slash. + result.erase(result.size() - 1); + return result; +} + +absl::string_view Dirname(absl::string_view path) { + return SplitPath(path).first; +} + +absl::string_view Basename(absl::string_view path) { + return SplitPath(path).second; +} + +std::pair SplitPath( + absl::string_view path) { + std::string::size_type pos = path.find_last_of('/'); + + // Handle the case with no '/' in 'path'. + if (pos == absl::string_view::npos) + return std::make_pair(path.substr(0, 0), path); + + // Handle the case with a single leading '/' in 'path'. + if (pos == 0) + return std::make_pair(path.substr(0, 1), absl::ClippedSubstr(path, 1)); + + return std::make_pair(path.substr(0, pos), + absl::ClippedSubstr(path, pos + 1)); +} + +std::string JoinPath(absl::string_view path1, absl::string_view path2) { + if (path1.empty()) return std::string(path2); + if (path2.empty()) return std::string(path1); + if (path1.back() == '/') { + if (path2.front() == '/') + return absl::StrCat(path1, absl::ClippedSubstr(path2, 1)); + } else { + if (path2.front() != '/') return absl::StrCat(path1, "/", path2); + } + return absl::StrCat(path1, path2); +} + +PosixErrorOr ProcessExePath(int pid) { + if (pid <= 0) { + return PosixError(EINVAL, "Invalid pid specified"); + } + + return ReadLink(absl::StrCat("/proc/", pid, "/exe")); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/fs_util.h b/test/util/fs_util.h new file mode 100644 index 000000000..9412b2f71 --- /dev/null +++ b/test/util/fs_util.h @@ -0,0 +1,182 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_FS_UTIL_H_ +#define GVISOR_TEST_UTIL_FS_UTIL_H_ + +#include +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { +// Returns a status or the current working directory. +PosixErrorOr GetCWD(); + +// Returns true/false depending on whether or not path exists, or an error if it +// can't be determined. +PosixErrorOr Exists(absl::string_view path); + +// Returns a stat structure for the given path or an error. +PosixErrorOr Stat(absl::string_view path); + +// Deletes the file or directory at path or returns an error. +PosixError Delete(absl::string_view path); + +// Changes the mode of a file or returns an error. +PosixError Chmod(absl::string_view path, int mode); + +// Truncates a file to the given length or returns an error. +PosixError Truncate(absl::string_view path, int length); + +// Returns true/false depending on whether or not the path is a directory or +// returns an error. +PosixErrorOr IsDirectory(absl::string_view path); + +// Makes a directory or returns an error. +PosixError Mkdir(absl::string_view path, int mode = 0755); + +// Removes a directory or returns an error. +PosixError Rmdir(absl::string_view path); + +// Attempts to set the contents of a file or returns an error. +PosixError SetContents(absl::string_view path, absl::string_view contents); + +// Creates a file with the given contents and mode or returns an error. +PosixError CreateWithContents(absl::string_view path, + absl::string_view contents, int mode = 0666); + +// Attempts to read the entire contents of the file into the provided std::string +// buffer or returns an error. +PosixError GetContents(absl::string_view path, std::string* output); + +// Attempts to read the entire contents of the file or returns an error. +PosixErrorOr GetContents(absl::string_view path); + +// Attempts to read the entire contents of the provided fd into the provided +// std::string or returns an error. +PosixError GetContentsFD(int fd, std::string* output); + +// Attempts to read the entire contents of the provided fd or returns an error. +PosixErrorOr GetContentsFD(int fd); + +// Executes the readlink(2) system call or returns an error. +PosixErrorOr ReadLink(absl::string_view path); + +// WalkTree will walk a directory tree in a depth first search manner (if +// recursive). It will invoke a provided callback for each file and directory, +// the parent will always be invoked last making this appropriate for things +// such as deleting an entire directory tree. +// +// This method will return an error when it's unable to access the provided +// path, or when the path is not a directory. +PosixError WalkTree( + absl::string_view path, bool recursive, + const std::function& cb); + +// Returns the base filenames for all files under a given absolute path. If +// skipdots is true the returned vector will not contain "." or "..". This +// method does not walk the tree recursively it only returns the elements +// in that directory. 
+PosixErrorOr> ListDir(absl::string_view abspath, + bool skipdots); + +// Attempt to recursively delete a directory or file. Returns an error and +// the number of undeleted directories and files. If either +// undeleted_dirs or undeleted_files is nullptr then it will not be used. +PosixError RecursivelyDelete(absl::string_view path, int* undeleted_dirs, + int* undeleted_files); + +// Recursively create the directory provided or return an error. +PosixError RecursivelyCreateDir(absl::string_view path); + +// Makes a path absolute with respect to an optional base. If no base is +// provided it will use the current working directory. +PosixErrorOr MakeAbsolute(absl::string_view filename, + absl::string_view base); + +// Generates a relative path from the source directory to the destination +// (dest) file or directory. This uses ../ when necessary for destinations +// which are not nested within the source. Both source and dest are required +// to be absolute paths, and an empty std::string will be returned if they are not. +PosixErrorOr GetRelativePath(absl::string_view source, + absl::string_view dest); + +// Returns the part of the path before the final "/", EXCEPT: +// * If there is a single leading "/" in the path, the result will be the +// leading "/". +// * If there is no "/" in the path, the result is the empty prefix of the +// input std::string. +absl::string_view Dirname(absl::string_view path); + +// Return the parts of the path, split on the final "/". If there is no +// "/" in the path, the first part of the output is empty and the second +// is the input. If the only "/" in the path is the first character, it is +// the first part of the output. +std::pair SplitPath( + absl::string_view path); + +// Returns the part of the path after the final "/". If there is no +// "/" in the path, the result is the same as the input. +// Note that this function's behavior differs from the Unix basename +// command if path ends with "/". For such paths, this function returns the +// empty std::string. +absl::string_view Basename(absl::string_view path); + +// Collapse duplicate "/"s, resolve ".." and "." path elements, remove +// trailing "/". +// +// NOTE: This respects relative vs. absolute paths, but does not +// invoke any system calls (getcwd(2)) in order to resolve relative +// paths wrt actual working directory. That is, this is purely a +// std::string manipulation, completely independent of process state. +std::string CleanPath(absl::string_view path); + +// Returns the full path to the executable of the given pid or a PosixError. +PosixErrorOr ProcessExePath(int pid); + +namespace internal { +// Not part of the public API. +std::string JoinPathImpl(std::initializer_list paths); +} // namespace internal + +// Join multiple paths together. +// All paths will be treated as relative paths, regardless of whether or not +// they start with a leading '/'. That is, all paths will be concatenated +// together, with the appropriate path separator inserted in between. +// Arguments must be convertible to absl::string_view. +// +// Usage: +// std::string path = JoinPath("/foo", dirname, filename); +// std::string path = JoinPath(FLAGS_test_srcdir, filename); +// +// 0, 1, 2-path specializations exist to optimize common cases. 
+inline std::string JoinPath() { return std::string(); } +inline std::string JoinPath(absl::string_view path) { + return std::string(path.data(), path.size()); +} + +std::string JoinPath(absl::string_view path1, absl::string_view path2); +template +inline std::string JoinPath(absl::string_view path1, absl::string_view path2, + absl::string_view path3, const T&... args) { + return internal::JoinPathImpl({path1, path2, path3, args...}); +} +} // namespace testing +} // namespace gvisor +#endif // GVISOR_TEST_UTIL_FS_UTIL_H_ diff --git a/test/util/fs_util_test.cc b/test/util/fs_util_test.cc new file mode 100644 index 000000000..ce70d58aa --- /dev/null +++ b/test/util/fs_util_test.cc @@ -0,0 +1,100 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(FsUtilTest, RecursivelyCreateDirManualDelete) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string base_path = JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); + + ASSERT_THAT(Exists(base_path), IsPosixErrorOkAndHolds(false)); + ASSERT_NO_ERRNO(RecursivelyCreateDir(base_path)); + + // Delete everything until we hit root and then stop, we want to try this + // without using RecursivelyDelete. 
+ std::string cur_path = base_path; + while (cur_path != root.path()) { + ASSERT_THAT(Exists(cur_path), IsPosixErrorOkAndHolds(true)); + ASSERT_NO_ERRNO(Rmdir(cur_path)); + ASSERT_THAT(Exists(cur_path), IsPosixErrorOkAndHolds(false)); + auto dir = Dirname(cur_path); + cur_path = std::string(dir); + } +} + +TEST(FsUtilTest, RecursivelyCreateAndDeleteDir) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string base_path = JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); + + ASSERT_THAT(Exists(base_path), IsPosixErrorOkAndHolds(false)); + ASSERT_NO_ERRNO(RecursivelyCreateDir(base_path)); + + const std::string sub_path = JoinPath(root.path(), "a"); + ASSERT_NO_ERRNO(RecursivelyDelete(sub_path, nullptr, nullptr)); + ASSERT_THAT(Exists(sub_path), IsPosixErrorOkAndHolds(false)); +} + +TEST(FsUtilTest, RecursivelyCreateAndDeletePartial) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string base_path = JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); + + ASSERT_THAT(Exists(base_path), IsPosixErrorOkAndHolds(false)); + ASSERT_NO_ERRNO(RecursivelyCreateDir(base_path)); + + const std::string a = JoinPath(root.path(), "a"); + auto listing = ASSERT_NO_ERRNO_AND_VALUE(ListDir(a, true)); + ASSERT_THAT(listing, ::testing::Contains("b")); + ASSERT_EQ(listing.size(), 1); + + listing = ASSERT_NO_ERRNO_AND_VALUE(ListDir(a, false)); + ASSERT_THAT(listing, ::testing::Contains(".")); + ASSERT_THAT(listing, ::testing::Contains("..")); + ASSERT_THAT(listing, ::testing::Contains("b")); + ASSERT_EQ(listing.size(), 3); + + const std::string sub_path = JoinPath(root.path(), "/a/b/c/d/e/f"); + + ASSERT_NO_ERRNO( + CreateWithContents(JoinPath(Dirname(sub_path), "file"), "Hello World")); + std::string contents = ""; + ASSERT_NO_ERRNO(GetContents(JoinPath(Dirname(sub_path), "file"), &contents)); + ASSERT_EQ(contents, "Hello World"); + + ASSERT_NO_ERRNO(RecursivelyDelete(sub_path, nullptr, nullptr)); + ASSERT_THAT(Exists(sub_path), IsPosixErrorOkAndHolds(false)); + + // The parent of the subpath (directory e) should still exist. + ASSERT_THAT(Exists(Dirname(sub_path)), IsPosixErrorOkAndHolds(true)); + + // The file we created along side f should also still exist. + ASSERT_THAT(Exists(JoinPath(Dirname(sub_path), "file")), + IsPosixErrorOkAndHolds(true)); +} +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/util/logging.cc b/test/util/logging.cc new file mode 100644 index 000000000..86ea71df3 --- /dev/null +++ b/test/util/logging.cc @@ -0,0 +1,97 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/logging.h" + +#include +#include +#include +#include + +namespace gvisor { +namespace testing { + +namespace { + +// We implement this here instead of using test_util to avoid cyclic +// dependencies. 
+int Write(int fd, const char* buf, size_t size) { + size_t written = 0; + while (written < size) { + int res = write(fd, buf + written, size - written); + if (res < 0 && errno == EINTR) { + continue; + } else if (res <= 0) { + break; + } + + written += res; + } + return static_cast(written); +} + +// Write 32-bit decimal number to fd. +int WriteNumber(int fd, uint32_t val) { + constexpr char kDigits[] = "0123456789"; + constexpr int kBase = 10; + + // 10 chars for 32-bit number in decimal, 1 char for the NUL-terminator. + constexpr int kBufferSize = 11; + char buf[kBufferSize]; + + // Convert the number to std::string. + char* s = buf + sizeof(buf) - 1; + size_t size = 0; + + *s = '\0'; + do { + s--; + size++; + + *s = kDigits[val % kBase]; + val /= kBase; + } while (val); + + return Write(fd, s, size); +} + +} // namespace + +void CheckFailure(const char* cond, size_t cond_size, const char* msg, + size_t msg_size, bool include_errno) { + int saved_errno = errno; + + constexpr char kCheckFailure[] = "Check failed: "; + Write(2, kCheckFailure, sizeof(kCheckFailure) - 1); + Write(2, cond, cond_size); + + if (msg != nullptr) { + Write(2, ": ", 2); + Write(2, msg, msg_size); + } + + if (include_errno) { + constexpr char kErrnoMessage[] = " (errno "; + Write(2, kErrnoMessage, sizeof(kErrnoMessage) - 1); + WriteNumber(2, saved_errno); + Write(2, ")", 1); + } + + Write(2, "\n", 1); + + abort(); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/logging.h b/test/util/logging.h new file mode 100644 index 000000000..6e957b172 --- /dev/null +++ b/test/util/logging.h @@ -0,0 +1,73 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_LOGGING_H_ +#define GVISOR_TEST_UTIL_LOGGING_H_ + +#include + +namespace gvisor { +namespace testing { + +void CheckFailure(const char* cond, size_t cond_size, const char* msg, + size_t msg_size, bool include_errno); + +// If cond is false, aborts the current process. +// +// This macro is async-signal-safe. +#define TEST_CHECK(cond) \ + do { \ + if (!(cond)) { \ + ::gvisor::testing::CheckFailure(#cond, sizeof(#cond) - 1, nullptr, \ + 0, false); \ + } \ + } while (0) + +// If cond is false, logs msg then aborts the current process. +// +// This macro is async-signal-safe. +#define TEST_CHECK_MSG(cond, msg) \ + do { \ + if (!(cond)) { \ + ::gvisor::testing::CheckFailure(#cond, sizeof(#cond) - 1, msg, \ + sizeof(msg) - 1, false); \ + } \ + } while (0) + +// If cond is false, logs errno, then aborts the current process. +// +// This macro is async-signal-safe. +#define TEST_PCHECK(cond) \ + do { \ + if (!(cond)) { \ + ::gvisor::testing::CheckFailure(#cond, sizeof(#cond) - 1, nullptr, \ + 0, true); \ + } \ + } while (0) + +// If cond is false, logs msg and errno, then aborts the current process. +// +// This macro is async-signal-safe. 
+#define TEST_PCHECK_MSG(cond, msg) \ + do { \ + if (!(cond)) { \ + ::gvisor::testing::CheckFailure(#cond, sizeof(#cond) - 1, msg, \ + sizeof(msg) - 1, true); \ + } \ + } while (0) + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_LOGGING_H_ diff --git a/test/util/memory_util.h b/test/util/memory_util.h new file mode 100644 index 000000000..8f6e99ba6 --- /dev/null +++ b/test/util/memory_util.h @@ -0,0 +1,124 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_MEMORY_UTIL_H_ +#define GVISOR_TEST_UTIL_MEMORY_UTIL_H_ + +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "test/util/logging.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// RAII type for mmap'ed memory. Only usable in tests due to use of a test-only +// macro that can't be named without invoking the presubmit's wrath. +class Mapping { + public: + // Constructs a mapping that owns nothing. + Mapping() = default; + + // Constructs a mapping that owns the mmapped memory [ptr, ptr+len). Most + // users should use Mmap or MmapAnon instead. + Mapping(void* ptr, size_t len) : ptr_(ptr), len_(len) {} + + Mapping(Mapping&& orig) : ptr_(orig.ptr_), len_(orig.len_) { orig.release(); } + + Mapping& operator=(Mapping&& orig) { + ptr_ = orig.ptr_; + len_ = orig.len_; + orig.release(); + return *this; + } + + Mapping(Mapping const&) = delete; + Mapping& operator=(Mapping const&) = delete; + + ~Mapping() { reset(); } + + void* ptr() const { return ptr_; } + size_t len() const { return len_; } + + // Returns a pointer to the end of the mapping. Useful for when the mapping + // is used as a thread stack. + void* endptr() const { return reinterpret_cast(addr() + len_); } + + // Returns the start of this mapping cast to uintptr_t for ease of pointer + // arithmetic. + uintptr_t addr() const { return reinterpret_cast(ptr_); } + + // Returns the end of this mapping cast to uintptr_t for ease of pointer + // arithmetic. + uintptr_t endaddr() const { return reinterpret_cast(endptr()); } + + // Returns this mapping as a StringPiece for ease of comparison. + // + // This function is named view in anticipation of the eventual replacement of + // StringPiece with std::string_view. + absl::string_view view() const { + return absl::string_view(static_cast(ptr_), len_); + } + + // These are both named reset for consistency with standard smart pointers. + + void reset(void* ptr, size_t len) { + if (len_) { + TEST_PCHECK(munmap(ptr_, len_) == 0); + } + ptr_ = ptr; + len_ = len; + } + + void reset() { reset(nullptr, 0); } + + void release() { + ptr_ = nullptr; + len_ = 0; + } + + private: + void* ptr_ = nullptr; + size_t len_ = 0; +}; + +// Wrapper around mmap(2) that returns a Mapping. 
+inline PosixErrorOr Mmap(void* addr, size_t length, int prot, + int flags, int fd, off_t offset) { + void* ptr = mmap(addr, length, prot, flags, fd, offset); + if (ptr == MAP_FAILED) { + return PosixError( + errno, absl::StrFormat("mmap(%p, %d, %x, %x, %d, %d)", addr, length, + prot, flags, fd, offset)); + } + MaybeSave(); + return Mapping(ptr, length); +} + +// Convenience wrapper around Mmap for anonymous mappings. +inline PosixErrorOr MmapAnon(size_t length, int prot, int flags) { + return Mmap(nullptr, length, prot, flags | MAP_ANONYMOUS, -1, 0); +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_MEMORY_UTIL_H_ diff --git a/test/util/mount_util.h b/test/util/mount_util.h new file mode 100644 index 000000000..468170646 --- /dev/null +++ b/test/util/mount_util.h @@ -0,0 +1,48 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_MOUNT_UTIL_H_ +#define GVISOR_TEST_UTIL_MOUNT_UTIL_H_ + +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "test/util/cleanup.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Mount mounts the filesystem, and unmounts when the returned reference is +// destroyed. +inline PosixErrorOr Mount(const std::string &source, const std::string &target, + const std::string &fstype, uint64_t mountflags, + const std::string &data, uint64_t umountflags) { + if (mount(source.c_str(), target.c_str(), fstype.c_str(), mountflags, + data.c_str()) == -1) { + return PosixError(errno, "mount failed"); + } + return Cleanup([target, umountflags]() { + EXPECT_THAT(umount2(target.c_str(), umountflags), SyscallSucceeds()); + }); +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_MOUNT_UTIL_H_ diff --git a/test/util/multiprocess_util.cc b/test/util/multiprocess_util.cc new file mode 100644 index 000000000..12637db8c --- /dev/null +++ b/test/util/multiprocess_util.cc @@ -0,0 +1,139 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
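As a usage note for the memory and mount helpers above: Mapping and the Cleanup returned by Mount both release their resource when they go out of scope, so tests normally just keep the returned object alive for the region of interest. A minimal sketch, illustrative only (it assumes a gtest body in the gvisor::testing namespace and that kPageSize is the page-size constant used elsewhere in these utilities):

// Illustrative sketch (not part of this change): the Mapping destructor
// munmaps the region, so no explicit cleanup is needed.
TEST(MemoryUtilExample, AnonymousMappingIsWritable) {
  const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
  static_cast<char*>(m.ptr())[0] = 'x';
  EXPECT_EQ(m.view()[0], 'x');
}

Mount follows the same pattern: the Cleanup it returns unmounts the target with umount2 when it is destroyed.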
+ +#include "test/util/multiprocess_util.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +PosixErrorOr ForkAndExec(const std::string& filename, + const ExecveArray& argv, + const ExecveArray& envv, + const std::function& fn, pid_t* child, + int* execve_errno) { + int pfds[2]; + int ret = pipe2(pfds, O_CLOEXEC); + if (ret < 0) { + return PosixError(errno, "pipe failed"); + } + FileDescriptor rfd(pfds[0]); + FileDescriptor wfd(pfds[1]); + + int parent_stdout = dup(STDOUT_FILENO); + if (parent_stdout < 0) { + return PosixError(errno, "dup stdout"); + } + int parent_stderr = dup(STDERR_FILENO); + if (parent_stdout < 0) { + return PosixError(errno, "dup stderr"); + } + + pid_t pid = fork(); + if (pid < 0) { + return PosixError(errno, "fork failed"); + } else if (pid == 0) { + // Child. + rfd.reset(); + if (dup2(parent_stdout, STDOUT_FILENO) < 0) { + _exit(3); + } + if (dup2(parent_stderr, STDERR_FILENO) < 0) { + _exit(4); + } + close(parent_stdout); + close(parent_stderr); + + // Clean ourself up in case the parent doesn't. + if (prctl(PR_SET_PDEATHSIG, SIGKILL)) { + _exit(3); + } + + if (fn) { + fn(); + } + + execve(filename.c_str(), argv.get(), envv.get()); + int error = errno; + if (WriteFd(pfds[1], &error, sizeof(error)) != sizeof(error)) { + // We can't do much if the write fails, but we can at least exit with a + // different code. + _exit(2); + } + _exit(1); + } + + // Parent. + if (child) { + *child = pid; + } + + auto cleanup = Cleanup([pid] { + kill(pid, SIGKILL); + RetryEINTR(waitpid)(pid, nullptr, 0); + }); + + wfd.reset(); + + int read_errno; + ret = ReadFd(rfd.get(), &read_errno, sizeof(read_errno)); + if (ret == 0) { + // Other end of the pipe closed, execve must have succeeded. + read_errno = 0; + } else if (ret < 0) { + return PosixError(errno, "read pipe failed"); + } else if (ret != sizeof(read_errno)) { + return PosixError(EPIPE, absl::StrCat("pipe read wrong size ", ret)); + } + + if (execve_errno) { + *execve_errno = read_errno; + } + + return std::move(cleanup); +} + +PosixErrorOr InForkedProcess(const std::function& fn) { + pid_t pid = fork(); + if (pid == 0) { + fn(); + _exit(0); + } + MaybeSave(); + if (pid < 0) { + return PosixError(errno, "fork failed"); + } + + int status; + if (waitpid(pid, &status, 0) < 0) { + return PosixError(errno, "waitpid failed"); + } + + return status; +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h new file mode 100644 index 000000000..c09d6167f --- /dev/null +++ b/test/util/multiprocess_util.h @@ -0,0 +1,113 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef GVISOR_TEST_UTIL_MULTIPROCESS_UTIL_H_ +#define GVISOR_TEST_UTIL_MULTIPROCESS_UTIL_H_ + +#include + +#include +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "test/util/cleanup.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +// Immutable holder for a dynamically-sized array of pointers to mutable char, +// terminated by a null pointer, as required for the argv and envp arguments to +// execve(2). +class ExecveArray { + public: + // Constructs an empty ExecveArray. + ExecveArray() = default; + + // Constructs an ExecveArray by copying strings from the given range. T must + // be a range over ranges of char. + template + explicit ExecveArray(T const& strs) : ExecveArray(strs.begin(), strs.end()) {} + + // Constructs an ExecveArray by copying strings from [first, last). InputIt + // must be an input iterator over a range over char. + template + ExecveArray(InputIt first, InputIt last) { + std::vector offsets; + auto output_it = std::back_inserter(str_); + for (InputIt it = first; it != last; ++it) { + offsets.push_back(str_.size()); + auto const& s = *it; + std::copy(s.begin(), s.end(), output_it); + str_.push_back('\0'); + } + ptrs_.reserve(offsets.size() + 1); + for (auto offset : offsets) { + ptrs_.push_back(str_.data() + offset); + } + ptrs_.push_back(nullptr); + } + + // Constructs an ExecveArray by copying strings from list. This overload must + // exist independently of the single-argument template constructor because + // std::initializer_list does not participate in template argument deduction + // (i.e. cannot be type-inferred in an invocation of the templated + // constructor). + /* implicit */ ExecveArray(std::initializer_list list) + : ExecveArray(list.begin(), list.end()) {} + + // Disable move construction and assignment since ptrs_ points into str_. + ExecveArray(ExecveArray&&) = delete; + ExecveArray& operator=(ExecveArray&&) = delete; + + char* const* get() const { return ptrs_.data(); } + + private: + std::vector str_; + std::vector ptrs_; +}; + +// Simplified version of SubProcess. Returns OK and a cleanup function to kill +// the child if it made it to execve. +// +// fn is run between fork and exec. If it needs to fail, it should exit the +// process. +// +// The child pid is returned via child, if provided. +// execve's error code is returned via execve_errno, if provided. +PosixErrorOr ForkAndExec(const std::string& filename, + const ExecveArray& argv, + const ExecveArray& envv, + const std::function& fn, pid_t* child, + int* execve_errno); + +inline PosixErrorOr ForkAndExec(const std::string& filename, + const ExecveArray& argv, + const ExecveArray& envv, pid_t* child, + int* execve_errno) { + return ForkAndExec(filename, argv, envv, [] {}, child, execve_errno); +} + +// Calls fn in a forked subprocess and returns the exit status of the +// subprocess. +// +// fn must be async-signal-safe. +PosixErrorOr InForkedProcess(const std::function& fn); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_MULTIPROCESS_UTIL_H_ diff --git a/test/util/posix_error.cc b/test/util/posix_error.cc new file mode 100644 index 000000000..9db72c6de --- /dev/null +++ b/test/util/posix_error.cc @@ -0,0 +1,93 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/posix_error.h" + +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" + +namespace gvisor { +namespace testing { + +std::string PosixError::ToString() const { + if (ok()) { + return "No Error"; + } + + std::string ret; + + char strerrno_buf[1024] = {}; + char* msg = nullptr; + if ((msg = strerror_r(errno_, strerrno_buf, sizeof(strerrno_buf))) == + nullptr) { + ret = absl::StrCat("PosixError(errno=", errno_, " strerror_r FAILED)"); + } else { + ret = absl::StrCat("PosixError(errno=", errno_, " ", msg, ")"); + } + + if (!msg_.empty()) { + ret.append(" "); + ret.append(msg_); + } + + return ret; +} + +::std::ostream& operator<<(::std::ostream& os, const PosixError& e) { + os << e.ToString(); + return os; +} + +void PosixErrorIsMatcherCommonImpl::DescribeTo(std::ostream* os) const { + *os << "has an errno value that "; + code_matcher_.DescribeTo(os); + *os << ", and has an error message that "; + message_matcher_.DescribeTo(os); +} + +void PosixErrorIsMatcherCommonImpl::DescribeNegationTo(std::ostream* os) const { + *os << "has an errno value that "; + code_matcher_.DescribeNegationTo(os); + *os << ", or has an error message that "; + message_matcher_.DescribeNegationTo(os); +} + +bool PosixErrorIsMatcherCommonImpl::MatchAndExplain( + const PosixError& error, + ::testing::MatchResultListener* result_listener) const { + ::testing::StringMatchResultListener inner_listener; + + inner_listener.Clear(); + if (!code_matcher_.MatchAndExplain(error.errno_value(), &inner_listener)) { + *result_listener << (inner_listener.str().empty() + ? "whose errno value is wrong" + : "which has a errno value " + + inner_listener.str()); + return false; + } + + if (!message_matcher_.Matches(error.error_message())) { + *result_listener << "whose error message is wrong"; + return false; + } + + return true; +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/posix_error.h b/test/util/posix_error.h new file mode 100644 index 000000000..8450be9b9 --- /dev/null +++ b/test/util/posix_error.h @@ -0,0 +1,428 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
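The PosixError/PosixErrorOr pair implemented above (and declared in the header that follows) is the error-handling vocabulary for all of these utilities: fallible helpers return PosixErrorOr, and callers either branch on ok() or lean on the gtest matchers. A small sketch, illustrative only (ParsePort and the test name are invented for the example; cstdlib and the usual test includes in the gvisor::testing namespace are assumed):

// Illustrative sketch (not part of this change): a fallible helper and two
// ways a caller can consume the result.
PosixErrorOr<int> ParsePort(const std::string& s) {
  if (s.empty()) {
    return PosixError(EINVAL, "empty port string");
  }
  return atoi(s.c_str());  // Validation of non-numeric input omitted.
}

TEST(PosixErrorExample, OkAndErrorPaths) {
  // Matcher style, as used by the fs_util tests above.
  EXPECT_THAT(ParsePort("8080"), IsPosixErrorOkAndHolds(8080));

  // Explicit style.
  const PosixErrorOr<int> bad = ParsePort("");
  EXPECT_FALSE(bad.ok());
  EXPECT_EQ(bad.error_message(), "empty port string");
}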
+ +#ifndef GVISOR_TEST_UTIL_POSIX_ERROR_H_ +#define GVISOR_TEST_UTIL_POSIX_ERROR_H_ + +#include + +#include "gmock/gmock.h" +#include "absl/base/attributes.h" +#include "absl/strings/string_view.h" +#include "absl/types/variant.h" +#include "test/util/logging.h" + +namespace gvisor { +namespace testing { + +class PosixErrorIsMatcherCommonImpl; + +template +class PosixErrorOr; + +class ABSL_MUST_USE_RESULT PosixError { + public: + PosixError() {} + explicit PosixError(int errno_value) : errno_(errno_value) {} + PosixError(int errno_value, std::string msg) + : errno_(errno_value), msg_(std::move(msg)) {} + + PosixError(PosixError&& other) = default; + PosixError& operator=(PosixError&& other) = default; + PosixError(const PosixError&) = default; + PosixError& operator=(const PosixError&) = default; + + bool ok() const { return errno_ == 0; } + + // Returns a reference to *this to make matchers compatible with + // PosixErrorOr. + const PosixError& error() const { return *this; } + + std::string error_message() const { return msg_; } + + // ToString produces a full std::string representation of this posix error + // including the printable representation of the errno and the error message. + std::string ToString() const; + + // Ignores any errors. This method does nothing except potentially suppress + // complaints from any tools that are checking that errors are not dropped on + // the floor. + void IgnoreError() const {} + + private: + int errno_value() const { return errno_; } + int errno_ = 0; + std::string msg_; + + friend class PosixErrorIsMatcherCommonImpl; + + template + friend class PosixErrorOr; +}; + +template +class ABSL_MUST_USE_RESULT PosixErrorOr { + public: + PosixErrorOr(const PosixError& error); // NOLINT + explicit PosixErrorOr(const T& value); + PosixErrorOr(T&& value); // NOLINT + + PosixErrorOr(PosixErrorOr&& other) = default; + PosixErrorOr& operator=(PosixErrorOr&& other) = default; + PosixErrorOr(const PosixErrorOr&) = default; + PosixErrorOr& operator=(const PosixErrorOr&) = default; + + // Conversion copy/move constructor, T must be convertible from U. + template + friend class PosixErrorOr; + + template + PosixErrorOr(PosixErrorOr other); + + template + PosixErrorOr& operator=(PosixErrorOr other); + + // Return a reference to the error or NoError(). + const PosixError error() const; + + // Returns this->error().error_message(); + const std::string error_message() const; + + // Returns this->error().ok() + bool ok() const; + + // Returns a reference to our current value, or CHECK-fails if !this->ok(). + const T& ValueOrDie() const; + T& ValueOrDie(); + + // Ignores any errors. This method does nothing except potentially suppress + // complaints from any tools that are checking that errors are not dropped on + // the floor. + void IgnoreError() const {} + + private: + const int errno_value() const; + absl::variant value_; + + friend class PosixErrorIsMatcherCommonImpl; +}; + +template +PosixErrorOr::PosixErrorOr(const PosixError& error) : value_(error) {} + +template +PosixErrorOr::PosixErrorOr(const T& value) : value_(value) {} + +template +PosixErrorOr::PosixErrorOr(T&& value) : value_(std::move(value)) {} + +// Conversion copy/move constructor, T must be convertible from U. +template +template +inline PosixErrorOr::PosixErrorOr(PosixErrorOr other) { + if (absl::holds_alternative(other.value_)) { + // T is convertible from U. 
+ value_ = absl::get(std::move(other.value_)); + } else if (absl::holds_alternative(other.value_)) { + value_ = absl::get(std::move(other.value_)); + } else { + TEST_CHECK_MSG(false, "PosixErrorOr does not contain PosixError or value"); + } +} + +template +template +inline PosixErrorOr& PosixErrorOr::operator=(PosixErrorOr other) { + if (absl::holds_alternative(other.value_)) { + // T is convertible from U. + value_ = absl::get(std::move(other.value_)); + } else if (absl::holds_alternative(other.value_)) { + value_ = absl::get(std::move(other.value_)); + } else { + TEST_CHECK_MSG(false, "PosixErrorOr does not contain PosixError or value"); + } + return *this; +} + +template +const PosixError PosixErrorOr::error() const { + if (!absl::holds_alternative(value_)) { + return PosixError(); + } + return absl::get(value_); +} + +template +const int PosixErrorOr::errno_value() const { + return error().errno_value(); +} + +template +const std::string PosixErrorOr::error_message() const { + return error().error_message(); +} + +template +bool PosixErrorOr::ok() const { + return error().ok(); +} + +template +const T& PosixErrorOr::ValueOrDie() const { + TEST_CHECK(absl::holds_alternative(value_)); + return absl::get(value_); +} + +template +T& PosixErrorOr::ValueOrDie() { + TEST_CHECK(absl::holds_alternative(value_)); + return absl::get(value_); +} + +extern ::std::ostream& operator<<(::std::ostream& os, const PosixError& e); + +template +::std::ostream& operator<<(::std::ostream& os, const PosixErrorOr& e) { + os << e.error(); + return os; +} + +// NoError is a PosixError that represents a successful state, i.e. No Error. +inline PosixError NoError() { return PosixError(); } + +// Monomorphic implementation of matcher IsPosixErrorOk() for a given type T. +// T can be PosixError, PosixErrorOr<>, or a reference to either of them. +template +class MonoPosixErrorIsOkMatcherImpl : public ::testing::MatcherInterface { + public: + void DescribeTo(std::ostream* os) const override { *os << "is OK"; } + void DescribeNegationTo(std::ostream* os) const override { + *os << "is not OK"; + } + bool MatchAndExplain(T actual_value, + ::testing::MatchResultListener*) const override { + return actual_value.ok(); + } +}; + +// Implements IsPosixErrorOkMatcher() as a polymorphic matcher. +class IsPosixErrorOkMatcher { + public: + template + operator ::testing::Matcher() const { // NOLINT + return MakeMatcher(new MonoPosixErrorIsOkMatcherImpl()); + } +}; + +// Monomorphic implementation of a matcher for a PosixErrorOr. 
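A short sketch of the PosixErrorOr contract declared above: a function returns either a value or a PosixError, and callers branch on ok() before calling ValueOrDie(). The ParseCount helper and its input are hypothetical.

#include <errno.h>
#include <stdlib.h>

#include <string>

#include "test/util/posix_error.h"

namespace gvisor {
namespace testing {

// Hypothetical parser used only to illustrate the PosixErrorOr contract.
PosixErrorOr<int> ParseCount(const std::string& s) {
  if (s.empty()) {
    return PosixError(EINVAL, "empty input");  // Becomes the error alternative.
  }
  return atoi(s.c_str());  // Becomes the value alternative.
}

void Consume() {
  auto result = ParseCount("3");
  if (!result.ok()) {
    // result.error() carries the errno value and message.
    return;
  }
  int count = result.ValueOrDie();  // TEST_CHECK-fails if !ok().
  (void)count;
}

}  // namespace testing
}  // namespace gvisor

(The monomorphic matcher implementation announced in the comment above follows.)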
+template +class IsPosixErrorOkAndHoldsMatcherImpl + : public ::testing::MatcherInterface { + public: + using ValueType = typename std::remove_reference().ValueOrDie())>::type; + + template + explicit IsPosixErrorOkAndHoldsMatcherImpl(InnerMatcher&& inner_matcher) + : inner_matcher_(::testing::SafeMatcherCast( + std::forward(inner_matcher))) {} + + void DescribeTo(std::ostream* os) const override { + *os << "is OK and has a value that "; + inner_matcher_.DescribeTo(os); + } + + void DescribeNegationTo(std::ostream* os) const override { + *os << "isn't OK or has a value that "; + inner_matcher_.DescribeNegationTo(os); + } + + bool MatchAndExplain( + PosixErrorOrType actual_value, + ::testing::MatchResultListener* listener) const override { + if (!actual_value.ok()) { + *listener << "which has error value " << actual_value.error(); + return false; + } + + ::testing::StringMatchResultListener inner_listener; + const bool matches = inner_matcher_.MatchAndExplain( + actual_value.ValueOrDie(), &inner_listener); + const std::string inner_explanation = inner_listener.str(); + if (!inner_explanation.empty()) { + *listener << "which contains value " + << ::testing::PrintToString(actual_value.ValueOrDie()) << ", " + << inner_explanation; + } + return matches; + } + + private: + const ::testing::Matcher inner_matcher_; +}; + +// Implements IsOkAndHolds() as a polymorphic matcher. +template +class IsPosixErrorOkAndHoldsMatcher { + public: + explicit IsPosixErrorOkAndHoldsMatcher(InnerMatcher inner_matcher) + : inner_matcher_(std::move(inner_matcher)) {} + + // Converts this polymorphic matcher to a monomorphic one of the given type. + // PosixErrorOrType can be either PosixErrorOr or a reference to + // PosixErrorOr. + template + operator ::testing::Matcher() const { // NOLINT + return ::testing::MakeMatcher( + new IsPosixErrorOkAndHoldsMatcherImpl( + inner_matcher_)); + } + + private: + const InnerMatcher inner_matcher_; +}; + +// PosixErrorIs() is a polymorphic matcher. This class is the common +// implementation of it shared by all types T where PosixErrorIs() can be +// used as a Matcher. +class PosixErrorIsMatcherCommonImpl { + public: + PosixErrorIsMatcherCommonImpl( + ::testing::Matcher code_matcher, + ::testing::Matcher message_matcher) + : code_matcher_(std::move(code_matcher)), + message_matcher_(std::move(message_matcher)) {} + + void DescribeTo(std::ostream* os) const; + + void DescribeNegationTo(std::ostream* os) const; + + bool MatchAndExplain(const PosixError& error, + ::testing::MatchResultListener* result_listener) const; + + private: + const ::testing::Matcher code_matcher_; + const ::testing::Matcher message_matcher_; +}; + +// Monomorphic implementation of matcher PosixErrorIs() for a given type +// T. T can be PosixError, PosixErrorOr<>, or a reference to either of them. 
+template +class MonoPosixErrorIsMatcherImpl : public ::testing::MatcherInterface { + public: + explicit MonoPosixErrorIsMatcherImpl( + PosixErrorIsMatcherCommonImpl common_impl) + : common_impl_(std::move(common_impl)) {} + + void DescribeTo(std::ostream* os) const override { + common_impl_.DescribeTo(os); + } + + void DescribeNegationTo(std::ostream* os) const override { + common_impl_.DescribeNegationTo(os); + } + + bool MatchAndExplain( + T actual_value, + ::testing::MatchResultListener* result_listener) const override { + return common_impl_.MatchAndExplain(actual_value.error(), result_listener); + } + + private: + PosixErrorIsMatcherCommonImpl common_impl_; +}; + +inline ::testing::Matcher ToErrorCodeMatcher( + const ::testing::Matcher& m) { + return m; +} + +// Implements PosixErrorIs() as a polymorphic matcher. +class PosixErrorIsMatcher { + public: + template + PosixErrorIsMatcher(ErrorCodeMatcher&& code_matcher, + ::testing::Matcher message_matcher) + : common_impl_( + ToErrorCodeMatcher(std::forward(code_matcher)), + std::move(message_matcher)) {} + + // Converts this polymorphic matcher to a monomorphic matcher of the + // given type. T can be StatusOr<>, Status, or a reference to + // either of them. + template + operator ::testing::Matcher() const { // NOLINT + return MakeMatcher(new MonoPosixErrorIsMatcherImpl(common_impl_)); + } + + private: + const PosixErrorIsMatcherCommonImpl common_impl_; +}; + +// Returns a gMock matcher that matches a PosixError or PosixErrorOr<> whose +// whose error code matches code_matcher, and whose error message matches +// message_matcher. +template +PosixErrorIsMatcher PosixErrorIs( + ErrorCodeMatcher&& code_matcher, + ::testing::Matcher message_matcher) { + return PosixErrorIsMatcher(std::forward(code_matcher), + std::move(message_matcher)); +} + +// Returns a gMock matcher that matches a PosixErrorOr<> which is ok() and +// value matches the inner matcher. +template +IsPosixErrorOkAndHoldsMatcher::type> +IsPosixErrorOkAndHolds(InnerMatcher&& inner_matcher) { + return IsPosixErrorOkAndHoldsMatcher::type>( + std::forward(inner_matcher)); +} + +// Internal helper for concatenating macro values. 
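A sketch of how the matcher factories defined above read inside a test; the EAGAIN error and the held value 42 are illustrative.

#include <errno.h>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "test/util/posix_error.h"

namespace gvisor {
namespace testing {
namespace {

TEST(PosixErrorMatchersExample, Sketch) {
  // A failed PosixErrorOr matches on its errno value (and, optionally, on its
  // error message).
  PosixErrorOr<int> failed = PosixError(EAGAIN, "try again");
  EXPECT_THAT(failed, PosixErrorIs(EAGAIN, ::testing::_));

  // A successful PosixErrorOr matches on the held value.
  PosixErrorOr<int> ok_value(42);
  EXPECT_THAT(ok_value, IsPosixErrorOkAndHolds(42));
}

}  // namespace
}  // namespace testing
}  // namespace gvisor

(The macro helpers named in the comment above are defined next.)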
+#define POSIX_ERROR_IMPL_CONCAT_INNER_(x, y) x##y +#define POSIX_ERROR_IMPL_CONCAT_(x, y) POSIX_ERROR_IMPL_CONCAT_INNER_(x, y) + +#define POSIX_ERROR_IMPL_ASSIGN_OR_RETURN_(posixerroror, lhs, rexpr) \ + auto posixerroror = (rexpr); \ + if (!posixerroror.ok()) { \ + return (posixerroror.error()); \ + } \ + lhs = std::move(posixerroror.ValueOrDie()) + +#define EXPECT_NO_ERRNO(expression) \ + EXPECT_THAT(expression, IsPosixErrorOkMatcher()) +#define ASSERT_NO_ERRNO(expression) \ + ASSERT_THAT(expression, IsPosixErrorOkMatcher()) + +#define ASSIGN_OR_RETURN_ERRNO(lhs, rexpr) \ + POSIX_ERROR_IMPL_ASSIGN_OR_RETURN_( \ + POSIX_ERROR_IMPL_CONCAT_(_status_or_value, __LINE__), lhs, rexpr) + +#define RETURN_IF_ERRNO(s) \ + do { \ + if (!s.ok()) return s; \ + } while (false); + +#define ASSERT_NO_ERRNO_AND_VALUE(expr) \ + ({ \ + auto _expr_result = (expr); \ + ASSERT_NO_ERRNO(_expr_result); \ + std::move(_expr_result.ValueOrDie()); \ + }) + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_POSIX_ERROR_H_ diff --git a/test/util/posix_error_test.cc b/test/util/posix_error_test.cc new file mode 100644 index 000000000..535b9f66a --- /dev/null +++ b/test/util/posix_error_test.cc @@ -0,0 +1,45 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/posix_error.h" + +#include +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(PosixErrorTest, PosixError) { + auto err = PosixError(EAGAIN); + EXPECT_THAT(err, PosixErrorIs(EAGAIN, "")); +} + +TEST(PosixErrorTest, PosixErrorOrPosixError) { + auto err = PosixErrorOr(PosixError(EAGAIN)); + EXPECT_THAT(err, PosixErrorIs(EAGAIN, "")); +} + +TEST(PosixErrorTest, PosixErrorOrNullptr) { + auto err = PosixErrorOr(nullptr); + EXPECT_THAT(err, PosixErrorIs(0, "")); + EXPECT_NO_ERRNO(err); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/util/proc_util.cc b/test/util/proc_util.cc new file mode 100644 index 000000000..72f7e67d0 --- /dev/null +++ b/test/util/proc_util.cc @@ -0,0 +1,98 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
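Before test/util/proc_util.cc, a sketch of how the error-propagation macros defined above compose; the FileMode and CheckIsRegular helpers are hypothetical. ASSIGN_OR_RETURN_ERRNO unwraps a PosixErrorOr into the left-hand side or returns the contained error from the enclosing function.

#include <errno.h>
#include <sys/stat.h>

#include <string>

#include "test/util/posix_error.h"

namespace gvisor {
namespace testing {

// Hypothetical helper: stats a path and returns its mode or a PosixError.
PosixErrorOr<mode_t> FileMode(const std::string& path) {
  struct stat st = {};
  if (stat(path.c_str(), &st) < 0) {
    return PosixError(errno, "stat failed");
  }
  mode_t mode = st.st_mode;
  return mode;  // Local variable: moves into the value alternative.
}

// Hypothetical caller: any error from FileMode() is returned unchanged.
PosixError CheckIsRegular(const std::string& path) {
  ASSIGN_OR_RETURN_ERRNO(mode_t mode, FileMode(path));
  if (!S_ISREG(mode)) {
    return PosixError(EINVAL, "not a regular file");
  }
  return NoError();
}

}  // namespace testing
}  // namespace gvisor

In a test body the same result can be unwrapped directly with ASSERT_NO_ERRNO_AND_VALUE(FileMode(path)), which fails the test instead of returning an error.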
+ +#include "test/util/proc_util.h" + +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "test/util/fs_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Parses a single line from /proc//maps. +PosixErrorOr ParseProcMapsLine(absl::string_view line) { + ProcMapsEntry map_entry = {}; + std::vector parts = absl::StrSplit(line, ' ', absl::SkipEmpty()); + + // A size of 5 means there is no file name specified. + if (parts.size() != 5 && parts.size() != 6) { + return PosixError(EINVAL, absl::StrCat("Invalid line: ", line)); + } + + // Address range in the form X-X where X are hex values without leading 0x. + std::vector addresses = absl::StrSplit(parts[0], '-'); + if (addresses.size() != 2) { + return PosixError(EINVAL, + absl::StrCat("Invalid address range: ", parts[0])); + } + ASSIGN_OR_RETURN_ERRNO(map_entry.start, AtoiBase(addresses[0], 16)); + ASSIGN_OR_RETURN_ERRNO(map_entry.end, AtoiBase(addresses[1], 16)); + + // Permissions are four bytes of the form rwxp or - if permission not set. + if (parts[1].size() != 4) { + return PosixError(EINVAL, + absl::StrCat("Invalid permission field: ", parts[1])); + } + + map_entry.readable = parts[1][0] == 'r'; + map_entry.writable = parts[1][1] == 'w'; + map_entry.executable = parts[1][2] == 'x'; + map_entry.priv = parts[1][3] == 'p'; + + ASSIGN_OR_RETURN_ERRNO(map_entry.offset, AtoiBase(parts[2], 16)); + + std::vector device = absl::StrSplit(parts[3], ':'); + if (device.size() != 2) { + return PosixError(EINVAL, absl::StrCat("Invalid device: ", parts[3])); + } + ASSIGN_OR_RETURN_ERRNO(map_entry.major, AtoiBase(device[0], 16)); + ASSIGN_OR_RETURN_ERRNO(map_entry.minor, AtoiBase(device[1], 16)); + + ASSIGN_OR_RETURN_ERRNO(map_entry.inode, Atoi(parts[4])); + if (parts.size() == 6) { + // A filename is present. + map_entry.filename = parts[5]; + } + + return map_entry; +} + +PosixErrorOr> ParseProcMaps( + absl::string_view contents) { + std::vector entries; + auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty()); + for (const auto& l : lines) { + LOG(INFO) << "line: " << l; + ASSIGN_OR_RETURN_ERRNO(auto entry, ParseProcMapsLine(l)); + entries.push_back(entry); + } + return entries; +} + +PosixErrorOr IsVsyscallEnabled() { + ASSIGN_OR_RETURN_ERRNO(auto contents, GetContents("/proc/self/maps")); + ASSIGN_OR_RETURN_ERRNO(auto maps, ParseProcMaps(contents)); + return std::any_of(maps.begin(), maps.end(), [](const ProcMapsEntry& e) { + return e.filename == "[vsyscall]"; + }); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/proc_util.h b/test/util/proc_util.h new file mode 100644 index 000000000..f8021d92e --- /dev/null +++ b/test/util/proc_util.h @@ -0,0 +1,150 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
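A sketch of the parser above applied to a representative /proc/<pid>/maps line; the addresses, device numbers, and filename are illustrative.

#include "gtest/gtest.h"
#include "test/util/proc_util.h"

namespace gvisor {
namespace testing {
namespace {

TEST(ProcMapsParserExample, Sketch) {
  // Field layout: address range, permissions, offset, device, inode, filename.
  auto entry = ASSERT_NO_ERRNO_AND_VALUE(ParseProcMapsLine(
      "00400000-0040b000 r-xp 00000000 fd:01 149674    /bin/cat"));
  EXPECT_EQ(entry.start, 0x00400000u);
  EXPECT_EQ(entry.end, 0x0040b000u);
  EXPECT_TRUE(entry.readable);
  EXPECT_FALSE(entry.writable);
  EXPECT_TRUE(entry.executable);
  EXPECT_TRUE(entry.priv);
  EXPECT_EQ(entry.filename, "/bin/cat");
}

}  // namespace
}  // namespace testing
}  // namespace gvisor

(The matching header, test/util/proc_util.h, follows.)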
+ +#ifndef GVISOR_TEST_UTIL_PROC_UTIL_H_ +#define GVISOR_TEST_UTIL_PROC_UTIL_H_ + +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +// ProcMapsEntry contains the data from a single line in /proc//maps. +struct ProcMapsEntry { + uint64_t start; + uint64_t end; + bool readable; + bool writable; + bool executable; + bool priv; + uint64_t offset; + int major; + int minor; + int64_t inode; + std::string filename; +}; + +// Parses a ProcMaps line or returns an error. +PosixErrorOr ParseProcMapsLine(absl::string_view line); +PosixErrorOr> ParseProcMaps( + absl::string_view contents); + +// Returns true if vsyscall (emmulation or not) is enabled. +PosixErrorOr IsVsyscallEnabled(); + +// Printer for ProcMapsEntry. +inline std::ostream& operator<<(std::ostream& os, const ProcMapsEntry& entry) { + std::string str = + absl::StrCat(absl::Hex(entry.start, absl::PadSpec::kZeroPad8), "-", + absl::Hex(entry.end, absl::PadSpec::kZeroPad8), " "); + + absl::StrAppend(&str, entry.readable ? "r" : "-"); + absl::StrAppend(&str, entry.writable ? "w" : "-"); + absl::StrAppend(&str, entry.executable ? "x" : "-"); + absl::StrAppend(&str, entry.priv ? "p" : "s"); + + absl::StrAppend(&str, " ", absl::Hex(entry.offset, absl::PadSpec::kZeroPad8), + " ", absl::Hex(entry.major, absl::PadSpec::kZeroPad2), ":", + absl::Hex(entry.minor, absl::PadSpec::kZeroPad2), " ", + entry.inode); + if (absl::string_view(entry.filename) != "") { + // Pad to column 74 + int pad = 73 - str.length(); + if (pad > 0) { + absl::StrAppend(&str, std::string(pad, ' ')); + } + absl::StrAppend(&str, entry.filename); + } + os << str; + return os; +} + +// Printer for std::vector. +inline std::ostream& operator<<(std::ostream& os, + const std::vector& vec) { + for (unsigned int i = 0; i < vec.size(); i++) { + os << vec[i]; + if (i != vec.size() - 1) { + os << "\n"; + } + } + return os; +} + +// GMock printer for std::vector. +inline void PrintTo(const std::vector& vec, std::ostream* os) { + *os << vec; +} + +// Checks that /proc/pid/maps contains all of the passed mappings. +// +// The major, minor, and inode fields are ignored. +MATCHER_P(ContainsMappings, mappings, + "contains mappings:\n" + ::testing::PrintToString(mappings)) { + auto contents_or = GetContents(absl::StrCat("/proc/", arg, "/maps")); + if (!contents_or.ok()) { + *result_listener << "Unable to read mappings: " + << contents_or.error().ToString(); + return false; + } + + auto maps_or = ParseProcMaps(contents_or.ValueOrDie()); + if (!maps_or.ok()) { + *result_listener << "Unable to parse mappings: " + << maps_or.error().ToString(); + return false; + } + + auto maps = std::move(maps_or.ValueOrDie()); + + // Does maps contain all elements in mappings? The comparator ignores + // the major, minor, and inode fields. + bool all_present = true; + std::for_each(mappings.begin(), mappings.end(), [&](const ProcMapsEntry& e1) { + auto it = + std::find_if(maps.begin(), maps.end(), [&e1](const ProcMapsEntry& e2) { + return e1.start == e2.start && e1.end == e2.end && + e1.readable == e2.readable && e1.writable == e2.writable && + e1.executable == e2.executable && e1.priv == e2.priv && + e1.offset == e2.offset && e1.filename == e2.filename; + }); + if (it == maps.end()) { + // It wasn't found. 
+ if (all_present) { + // We will output the message once and then a line for each mapping + // that wasn't found. + all_present = false; + *result_listener << "Got mappings:\n" + << maps << "\nThat were missing:\n"; + } + *result_listener << e1 << "\n"; + } + }); + + return all_present; +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_PROC_UTIL_H_ diff --git a/test/util/save_util.cc b/test/util/save_util.cc new file mode 100644 index 000000000..71f4078a7 --- /dev/null +++ b/test/util/save_util.cc @@ -0,0 +1,59 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/save_util.h" + +#include +#include +#include +#include +#include +#include + +#define GVISOR_COOPERATIVE_SAVE_TEST "GVISOR_COOPERATIVE_SAVE_TEST" + +namespace gvisor { +namespace testing { +namespace { + +bool CooperativeSaveEnabled() { + static bool enabled = getenv(GVISOR_COOPERATIVE_SAVE_TEST) != nullptr; + return enabled; +} + +std::atomic save_disable; + +} // namespace + +DisableSave::DisableSave() { save_disable++; } + +DisableSave::~DisableSave() { reset(); } + +void DisableSave::reset() { + if (!reset_) { + reset_ = true; + save_disable--; + } +} + +void MaybeSave() { + if (CooperativeSaveEnabled() && !save_disable.load()) { + int orig_errno = errno; + syscall(SYS_create_module, nullptr, 0); + errno = orig_errno; + } +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/save_util.h b/test/util/save_util.h new file mode 100644 index 000000000..919e4af3d --- /dev/null +++ b/test/util/save_util.h @@ -0,0 +1,47 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_SAVE_UTIL_H_ +#define GVISOR_TEST_UTIL_SAVE_UTIL_H_ + +namespace gvisor { +namespace testing { +// Disable save prevents saving while the given function executes. +// +// This lasts the duration of the object, unless reset is called. +class DisableSave { + public: + DisableSave(); + ~DisableSave(); + DisableSave(DisableSave const&) = delete; + DisableSave(DisableSave&&) = delete; + DisableSave& operator=(DisableSave const&) = delete; + DisableSave& operator=(DisableSave&&) = delete; + + // reset allows saves to continue, and is called implicitly by the destructor. + // It may be called multiple times safely, but is not thread-safe. + void reset(); + + private: + bool reset_ = false; +}; + +// May perform a co-operative save cycle. +// +// errno is guaranteed to be preserved. 
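A sketch of the intended DisableSave pattern from the class above; the CriticalSection function is hypothetical.

#include "test/util/save_util.h"

namespace gvisor {
namespace testing {

// Illustrative only: a region of a test that cannot tolerate a save/restore
// cycle is bracketed by a DisableSave instance.
void CriticalSection() {
  DisableSave ds;  // Increments the global disable count.
  // ... state that must not be interrupted by a cooperative save ...
  ds.reset();      // Re-enables saves early; also runs from the destructor.

  MaybeSave();     // May now trigger a cooperative save cycle again.
}

}  // namespace testing
}  // namespace gvisor

(The MaybeSave declaration described in the comment above follows.)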
+void MaybeSave(); +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_SAVE_UTIL_H_ diff --git a/test/util/signal_util.cc b/test/util/signal_util.cc new file mode 100644 index 000000000..3e2df32a6 --- /dev/null +++ b/test/util/signal_util.cc @@ -0,0 +1,103 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/signal_util.h" + +#include +#include + +#include "gtest/gtest.h" +#include "test/util/cleanup.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace { + +struct Range { + int start; + int end; +}; + +// Format a Range as "start-end" or "start" for single value Ranges. +static ::std::ostream& operator<<(::std::ostream& os, const Range& range) { + if (range.end > range.start) { + return os << range.start << '-' << range.end; + } + + return os << range.start; +} + +} // namespace + +// Format a sigset_t as a comma separated list of numeric ranges. +// Empty sigset: [] +// Full sigset: [1-31,34-64] +::std::ostream& operator<<(::std::ostream& os, const sigset_t& sigset) { + const char* delim = ""; + Range range = {0, 0}; + + os << '['; + + for (int sig = 1; sig <= gvisor::testing::kMaxSignal; ++sig) { + if (sigismember(&sigset, sig)) { + if (range.start) { + range.end = sig; + } else { + range.start = sig; + range.end = sig; + } + } else if (range.start) { + os << delim << range; + delim = ","; + range.start = 0; + range.end = 0; + } + } + + if (range.start) { + os << delim << range; + } + + return os << ']'; +} + +namespace gvisor { +namespace testing { + +PosixErrorOr ScopedSigaction(int sig, struct sigaction const& sa) { + struct sigaction old_sa; + int rc = sigaction(sig, &sa, &old_sa); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "sigaction failed"); + } + return Cleanup([sig, old_sa] { + EXPECT_THAT(sigaction(sig, &old_sa, nullptr), SyscallSucceeds()); + }); +} + +PosixErrorOr ScopedSignalMask(int how, sigset_t const& set) { + sigset_t old; + int rc = sigprocmask(how, &set, &old); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "sigprocmask failed"); + } + return Cleanup([old] { + EXPECT_THAT(sigprocmask(SIG_SETMASK, &old, nullptr), SyscallSucceeds()); + }); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/signal_util.h b/test/util/signal_util.h new file mode 100644 index 000000000..f58f4c6c4 --- /dev/null +++ b/test/util/signal_util.h @@ -0,0 +1,92 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_SIGNAL_UTIL_H_ +#define GVISOR_TEST_UTIL_SIGNAL_UTIL_H_ + +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "test/util/cleanup.h" +#include "test/util/posix_error.h" + +// Format a sigset_t as a comma separated list of numeric ranges. +::std::ostream& operator<<(::std::ostream& os, const sigset_t& sigset); + +namespace gvisor { +namespace testing { + +// The maximum signal number. +static constexpr int kMaxSignal = 64; + +// Wrapper for the tgkill(2) syscall, which glibc does not provide. +inline int tgkill(pid_t tgid, pid_t tid, int sig) { + return syscall(__NR_tgkill, tgid, tid, sig); +} + +// Installs the passed sigaction and returns a cleanup function to restore the +// previous handler when it goes out of scope. +PosixErrorOr ScopedSigaction(int sig, struct sigaction const& sa); + +// Updates the signal mask as per sigprocmask(2) and returns a cleanup function +// to restore the previous signal mask when it goes out of scope. +PosixErrorOr ScopedSignalMask(int how, sigset_t const& set); + +// ScopedSignalMask variant that creates a mask of the single signal 'sig'. +inline PosixErrorOr ScopedSignalMask(int how, int sig) { + sigset_t set; + sigemptyset(&set); + sigaddset(&set, sig); + return ScopedSignalMask(how, set); +} + +// Asserts equality of two sigset_t values. +MATCHER_P(EqualsSigset, value, "equals " + ::testing::PrintToString(value)) { + for (int sig = 1; sig <= kMaxSignal; ++sig) { + if (sigismember(&arg, sig) != sigismember(&value, sig)) { + return false; + } + } + return true; +} + +#ifdef __x86_64__ +// Fault can be used to generate a synchronous SIGSEGV. +// +// This fault can be fixed up in a handler via fixup, below. +inline void Fault() { + // Zero and dereference %ax. + asm("movabs $0, %%rax\r\n" + "mov 0(%%rax), %%rax\r\n" + : + : + : "ax"); +} + +// FixupFault fixes up a fault generated by fault, above. +inline void FixupFault(ucontext* ctx) { + // Skip the bad instruction above. + // + // The encoding is 0x48 0xab 0x00. + ctx->uc_mcontext.gregs[REG_RIP] += 3; +} +#endif + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_SIGNAL_UTIL_H_ diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc new file mode 100644 index 000000000..e45909655 --- /dev/null +++ b/test/util/temp_path.cc @@ -0,0 +1,157 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/temp_path.h" + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +std::atomic global_temp_file_number = ATOMIC_VAR_INIT(1); + +// Return a new temp filename, intended to be unique system-wide. 
+// +// The global file number helps maintain file naming consistency across +// different runs of a test. +// +// The timestamp is necessary because the test infrastructure invokes each +// test case in a separate process (resetting global_temp_file_number) and +// potentially in parallel, which allows for races between selecting and using a +// name. +std::string NextTempBasename() { + return absl::StrCat("gvisor_test_temp_", global_temp_file_number++, "_", + absl::ToUnixNanos(absl::Now())); +} + +void TryDeleteRecursively(std::string const& path) { + if (!path.empty()) { + int undeleted_dirs = 0; + int undeleted_files = 0; + auto status = RecursivelyDelete(path, &undeleted_dirs, &undeleted_files); + if (undeleted_dirs || undeleted_files || !status.ok()) { + LOG(WARNING) << path << ": failed to delete " << undeleted_dirs + << " directories and " << undeleted_files + << " files: " << status; + } + } +} + +} // namespace + +constexpr mode_t TempPath::kDefaultFileMode; +constexpr mode_t TempPath::kDefaultDirMode; + +std::string NewTempAbsPathInDir(absl::string_view const dir) { + return JoinPath(dir, NextTempBasename()); +} + +std::string NewTempAbsPath() { return NewTempAbsPathInDir(GetAbsoluteTestTmpdir()); } + +std::string NewTempRelPath() { return NextTempBasename(); } + +std::string GetAbsoluteTestTmpdir() { + char* env_tmpdir = getenv("TEST_TMPDIR"); + std::string tmp_dir = env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp"; + return MakeAbsolute(tmp_dir, "").ValueOrDie(); +} + +PosixErrorOr TempPath::CreateFileWith(absl::string_view const parent, + absl::string_view const content, + mode_t const mode) { + return CreateIn(parent, [=](absl::string_view path) -> PosixError { + // SetContents will call open(O_WRONLY) with the given mode. If the + // mode is not user-writable, save/restore cannot preserve the fd. Hence + // the little permission dance that's done here. 
+ auto res = CreateWithContents(path, content, mode | 0200); + RETURN_IF_ERRNO(res); + + return Chmod(path, mode); + }); +} + +PosixErrorOr TempPath::CreateDirWith(absl::string_view const parent, + mode_t const mode) { + return CreateIn(parent, + [=](absl::string_view path) { return Mkdir(path, mode); }); +} + +PosixErrorOr TempPath::CreateSymlinkTo(absl::string_view const parent, + std::string const& dest) { + return CreateIn(parent, [=](absl::string_view path) { + int ret = symlink(dest.c_str(), std::string(path).c_str()); + if (ret != 0) { + return PosixError(errno, "symlink failed"); + } + return NoError(); + }); +} + +PosixErrorOr TempPath::CreateFileIn(absl::string_view const parent) { + return TempPath::CreateFileWith(parent, absl::string_view(), + kDefaultFileMode); +} + +PosixErrorOr TempPath::CreateDirIn(absl::string_view const parent) { + return TempPath::CreateDirWith(parent, kDefaultDirMode); +} + +PosixErrorOr TempPath::CreateFileMode(mode_t mode) { + return TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), absl::string_view(), + mode); +} + +PosixErrorOr TempPath::CreateFile() { + return TempPath::CreateFileIn(GetAbsoluteTestTmpdir()); +} + +PosixErrorOr TempPath::CreateDir() { + return TempPath::CreateDirIn(GetAbsoluteTestTmpdir()); +} + +TempPath::~TempPath() { TryDeleteRecursively(path_); } + +TempPath::TempPath(TempPath&& orig) { reset(orig.release()); } + +TempPath& TempPath::operator=(TempPath&& orig) { + reset(orig.release()); + return *this; +} + +std::string TempPath::reset(std::string newpath) { + std::string path = path_; + TryDeleteRecursively(path_); + path_ = std::move(newpath); + return path; +} + +std::string TempPath::release() { + std::string path = path_; + path_ = std::string(); + return path; +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/temp_path.h b/test/util/temp_path.h new file mode 100644 index 000000000..33eb6a72c --- /dev/null +++ b/test/util/temp_path.h @@ -0,0 +1,134 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_TEMP_PATH_H_ +#define GVISOR_TEST_UTIL_TEMP_PATH_H_ + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +// Returns an absolute path for a file in `dir` that does not yet exist. +// Distinct calls to NewTempAbsPathInDir from the same process, even from +// multiple threads, are guaranteed to return different paths. Distinct calls to +// NewTempAbsPathInDir from different processes are not synchronized. +std::string NewTempAbsPathInDir(absl::string_view base); + +// Like NewTempAbsPathInDir, but the returned path is in the test's temporary +// directory, as provided by the testing framework. +std::string NewTempAbsPath(); + +// Like NewTempAbsPathInDir, but the returned path is relative (to the current +// working directory). 
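A sketch of typical TempPath usage in a test body; the file contents are illustrative. Cleanup is recursive and automatic when the objects go out of scope.

#include <unistd.h>

#include "gtest/gtest.h"
#include "test/util/temp_path.h"
#include "test/util/test_util.h"

namespace gvisor {
namespace testing {
namespace {

TEST(TempPathExample, Sketch) {
  // An empty file under the test tmpdir, mode kDefaultFileMode (0644).
  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());

  // A directory, then a file with explicit contents inside it.
  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
  const TempPath nested = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
      dir.path(), "hello", TempPath::kDefaultFileMode));

  EXPECT_THAT(access(nested.path().c_str(), R_OK), SyscallSucceeds());
  // Both trees are deleted when the TempPath objects are destroyed.
}

}  // namespace
}  // namespace testing
}  // namespace gvisor

(The NewTempRelPath declaration described in the comment above follows, along with the rest of the header.)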
+std::string NewTempRelPath(); + +// Returns the absolute path for the test temp dir. +std::string GetAbsoluteTestTmpdir(); + +// Represents a temporary file or directory. +class TempPath { + public: + // Default creation mode for files. + static constexpr mode_t kDefaultFileMode = 0644; + + // Default creation mode for directories. + static constexpr mode_t kDefaultDirMode = 0755; + + // Creates a temporary file in directory `parent` with mode `mode` and + // contents `content`. + static PosixErrorOr CreateFileWith(absl::string_view parent, + absl::string_view content, + mode_t mode); + + // Creates an empty temporary subdirectory in directory `parent` with mode + // `mode`. + static PosixErrorOr CreateDirWith(absl::string_view parent, + mode_t mode); + + // Creates a temporary symlink in directory `parent` to destination `dest`. + static PosixErrorOr CreateSymlinkTo(absl::string_view parent, + std::string const& dest); + + // Creates an empty temporary file in directory `parent` with mode + // kDefaultFileMode. + static PosixErrorOr CreateFileIn(absl::string_view parent); + + // Creates an empty temporary subdirectory in directory `parent` with mode + // kDefaultDirMode. + static PosixErrorOr CreateDirIn(absl::string_view parent); + + // Creates an empty temporary file in the test's temporary directory with mode + // `mode`. + static PosixErrorOr CreateFileMode(mode_t mode); + + // Creates an empty temporary file in the test's temporary directory with + // mode kDefaultFileMode. + static PosixErrorOr CreateFile(); + + // Creates an empty temporary subdirectory in the test's temporary directory + // with mode kDefaultDirMode. + static PosixErrorOr CreateDir(); + + // Constructs a TempPath that represents nothing. + TempPath() = default; + + // Constructs a TempPath that represents the given path, which will be deleted + // when the TempPath is destroyed. + explicit TempPath(std::string path) : path_(std::move(path)) {} + + // Attempts to delete the represented temporary file or directory (in the + // latter case, also attempts to delete its contents). + ~TempPath(); + + // Attempts to delete the represented temporary file or directory, then + // transfers ownership of the path represented by orig to this TempPath. + TempPath(TempPath&& orig); + TempPath& operator=(TempPath&& orig); + + // Changes the path this TempPath represents. If the TempPath already + // represented a path, deletes and returns that path. Otherwise returns the + // empty std::string. + std::string reset(std::string newpath); + std::string reset() { return reset(""); } + + // Forgets and returns the path this TempPath represents. The path is not + // deleted. + std::string release(); + + // Returns the path this TempPath represents. + std::string path() const { return path_; } + + private: + template + static PosixErrorOr CreateIn(absl::string_view const parent, + F const& f) { + std::string path = NewTempAbsPathInDir(parent); + RETURN_IF_ERRNO(f(path)); + return TempPath(std::move(path)); + } + + std::string path_; +}; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_TEMP_PATH_H_ diff --git a/test/util/test_main.cc b/test/util/test_main.cc new file mode 100644 index 000000000..4c6b5e860 --- /dev/null +++ b/test/util/test_main.cc @@ -0,0 +1,20 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/test_util.h" + +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); + return RUN_ALL_TESTS(); +} diff --git a/test/util/test_util.cc b/test/util/test_util.cc new file mode 100644 index 000000000..7b40260d1 --- /dev/null +++ b/test/util/test_util.cc @@ -0,0 +1,248 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/test_util.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "absl/base/attributes.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "absl/time/time.h" +#include "test/util/fs_util.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +#define TEST_ON_GVISOR "TEST_ON_GVISOR" + +bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; } + +Platform GvisorPlatform() { + // Set by runner.go. + char* env = getenv(TEST_ON_GVISOR); + if (!env) { + return Platform::kNative; + } + if (strcmp(env, "ptrace") == 0) { + return Platform::kPtrace; + } + if (strcmp(env, "kvm") == 0) { + return Platform::kKVM; + } + LOG(FATAL) << "unknown platform " << env; + __builtin_unreachable(); +} + +// Inline cpuid instruction. Preserve %ebx/%rbx register. In PIC compilations +// %ebx contains the address of the global offset table. %rbx is occasionally +// used to address stack variables in presence of dynamic allocas. 
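A sketch of how the platform probes above are consumed by tests: IsRunningOnGvisor() and GvisorPlatform() reflect the TEST_ON_GVISOR environment variable set by the test runner.

#include "gtest/gtest.h"
#include "test/util/test_util.h"

namespace gvisor {
namespace testing {
namespace {

TEST(PlatformExample, Sketch) {
  if (IsRunningOnGvisor()) {
    // TEST_ON_GVISOR was set, so the platform is one of the sentry platforms
    // rather than native Linux.
    EXPECT_NE(GvisorPlatform(), Platform::kNative);
  } else {
    EXPECT_EQ(GvisorPlatform(), Platform::kNative);
  }
}

}  // namespace
}  // namespace testing
}  // namespace gvisor

(The cpuid helper described in the comment above follows.)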
+#if defined(__x86_64__) +#define GETCPUID(a, b, c, d, a_inp, c_inp) \ + asm("mov %%rbx, %%rdi\n" \ + "cpuid\n" \ + "xchg %%rdi, %%rbx\n" \ + : "=a"(a), "=D"(b), "=c"(c), "=d"(d) \ + : "a"(a_inp), "2"(c_inp)) +#endif // defined(__x86_64__) + +CPUVendor GetCPUVendor() { + uint32_t eax, ebx, ecx, edx; + std::string vendor_str; + // Get vendor std::string (issue CPUID with eax = 0) + GETCPUID(eax, ebx, ecx, edx, 0, 0); + vendor_str.append(reinterpret_cast(&ebx), 4); + vendor_str.append(reinterpret_cast(&edx), 4); + vendor_str.append(reinterpret_cast(&ecx), 4); + if (vendor_str == "GenuineIntel") { + return CPUVendor::kIntel; + } else if (vendor_str == "AuthenticAMD") { + return CPUVendor::kAMD; + } + return CPUVendor::kUnknownVendor; +} + +bool operator==(const KernelVersion& first, const KernelVersion& second) { + return first.major == second.major && first.minor == second.minor && + first.micro == second.micro; +} + +PosixErrorOr ParseKernelVersion(absl::string_view vers_str) { + KernelVersion version = {}; + std::vector values = absl::StrSplit(vers_str, absl::ByAnyChar(".-")); + if (values.size() == 2) { + ASSIGN_OR_RETURN_ERRNO(version.major, Atoi(values[0])); + ASSIGN_OR_RETURN_ERRNO(version.minor, Atoi(values[1])); + return version; + } else if (values.size() >= 3) { + ASSIGN_OR_RETURN_ERRNO(version.major, Atoi(values[0])); + ASSIGN_OR_RETURN_ERRNO(version.minor, Atoi(values[1])); + ASSIGN_OR_RETURN_ERRNO(version.micro, Atoi(values[2])); + return version; + } + return PosixError(EINVAL, absl::StrCat("Unknown kernel release: ", vers_str)); +} + +PosixErrorOr GetKernelVersion() { + utsname buf; + RETURN_ERROR_IF_SYSCALL_FAIL(uname(&buf)); + return ParseKernelVersion(buf.release); +} + +void SetupGvisorDeathTest() { +} + +std::string CPUSetToString(const cpu_set_t& set, size_t cpus) { + std::string str = "cpuset["; + for (unsigned int n = 0; n < cpus; n++) { + if (CPU_ISSET(n, &set)) { + if (n != 0) { + absl::StrAppend(&str, " "); + } + absl::StrAppend(&str, n); + } + } + absl::StrAppend(&str, "]"); + return str; +} + +// An overloaded operator<< makes it easy to dump the value of an OpenFd. +std::ostream& operator<<(std::ostream& out, OpenFd const& ofd) { + out << ofd.fd << " -> " << ofd.link; + return out; +} + +// An overloaded operator<< makes it easy to dump a vector of OpenFDs. +std::ostream& operator<<(std::ostream& out, std::vector const& v) { + for (const auto& ofd : v) { + out << ofd << std::endl; + } + return out; +} + +PosixErrorOr> GetOpenFDs() { + // Get the results from /proc/self/fd. + ASSIGN_OR_RETURN_ERRNO(auto dir_list, + ListDir("/proc/self/fd", /*skipdots=*/true)); + + std::vector ret_fds; + for (const auto& str_fd : dir_list) { + OpenFd open_fd = {}; + ASSIGN_OR_RETURN_ERRNO(open_fd.fd, Atoi(str_fd)); + std::string path = absl::StrCat("/proc/self/fd/", open_fd.fd); + + // Resolve the link. + char buf[PATH_MAX] = {}; + int ret = readlink(path.c_str(), buf, sizeof(buf)); + if (ret < 0) { + if (errno == ENOENT) { + // The FD may have been closed, let's be resilient. 
+ continue; + } + + return PosixError( + errno, absl::StrCat("readlink of ", path, " returned errno ", errno)); + } + open_fd.link = std::string(buf, ret); + ret_fds.emplace_back(std::move(open_fd)); + } + return ret_fds; +} + +PosixErrorOr Links(const std::string& path) { + struct stat st; + if (stat(path.c_str(), &st)) { + return PosixError(errno, absl::StrCat("Failed to stat ", path)); + } + return static_cast(st.st_nlink); +} + +void RandomizeBuffer(void* buffer, size_t len) { + struct timespec ts = {}; + clock_gettime(CLOCK_MONOTONIC, &ts); + uint32_t seed = static_cast(ts.tv_nsec); + char* const buf = static_cast(buffer); + for (size_t i = 0; i < len; i++) { + buf[i] = rand_r(&seed) % 255; + } +} + +std::vector> GenerateIovecs(uint64_t total_size, + void* buf, + size_t buflen) { + std::vector> result; + for (uint64_t offset = 0; offset < total_size;) { + auto& iovec_array = *result.emplace(result.end()); + + for (; offset < total_size && iovec_array.size() < IOV_MAX; + offset += buflen) { + struct iovec iov = {}; + iov.iov_base = buf; + iov.iov_len = std::min(total_size - offset, buflen); + iovec_array.push_back(iov); + } + } + + return result; +} + +void SleepSafe(absl::Duration duration) { + if (duration == absl::ZeroDuration()) { + return; + } + + struct timespec ts = absl::ToTimespec(duration); + int ret; + while (1) { + ret = syscall(__NR_nanosleep, &ts, &ts); + if (ret == 0 || (ret <= 0 && errno != EINTR)) { + break; + } + } +} + +uint64_t Megabytes(uint64_t n) { + // Overflow check, upper 20 bits in n shouldn't be set. + CHECK(!(0xfffff00000000000 & n)); + return n << 20; +} + +bool Equivalent(uint64_t current, uint64_t target, double tolerance) { + auto abs_diff = target > current ? target - current : current - target; + return abs_diff <= static_cast(tolerance * target); +} +void TestInit(int* argc, char*** argv) { + ::testing::InitGoogleTest(argc, *argv); + ::gflags::ParseCommandLineFlags(argc, argv, true); + + // Always mask SIGPIPE as it's common and tests aren't expected to handle it. + struct sigaction sa = {}; + sa.sa_handler = SIG_IGN; + TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/test_util.h b/test/util/test_util.h new file mode 100644 index 000000000..2a7609e5c --- /dev/null +++ b/test/util/test_util.h @@ -0,0 +1,794 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Utilities for syscall testing. +// +// Initialization +// ============== +// +// Prior to calling RUN_ALL_TESTS, all tests must use TestInit(&argc, &argv). +// See the TestInit function for exact side-effects and semantics. +// +// Configuration +// ============= +// +// IsRunningOnGvisor returns true if the test is known to be running on gVisor. 
+// GvisorPlatform can be used to get more detail: +// +// switch (GvisorPlatform()) { +// case Platform::kNative: +// case Platform::kGvisor: +// EXPECT_THAT(mmap(...), SyscallSucceeds()); +// break; +// case Platform::kPtrace: +// EXPECT_THAT(mmap(...), SyscallFailsWithErrno(ENOSYS)); +// break; +// } +// +// Matchers +// ======== +// +// ElementOf(xs) matches if the matched value is equal to an element of the +// container xs. Example: +// +// // PASS +// EXPECT_THAT(1, ElementOf({0, 1, 2})); +// +// // FAIL +// // Value of: 3 +// // Expected: one of {0, 1, 2} +// // Actual: 3 +// EXPECT_THAT(3, ElementOf({0, 1, 2})); +// +// SyscallSucceeds() matches if the syscall is successful. A successful syscall +// is defined by either a return value not equal to -1, or a return value of -1 +// with an errno of 0 (which is a possible successful return for e.g. +// PTRACE_PEEK). Example: +// +// // PASS +// EXPECT_THAT(open("/dev/null", O_RDONLY), SyscallSucceeds()); +// +// // FAIL +// // Value of: open("/", O_RDWR) +// // Expected: not -1 (success) +// // Actual: -1 (of type int), with errno 21 (Is a directory) +// EXPECT_THAT(open("/", O_RDWR), SyscallSucceeds()); +// +// SyscallSucceedsWithValue(m) matches if the syscall is successful, and the +// value also matches m. Example: +// +// // PASS +// EXPECT_THAT(read(4, buf, 8192), SyscallSucceedsWithValue(8192)); +// +// // FAIL +// // Value of: read(-1, buf, 8192) +// // Expected: is equal to 8192 +// // Actual: -1 (of type long), with errno 9 (Bad file number) +// EXPECT_THAT(read(-1, buf, 8192), SyscallSucceedsWithValue(8192)); +// +// // FAIL +// // Value of: read(4, buf, 1) +// // Expected: is > 4096 +// // Actual: 1 (of type long) +// EXPECT_THAT(read(4, buf, 1), SyscallSucceedsWithValue(Gt(4096))); +// +// SyscallFails() matches if the syscall is unsuccessful. An unsuccessful +// syscall is defined by a return value of -1 with a non-zero errno. Example: +// +// // PASS +// EXPECT_THAT(open("/", O_RDWR), SyscallFails()); +// +// // FAIL +// // Value of: open("/dev/null", O_RDONLY) +// // Expected: -1 (failure) +// // Actual: 0 (of type int) +// EXPECT_THAT(open("/dev/null", O_RDONLY), SyscallFails()); +// +// SyscallFailsWithErrno(m) matches if the syscall is unsuccessful, and errno +// matches m. Example: +// +// // PASS +// EXPECT_THAT(open("/", O_RDWR), SyscallFailsWithErrno(EISDIR)); +// +// // PASS +// EXPECT_THAT(open("/etc/passwd", O_RDWR | O_DIRECTORY), +// SyscallFailsWithErrno(AnyOf(EACCES, ENOTDIR))); +// +// // FAIL +// // Value of: open("/dev/null", O_RDONLY) +// // Expected: -1 (failure) with errno 21 (Is a directory) +// // Actual: 0 (of type int) +// EXPECT_THAT(open("/dev/null", O_RDONLY), SyscallFailsWithErrno(EISDIR)); +// +// // FAIL +// // Value of: open("/", O_RDWR) +// // Expected: -1 (failure) with errno 22 (Invalid argument) +// // Actual: -1 (of type int), failure, but with errno 21 (Is a directory) +// EXPECT_THAT(open("/", O_RDWR), SyscallFailsWithErrno(EINVAL)); +// +// Because the syscall matchers encode save/restore functionality, their meaning +// should not be inverted via Not. That is, AnyOf(SyscallSucceedsWithValue(1), +// SyscallSucceedsWithValue(2)) is permitted, but not +// Not(SyscallFailsWithErrno(EPERM)). +// +// Syscalls +// ======== +// +// RetryEINTR wraps a function that returns -1 and sets errno on failure +// to be automatically retried when EINTR occurs. 
Example: +// +// auto rv = RetryEINTR(waitpid)(pid, &status, 0); +// +// ReadFd/WriteFd/PreadFd/PwriteFd are interface-compatible wrappers around the +// read/write/pread/pwrite syscalls to handle both EINTR and partial +// reads/writes. Example: +// +// EXPECT_THAT(ReadFd(fd, &buf, size), SyscallSucceedsWithValue(size)); +// +// General Utilities +// ================= +// +// ApplyVec(f, xs) returns a vector containing the result of applying function +// `f` to each value in `xs`. +// +// AllBitwiseCombinations takes a variadic number of ranges containing integers +// and returns a vector containing every integer that can be formed by ORing +// together exactly one integer from each list. List is an alias for +// std::initializer_list that makes AllBitwiseCombinations more ergonomic to +// use with list literals (initializer lists do not otherwise participate in +// template argument deduction). Example: +// +// EXPECT_THAT( +// AllBitwiseCombinations( +// List{SOCK_DGRAM, SOCK_STREAM}, +// List{0, SOCK_NONBLOCK}), +// Contains({SOCK_DGRAM, SOCK_STREAM, SOCK_DGRAM | SOCK_NONBLOCK, +// SOCK_STREAM | SOCK_NONBLOCK})); +// +// VecCat takes a variadic number of containers and returns a vector containing +// the concatenated contents. +// +// VecAppend takes an initial container and a variadic number of containers and +// appends each to the initial container. +// +// RandomizeBuffer will use MTRandom to fill the given buffer with random bytes. +// +// GenerateIovecs will return the smallest number of iovec arrays for writing a +// given total number of bytes to a file, each iovec array size up to IOV_MAX, +// each iovec in each array pointing to the same buffer. + +#ifndef GVISOR_TEST_UTIL_TEST_UTIL_H_ +#define GVISOR_TEST_UTIL_TEST_UTIL_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include // NOLINT: using std::thread::hardware_concurrency(). +#include +#include +#include +#include +#include "gmock/gmock.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/time.h" +#include "test/util/fs_util.h" +#include "test/util/logging.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" + +namespace gvisor { +namespace testing { + +// TestInit must be called prior to RUN_ALL_TESTS. +// +// This parses all arguments and adjusts argc and argv appropriately. +// +// TestInit may create background threads. +void TestInit(int* argc, char*** argv); + +// SKIP_IF may be used to skip a test case. +// +// These cases are still emitted, but a SKIPPED line will appear. 
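A sketch of the skip pattern described above; the root and kernel-version preconditions are illustrative.

#include <unistd.h>

#include "gtest/gtest.h"
#include "test/util/test_util.h"

namespace gvisor {
namespace testing {
namespace {

TEST(SkipExample, Sketch) {
  // Emits a SKIPPED line and returns early when a precondition is not met.
  SKIP_IF(geteuid() != 0);    // Illustrative: the body needs root.
  SKIP_BEFORE_KERNEL(4, 17);  // Illustrative: the body needs a 4.17+ kernel.

  // ... test body that relies on both preconditions ...
}

}  // namespace
}  // namespace testing
}  // namespace gvisor

(The SKIP_IF and SKIP_BEFORE_KERNEL definitions follow.)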
+#define SKIP_IF(expr) \ + do { \ + if (expr) { \ + std::cout << "\033[0;33m[ SKIPPED ]\033[m => " << #expr << std::endl; \ + return; \ + } \ + } while (0) + +#define SKIP_BEFORE_KERNEL(maj, min) \ + do { \ + auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); \ + SKIP_IF(version.major < (maj) || \ + (version.major == (maj) && version.minor < (min))); \ + } while (0) + +enum class Platform { + kNative, + kKVM, + kPtrace, +}; +bool IsRunningOnGvisor(); +Platform GvisorPlatform(); + +void SetupGvisorDeathTest(); + +struct KernelVersion { + int major; + int minor; + int micro; +}; + +bool operator==(const KernelVersion& first, const KernelVersion& second); + +PosixErrorOr ParseKernelVersion(absl::string_view vers_string); +PosixErrorOr GetKernelVersion(); + +static const size_t kPageSize = sysconf(_SC_PAGESIZE); + +enum class CPUVendor { kIntel, kAMD, kUnknownVendor }; + +CPUVendor GetCPUVendor(); + +inline int NumCPUs() { return std::thread::hardware_concurrency(); } + +// Converts cpu_set_t to a std::string for easy examination. +std::string CPUSetToString(const cpu_set_t& set, size_t cpus = CPU_SETSIZE); + +struct OpenFd { + // fd is the open file descriptor number. + int fd = -1; + + // link is the resolution of the symbolic link. + std::string link; +}; + +// Make it easier to log OpenFds to error streams. +std::ostream& operator<<(std::ostream& out, std::vector const& v); +std::ostream& operator<<(std::ostream& out, OpenFd const& ofd); + +// Gets a detailed list of open fds for this process. +PosixErrorOr> GetOpenFDs(); + +// Returns the number of hard links to a path. +PosixErrorOr Links(const std::string& path); + +namespace internal { + +inline std::string ErrnoWithMessage(int const errnum) { + char buf[1024] = {}; + const char* str = strerror_r(errnum, buf, sizeof(buf)); + if (str == nullptr || str[0] == '\0') { + snprintf(buf, sizeof(buf), "Unknown error %d", errnum); + str = buf; + } + return absl::StrCat(errnum, " (", str, ")"); +} + +template +class ElementOfMatcher { + public: + explicit ElementOfMatcher(Container container) + : container_(::std::move(container)) {} + + template + bool MatchAndExplain(T const& rv, + ::testing::MatchResultListener* const listener) const { + using std::count; + return count(container_.begin(), container_.end(), rv) != 0; + } + + void DescribeTo(::std::ostream* const os) const { + *os << "one of {"; + char const* sep = ""; + for (auto const& elem : container_) { + *os << sep << elem; + sep = ", "; + } + *os << "}"; + } + + void DescribeNegationTo(::std::ostream* const os) const { + *os << "none of {"; + char const* sep = ""; + for (auto const& elem : container_) { + *os << sep << elem; + sep = ", "; + } + *os << "}"; + } + + private: + Container const container_; +}; + +template +class SyscallSuccessMatcher { + public: + explicit SyscallSuccessMatcher(E expected) + : expected_(::std::move(expected)) {} + + template + operator ::testing::Matcher() const { + // E is one of three things: + // - T, or a type losslessly and implicitly convertible to T. + // - A monomorphic Matcher. + // - A polymorphic matcher. + // SafeMatcherCast handles any of the above correctly. + // + // Similarly, gMock will invoke this conversion operator to obtain a + // monomorphic matcher (this is how polymorphic matchers are implemented). 
+ return ::testing::MakeMatcher( + new Impl(::testing::SafeMatcherCast(expected_))); + } + + private: + template + class Impl : public ::testing::MatcherInterface { + public: + explicit Impl(::testing::Matcher matcher) + : matcher_(::std::move(matcher)) {} + + bool MatchAndExplain( + T const& rv, + ::testing::MatchResultListener* const listener) const override { + if (rv == static_cast(-1) && errno != 0) { + *listener << "with errno " << ErrnoWithMessage(errno); + return false; + } + bool match = matcher_.MatchAndExplain(rv, listener); + if (match) { + MaybeSave(); + } + return match; + } + + void DescribeTo(::std::ostream* const os) const override { + matcher_.DescribeTo(os); + } + + void DescribeNegationTo(::std::ostream* const os) const override { + matcher_.DescribeNegationTo(os); + } + + private: + ::testing::Matcher matcher_; + }; + + private: + E expected_; +}; + +// A polymorphic matcher equivalent to ::testing::internal::AnyMatcher, except +// not in namespace ::testing::internal, and describing SyscallSucceeds()'s +// match constraints (which are enforced by SyscallSuccessMatcher::Impl). +class AnySuccessValueMatcher { + public: + template + operator ::testing::Matcher() const { + return ::testing::MakeMatcher(new Impl()); + } + + private: + template + class Impl : public ::testing::MatcherInterface { + public: + bool MatchAndExplain( + T const& rv, + ::testing::MatchResultListener* const listener) const override { + return true; + } + + void DescribeTo(::std::ostream* const os) const override { + *os << "not -1 (success)"; + } + + void DescribeNegationTo(::std::ostream* const os) const override { + *os << "-1 (failure)"; + } + }; +}; + +class SyscallFailureMatcher { + public: + explicit SyscallFailureMatcher(::testing::Matcher errno_matcher) + : errno_matcher_(std::move(errno_matcher)) {} + + template + bool MatchAndExplain(T const& rv, + ::testing::MatchResultListener* const listener) const { + if (rv != static_cast(-1)) { + return false; + } + int actual_errno = errno; + *listener << "with errno " << ErrnoWithMessage(actual_errno); + bool match = errno_matcher_.MatchAndExplain(actual_errno, listener); + if (match) { + MaybeSave(); + } + return match; + } + + void DescribeTo(::std::ostream* const os) const { + *os << "-1 (failure), with errno "; + errno_matcher_.DescribeTo(os); + } + + void DescribeNegationTo(::std::ostream* const os) const { + *os << "not -1 (success), with errno "; + errno_matcher_.DescribeNegationTo(os); + } + + private: + ::testing::Matcher errno_matcher_; +}; + +class SpecificErrnoMatcher : public ::testing::MatcherInterface { + public: + explicit SpecificErrnoMatcher(int const expected) : expected_(expected) {} + + bool MatchAndExplain( + int const actual_errno, + ::testing::MatchResultListener* const listener) const override { + return actual_errno == expected_; + } + + void DescribeTo(::std::ostream* const os) const override { + *os << ErrnoWithMessage(expected_); + } + + void DescribeNegationTo(::std::ostream* const os) const override { + *os << "not " << ErrnoWithMessage(expected_); + } + + private: + int const expected_; +}; + +inline ::testing::Matcher SpecificErrno(int const expected) { + return ::testing::MakeMatcher(new SpecificErrnoMatcher(expected)); +} + +} // namespace internal + +template +inline ::testing::PolymorphicMatcher> +ElementOf(Container container) { + return ::testing::MakePolymorphicMatcher( + internal::ElementOfMatcher(::std::move(container))); +} + +template +inline ::testing::PolymorphicMatcher< + 
internal::ElementOfMatcher<::std::vector>> +ElementOf(::std::initializer_list elems) { + return ::testing::MakePolymorphicMatcher( + internal::ElementOfMatcher<::std::vector>(::std::vector(elems))); +} + +template +inline internal::SyscallSuccessMatcher SyscallSucceedsWithValue(E expected) { + return internal::SyscallSuccessMatcher(::std::move(expected)); +} + +inline internal::SyscallSuccessMatcher +SyscallSucceeds() { + return SyscallSucceedsWithValue( + ::gvisor::testing::internal::AnySuccessValueMatcher()); +} + +inline ::testing::PolymorphicMatcher +SyscallFailsWithErrno(::testing::Matcher expected) { + return ::testing::MakePolymorphicMatcher( + internal::SyscallFailureMatcher(::std::move(expected))); +} + +// Overload taking an int so that SyscallFailsWithErrno() uses +// internal::SpecificErrno (which stringifies the errno) rather than +// ::testing::Eq (which doesn't). +inline ::testing::PolymorphicMatcher +SyscallFailsWithErrno(int const expected) { + return SyscallFailsWithErrno(internal::SpecificErrno(expected)); +} + +inline ::testing::PolymorphicMatcher +SyscallFails() { + return SyscallFailsWithErrno(::testing::Gt(0)); +} + +// As of GCC 7.2, -Wall => -Wc++17-compat => -Wnoexcept-type generates an +// irrelevant, non-actionable warning about ABI compatibility when +// RetryEINTRImpl is constructed with a noexcept function, such as glibc's +// syscall(). See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80985. +#if defined(__GNUC__) && !defined(__clang__) && \ + (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnoexcept-type" +#endif + +namespace internal { + +template +struct RetryEINTRImpl { + F const f; + + explicit constexpr RetryEINTRImpl(F f) : f(std::move(f)) {} + + template + auto operator()(Args&&... args) const + -> decltype(f(std::forward(args)...)) { + while (true) { + errno = 0; + auto const ret = f(std::forward(args)...); + if (ret != -1 || errno != EINTR) { + return ret; + } + } + } +}; + +} // namespace internal + +template +constexpr internal::RetryEINTRImpl RetryEINTR(F&& f) { + return internal::RetryEINTRImpl(std::forward(f)); +} + +#if defined(__GNUC__) && !defined(__clang__) && \ + (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)) +#pragma GCC diagnostic pop +#endif + +namespace internal { + +template +ssize_t ApplyFileIoSyscall(F const& f, size_t const count) { + size_t completed = 0; + // `do ... while` because some callers actually want to make a syscall with a + // count of 0. 
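+  //
+  // Hypothetical walk-through: with count == 8, a call transferring 5 bytes
+  // followed by one transferring 3 returns 8; a call returning 0 (EOF) ends
+  // the loop early and the short count is returned; a call returning -1
+  // propagates the error immediately. EINTR never surfaces here because
+  // every attempt goes through RetryEINTR.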
+ do { + auto const cur = RetryEINTR(f)(completed); + if (cur < 0) { + return cur; + } else if (cur == 0) { + break; + } + completed += cur; + } while (completed < count); + return completed; +} + +} // namespace internal + +inline ssize_t ReadFd(int fd, void* buf, size_t count) { + return internal::ApplyFileIoSyscall( + [&](size_t completed) { + return read(fd, static_cast(buf) + completed, count - completed); + }, + count); +} + +inline ssize_t WriteFd(int fd, void const* buf, size_t count) { + return internal::ApplyFileIoSyscall( + [&](size_t completed) { + return write(fd, static_cast(buf) + completed, + count - completed); + }, + count); +} + +inline ssize_t PreadFd(int fd, void* buf, size_t count, off_t offset) { + return internal::ApplyFileIoSyscall( + [&](size_t completed) { + return pread(fd, static_cast(buf) + completed, count - completed, + offset + completed); + }, + count); +} + +inline ssize_t PwriteFd(int fd, void const* buf, size_t count, off_t offset) { + return internal::ApplyFileIoSyscall( + [&](size_t completed) { + return pwrite(fd, static_cast(buf) + completed, + count - completed, offset + completed); + }, + count); +} + +template +using List = std::initializer_list; + +namespace internal { + +template +void AppendAllBitwiseCombinations(std::vector* combinations, T current) { + combinations->push_back(current); +} + +template +void AppendAllBitwiseCombinations(std::vector* combinations, T current, + Arg&& next, Args&&... rest) { + for (auto const option : next) { + AppendAllBitwiseCombinations(combinations, current | option, rest...); + } +} + +inline size_t CombinedSize(size_t accum) { return accum; } + +template +size_t CombinedSize(size_t accum, T const& x, Args&&... xs) { + return CombinedSize(accum + x.size(), std::forward(xs)...); +} + +// Base case: no more containers, so do nothing. +template +void DoMoveExtendContainer(T* c) {} + +// Append each container next to c. +template +void DoMoveExtendContainer(T* c, U&& next, Args&&... rest) { + std::move(std::begin(next), std::end(next), std::back_inserter(*c)); + DoMoveExtendContainer(c, std::forward(rest)...); +} + +} // namespace internal + +template +std::vector AllBitwiseCombinations() { + return std::vector(); +} + +template +std::vector AllBitwiseCombinations(Args&&... args) { + std::vector combinations; + internal::AppendAllBitwiseCombinations(&combinations, 0, args...); + return combinations; +} + +template +std::vector ApplyVec(F const& f, std::vector const& us) { + std::vector vec; + vec.reserve(us.size()); + for (auto const& u : us) { + vec.push_back(f(u)); + } + return vec; +} + +template +std::vector ApplyVecToVec(std::vector> const& fs, + std::vector const& us) { + std::vector vec; + vec.reserve(us.size() * fs.size()); + for (auto const& f : fs) { + for (auto const& u : us) { + vec.push_back(f(u)); + } + } + return vec; +} + +// Moves all elements from the containers `args` to the end of `c`. +template +void VecAppend(T* c, Args&&... args) { + c->reserve(internal::CombinedSize(c->size(), args...)); + internal::DoMoveExtendContainer(c, std::forward(args)...); +} + +// Returns a vector containing the concatenated contents of the containers +// `args`. +template +std::vector VecCat(Args&&... args) { + std::vector combined; + VecAppend(&combined, std::forward(args)...); + return combined; +} + +#define RETURN_ERROR_IF_SYSCALL_FAIL(syscall) \ + do { \ + if ((syscall) < 0 && errno != 0) { \ + return PosixError(errno, #syscall); \ + } \ + } while (false) + +// Fill the given buffer with random bytes. 
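+// (Illustrative only, not part of this header: a typical pattern is to
+// randomize a buffer and write it out with the wrappers above, e.g.
+//   std::vector<char> buf(kPageSize);
+//   RandomizeBuffer(buf.data(), buf.size());
+//   ASSERT_THAT(WriteFd(fd, buf.data(), buf.size()), SyscallSucceeds());
+// for some already-open descriptor fd.)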
+void RandomizeBuffer(void* buffer, size_t len); + +template +inline PosixErrorOr Atoi(absl::string_view str) { + T ret; + if (!absl::SimpleAtoi(str, &ret)) { + return PosixError(EINVAL, "String not a number."); + } + return ret; +} + +inline PosixErrorOr AtoiBase(absl::string_view str, int base) { + if (base > 255 || base < 2) { + return PosixError(EINVAL, "Invalid Base"); + } + + uint64_t ret = 0; + if (!absl::numbers_internal::safe_strtou64_base(str, &ret, base)) { + return PosixError(EINVAL, "String not a number."); + } + + return ret; +} + +inline PosixErrorOr Atod(absl::string_view str) { + double ret; + if (!absl::SimpleAtod(str, &ret)) { + return PosixError(EINVAL, "String not a double type."); + } + return ret; +} + +inline PosixErrorOr Atof(absl::string_view str) { + float ret; + if (!absl::SimpleAtof(str, &ret)) { + return PosixError(EINVAL, "String not a float type."); + } + return ret; +} + +// Return the smallest number of iovec arrays that can be used to write +// "total_bytes" number of bytes, each iovec writing one "buf". +std::vector> GenerateIovecs(uint64_t total_size, + void* buf, size_t buflen); + +// Sleep for at least the specified duration. Avoids glibc. +void SleepSafe(absl::Duration duration); + +// Returns bytes in 'n' megabytes. Used for readability. +uint64_t Megabytes(uint64_t n); + +// Predicate for checking that a value is within some tolerance of another +// value. Returns true iff current is in the range [target * (1 - tolerance), +// target * (1 + tolerance)]. +bool Equivalent(uint64_t current, uint64_t target, double tolerance); + +// Matcher wrapping the Equivalent predicate. +MATCHER_P2(EquivalentWithin, target, tolerance, + std::string(negation ? "Isn't" : "Is") + + ::absl::StrFormat(" within %.2f%% of the target of %zd bytes", + tolerance * 100, target)) { + if (target == 0) { + *result_listener << ::absl::StreamFormat("difference of infinity%%"); + } else { + int64_t delta = static_cast(arg) - static_cast(target); + double delta_percent = + static_cast(delta) / static_cast(target) * 100; + *result_listener << ::absl::StreamFormat("difference of %.2f%%", + delta_percent); + } + return Equivalent(arg, target, tolerance); +} + +void TestInit(int* argc, char*** argv); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_TEST_UTIL_H_ diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc new file mode 100644 index 000000000..5889651d1 --- /dev/null +++ b/test/util/test_util_test.cc @@ -0,0 +1,250 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
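+
+// This file exercises the helpers declared in test/util/test_util.h. As an
+// illustration (hypothetical expectations, not taken from this test), the
+// EquivalentWithin matcher defined there accepts any value within a relative
+// tolerance of the target:
+//
+//   EXPECT_THAT(90, EquivalentWithin(100, 0.1));       // within 10% of 100
+//   EXPECT_THAT(89, Not(EquivalentWithin(100, 0.1)));  // just outside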
+ +#include "test/util/test_util.h" + +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using ::testing::AnyOf; +using ::testing::Gt; +using ::testing::IsEmpty; +using ::testing::Lt; +using ::testing::Not; +using ::testing::TypedEq; +using ::testing::UnorderedElementsAre; +using ::testing::UnorderedElementsAreArray; + +namespace gvisor { +namespace testing { + +namespace { + +TEST(KernelVersionParsing, ValidateParsing) { + KernelVersion v = ASSERT_NO_ERRNO_AND_VALUE( + ParseKernelVersion("4.18.10-1foo2-amd64 baz blah")); + ASSERT_TRUE(v == KernelVersion({4, 18, 10})); + + v = ASSERT_NO_ERRNO_AND_VALUE(ParseKernelVersion("4.18.10-1foo2-amd64")); + ASSERT_TRUE(v == KernelVersion({4, 18, 10})); + + v = ASSERT_NO_ERRNO_AND_VALUE(ParseKernelVersion("4.18.10-14-amd64")); + ASSERT_TRUE(v == KernelVersion({4, 18, 10})); + + v = ASSERT_NO_ERRNO_AND_VALUE(ParseKernelVersion("4.18.10-amd64")); + ASSERT_TRUE(v == KernelVersion({4, 18, 10})); + + v = ASSERT_NO_ERRNO_AND_VALUE(ParseKernelVersion("4.18.10")); + ASSERT_TRUE(v == KernelVersion({4, 18, 10})); + + v = ASSERT_NO_ERRNO_AND_VALUE(ParseKernelVersion("4.0.10")); + ASSERT_TRUE(v == KernelVersion({4, 0, 10})); + + v = ASSERT_NO_ERRNO_AND_VALUE(ParseKernelVersion("4.0")); + ASSERT_TRUE(v == KernelVersion({4, 0, 0})); + + ASSERT_THAT(ParseKernelVersion("4.a"), PosixErrorIs(EINVAL, ::testing::_)); + ASSERT_THAT(ParseKernelVersion("3"), PosixErrorIs(EINVAL, ::testing::_)); + ASSERT_THAT(ParseKernelVersion(""), PosixErrorIs(EINVAL, ::testing::_)); + ASSERT_THAT(ParseKernelVersion("version 3.3.10"), + PosixErrorIs(EINVAL, ::testing::_)); +} + +TEST(MatchersTest, SyscallSucceeds) { + EXPECT_THAT(0, SyscallSucceeds()); + EXPECT_THAT(0L, SyscallSucceeds()); + + errno = 0; + EXPECT_THAT(-1, SyscallSucceeds()); + EXPECT_THAT(-1L, SyscallSucceeds()); + + errno = ENOMEM; + EXPECT_THAT(-1, Not(SyscallSucceeds())); + EXPECT_THAT(-1L, Not(SyscallSucceeds())); +} + +TEST(MatchersTest, SyscallSucceedsWithValue) { + EXPECT_THAT(0, SyscallSucceedsWithValue(0)); + EXPECT_THAT(1, SyscallSucceedsWithValue(Lt(3))); + EXPECT_THAT(-1, Not(SyscallSucceedsWithValue(Lt(3)))); + EXPECT_THAT(4, Not(SyscallSucceedsWithValue(Lt(3)))); + + // Non-int -1 + EXPECT_THAT(-1L, Not(SyscallSucceedsWithValue(0))); + + // Non-int, truncates to -1 if converted to int, with expected value + EXPECT_THAT(0xffffffffL, SyscallSucceedsWithValue(0xffffffffL)); + + // Non-int, truncates to -1 if converted to int, with monomorphic matcher + EXPECT_THAT(0xffffffffL, + SyscallSucceedsWithValue(TypedEq(0xffffffffL))); + + // Non-int, truncates to -1 if converted to int, with polymorphic matcher + EXPECT_THAT(0xffffffffL, SyscallSucceedsWithValue(Gt(1))); +} + +TEST(MatchersTest, SyscallFails) { + EXPECT_THAT(0, Not(SyscallFails())); + EXPECT_THAT(0L, Not(SyscallFails())); + + errno = 0; + EXPECT_THAT(-1, Not(SyscallFails())); + EXPECT_THAT(-1L, Not(SyscallFails())); + + errno = ENOMEM; + EXPECT_THAT(-1, SyscallFails()); + EXPECT_THAT(-1L, SyscallFails()); +} + +TEST(MatchersTest, SyscallFailsWithErrno) { + EXPECT_THAT(0, Not(SyscallFailsWithErrno(EINVAL))); + EXPECT_THAT(0L, Not(SyscallFailsWithErrno(EINVAL))); + + errno = ENOMEM; + EXPECT_THAT(-1, Not(SyscallFailsWithErrno(EINVAL))); + EXPECT_THAT(-1L, Not(SyscallFailsWithErrno(EINVAL))); + + errno = EINVAL; + EXPECT_THAT(-1, SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(-1L, SyscallFailsWithErrno(EINVAL)); + + EXPECT_THAT(-1, SyscallFailsWithErrno(AnyOf(EINVAL, ENOMEM))); + EXPECT_THAT(-1L, 
SyscallFailsWithErrno(AnyOf(EINVAL, ENOMEM))); + + std::vector expected_errnos({EINVAL, ENOMEM}); + errno = ENOMEM; + EXPECT_THAT(-1, SyscallFailsWithErrno(ElementOf(expected_errnos))); + EXPECT_THAT(-1L, SyscallFailsWithErrno(ElementOf(expected_errnos))); +} + +TEST(AllBitwiseCombinationsTest, NoArguments) { + EXPECT_THAT(AllBitwiseCombinations(), IsEmpty()); +} + +TEST(AllBitwiseCombinationsTest, EmptyList) { + EXPECT_THAT(AllBitwiseCombinations(List{}), IsEmpty()); +} + +TEST(AllBitwiseCombinationsTest, SingleElementList) { + EXPECT_THAT(AllBitwiseCombinations(List{5}), UnorderedElementsAre(5)); +} + +TEST(AllBitwiseCombinationsTest, SingleList) { + EXPECT_THAT(AllBitwiseCombinations(List{0, 1, 2, 4}), + UnorderedElementsAre(0, 1, 2, 4)); +} + +TEST(AllBitwiseCombinationsTest, MultipleLists) { + EXPECT_THAT( + AllBitwiseCombinations(List{0, 1, 2, 3}, List{0, 4, 8, 12}), + UnorderedElementsAreArray( + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15})); +} + +TEST(RandomizeBuffer, Works) { + const std::vector original(4096); + std::vector buffer = original; + RandomizeBuffer(buffer.data(), buffer.size()); + EXPECT_NE(buffer, original); +} + +// Enable comparison of vectors of iovec arrays for the following test. +MATCHER_P(IovecsListEq, expected, "") { + if (arg.size() != expected.size()) { + *result_listener << "sizes are different (actual: " << arg.size() + << ", expected: " << expected.size() << ")"; + return false; + } + + for (uint64_t i = 0; i < expected.size(); ++i) { + const std::vector& actual_iovecs = arg[i]; + const std::vector& expected_iovecs = expected[i]; + if (actual_iovecs.size() != expected_iovecs.size()) { + *result_listener << "iovec array size at position " << i + << " is different (actual: " << actual_iovecs.size() + << ", expected: " << expected_iovecs.size() << ")"; + return false; + } + + for (uint64_t j = 0; j < expected_iovecs.size(); ++j) { + const struct iovec& actual_iov = actual_iovecs[j]; + const struct iovec& expected_iov = expected_iovecs[j]; + if (actual_iov.iov_base != expected_iov.iov_base) { + *result_listener << "iovecs in array " << i << " at position " << j + << " are different (expected iov_base: " + << expected_iov.iov_base + << ", got: " << actual_iov.iov_base << ")"; + return false; + } + if (actual_iov.iov_len != expected_iov.iov_len) { + *result_listener << "iovecs in array " << i << " at position " << j + << " are different (expected iov_len: " + << expected_iov.iov_len + << ", got: " << actual_iov.iov_len << ")"; + return false; + } + } + } + + return true; +} + +// Verify empty iovec list generation. +TEST(GenerateIovecs, EmptyList) { + std::vector buffer = {'a', 'b', 'c'}; + + EXPECT_THAT(GenerateIovecs(0, buffer.data(), buffer.size()), + IovecsListEq(std::vector>())); +} + +// Verify generating a single array of only one, partial, iovec. +TEST(GenerateIovecs, OneArray) { + std::vector buffer = {'a', 'b', 'c'}; + + std::vector> expected; + struct iovec iov = {}; + iov.iov_base = buffer.data(); + iov.iov_len = 2; + expected.push_back(std::vector({iov})); + EXPECT_THAT(GenerateIovecs(2, buffer.data(), buffer.size()), + IovecsListEq(expected)); +} + +// Verify that it wraps around after IOV_MAX iovecs. 
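+// With the 3-byte buffer below and total_size = 3 * IOV_MAX + 1, the first
+// array holds IOV_MAX full 3-byte iovecs and the remaining single byte spills
+// over into a second array containing one 1-byte iovec.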
+TEST(GenerateIovecs, WrapsAtIovMax) { + std::vector buffer = {'a', 'b', 'c'}; + + std::vector> expected; + struct iovec iov = {}; + iov.iov_base = buffer.data(); + iov.iov_len = buffer.size(); + expected.emplace_back(); + for (int i = 0; i < IOV_MAX; ++i) { + expected[0].push_back(iov); + } + iov.iov_len = 1; + expected.push_back(std::vector({iov})); + + EXPECT_THAT( + GenerateIovecs(IOV_MAX * buffer.size() + 1, buffer.data(), buffer.size()), + IovecsListEq(expected)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/util/thread_util.h b/test/util/thread_util.h new file mode 100644 index 000000000..df09ac8cf --- /dev/null +++ b/test/util/thread_util.h @@ -0,0 +1,89 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_THREAD_UTIL_H_ +#define GVISOR_TEST_UTIL_THREAD_UTIL_H_ + +#include +#include +#include + +#include +#include + +#include "test/util/logging.h" + +namespace gvisor { +namespace testing { + +// ScopedThread is a minimal wrapper around pthreads. +// +// This is used in lieu of more complex mechanisms because it provides very +// predictable behavior (no messing with timers, etc.) The thread will +// automatically joined when it is destructed (goes out of scope), but can be +// joined manually as well. +class ScopedThread { + public: + // Constructs a thread that executes f exactly once. + explicit ScopedThread(std::function f) : f_(std::move(f)) { + CreateThread(); + } + + explicit ScopedThread(const std::function& f) { + f_ = [=] { + f(); + return nullptr; + }; + CreateThread(); + } + + ScopedThread(const ScopedThread& other) = delete; + ScopedThread& operator=(const ScopedThread& other) = delete; + + // Joins the thread. + ~ScopedThread() { Join(); } + + // Waits until this thread has finished executing. Join is idempotent and may + // be called multiple times, however Join itself is not thread-safe. + void* Join() { + if (!joined_) { + TEST_PCHECK(pthread_join(pt_, &retval_) == 0); + joined_ = true; + } + return retval_; + } + + private: + void CreateThread() { + TEST_PCHECK_MSG( + pthread_create(&pt_, /* attr = */ nullptr, + +[](void* arg) -> void* { + return static_cast(arg)->f_(); + }, + this) == 0, + "thread creation failed"); + } + + std::function f_; + pthread_t pt_; + bool joined_ = false; + void* retval_ = nullptr; +}; + +inline pid_t gettid() { return syscall(SYS_gettid); } + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_THREAD_UTIL_H_ diff --git a/test/util/timer_util.cc b/test/util/timer_util.cc new file mode 100644 index 000000000..681fafb69 --- /dev/null +++ b/test/util/timer_util.cc @@ -0,0 +1,27 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/timer_util.h" + +namespace gvisor { +namespace testing { + +absl::Time Now(clockid_t id) { + struct timespec now; + TEST_PCHECK(clock_gettime(id, &now) == 0); + return absl::TimeFromTimespec(now); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/timer_util.h b/test/util/timer_util.h new file mode 100644 index 000000000..9bdc51a57 --- /dev/null +++ b/test/util/timer_util.h @@ -0,0 +1,74 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_TIMER_UTIL_H_ +#define GVISOR_TEST_UTIL_TIMER_UTIL_H_ + +#include +#include + +#include + +#include "gmock/gmock.h" +#include "absl/time/time.h" +#include "test/util/cleanup.h" +#include "test/util/logging.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// MonotonicTimer is a simple timer that uses a monotic clock. +class MonotonicTimer { + public: + MonotonicTimer() {} + absl::Duration Duration() { + struct timespec ts; + TEST_CHECK(clock_gettime(CLOCK_MONOTONIC, &ts) == 0); + return absl::TimeFromTimespec(ts) - start_; + } + + void Start() { + struct timespec ts; + TEST_CHECK(clock_gettime(CLOCK_MONOTONIC, &ts) == 0); + start_ = absl::TimeFromTimespec(ts); + } + + protected: + absl::Time start_; +}; + +// Sets the given itimer and returns a cleanup function that restores the +// previous itimer when it goes out of scope. +inline PosixErrorOr ScopedItimer(int which, + struct itimerval const& new_value) { + struct itimerval old_value; + int rc = setitimer(which, &new_value, &old_value); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "setitimer failed"); + } + return Cleanup(std::function([which, old_value] { + EXPECT_THAT(setitimer(which, &old_value, nullptr), SyscallSucceeds()); + })); +} + +// Returns the current time. +absl::Time Now(clockid_t id); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_TIMER_UTIL_H_ -- cgit v1.2.3 From 24c1158b9c21f7f8b7126e810d623a518422052e Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 11 Dec 2018 16:11:53 -0800 Subject: Add "trace signal" option This option is effectively equivalent to -panic-signal, except that the sandbox does not die after logging the traceback. 
PiperOrigin-RevId: 225089593 Change-Id: Ifb1c411210110b6104613f404334bd02175e484e --- runsc/boot/config.go | 10 +++++++++- runsc/boot/loader.go | 6 +++++- runsc/main.go | 4 +++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 2d89ad87e..b98e38ae9 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -198,10 +198,17 @@ type Config struct { // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action - // PanicSignal register signal handling that panics. Usually set to + // PanicSignal registers signal handling that panics. Usually set to // SIGUSR2(12) to troubleshoot hangs. -1 disables it. + // + // PanicSignal takes precedence over TraceSignal. PanicSignal int + // TraceSignal registers signal handling that logs a traceback of all + // goroutines. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 + // disables it. + TraceSignal int + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be @@ -228,5 +235,6 @@ func (c *Config) ToFlags() []string { "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), "--watchdog-action=" + c.WatchdogAction.String(), "--panic-signal=" + strconv.Itoa(c.PanicSignal), + "--trace-signal=" + strconv.Itoa(c.TraceSignal), } } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 7cac346c9..a9c549790 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -324,10 +324,14 @@ func New(args Args) (*Loader, error) { // Handle signals by forwarding them to the root container process // (except for panic signal, which should cause a panic). l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) { - // Panic signal should cause a panic. + // Tracing signals should cause their respective actions. if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) { panic("Signal-induced panic") } + if args.Conf.TraceSignal != -1 && sig == linux.Signal(args.Conf.TraceSignal) { + log.TracebackAll("Signal-induced traceback") + return + } // Otherwise forward to root container. deliveryMode := DeliverToProcess diff --git a/runsc/main.go b/runsc/main.go index 81c36067b..013b250f7 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -62,7 +62,8 @@ var ( fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it. This takes precendence over -trace-signal.") + traceSignal = flag.Int("trace-signal", -1, "register signal handling that logs a traceback of all goroutines. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") ) // gitRevision is set during linking. 
@@ -144,6 +145,7 @@ func main() { StraceLogSize: *straceLogSize, WatchdogAction: wa, PanicSignal: *panicSignal, + TraceSignal: *traceSignal, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") -- cgit v1.2.3 From 1775a0e11e56ee619a35b46d3d1561d99095a01c Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 13 Dec 2018 15:36:59 -0800 Subject: container.Destroy should clean up container metadata even if other cleanups fail If the sandbox process is dead (because of a panic or some other problem), container.Destroy will never remove the container metadata file, since it will always fail when calling container.stop(). This CL changes container.Destroy() to always perform the three necessary cleanup operations: * Stop the sandbox and gofer processes. * Remove the container fs on the host. * Delete the container metadata directory. Errors from these three operations will be concatenated and returned from Destroy(). PiperOrigin-RevId: 225448164 Change-Id: I99c6311b2e4fe5f6e2ca991424edf1ebeae9df32 --- runsc/container/container.go | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 80a27df4a..07924d23a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -626,20 +626,36 @@ func (c *Container) Processes() ([]*control.Process, error) { } // Destroy stops all processes and frees all resources associated with the -// container. It fails fast and is idempotent. +// container. func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) + // We must perform the following cleanup steps: + // * stop the container and gofer processes, + // * remove the container filesystem on the host, and + // * delete the container metadata directory. + // + // It's possible for one or more of these steps to fail, but we should + // do our best to perform all of the cleanups. Hence, we keep a slice + // of errors return their concatenation. + var errs []string + if err := c.stop(); err != nil { - return fmt.Errorf("error stopping container: %v", err) + err = fmt.Errorf("error stopping container: %v", err) + log.Warningf("%v", err) + errs = append(errs, err.Error()) } if err := destroyFS(c.Spec); err != nil { - return fmt.Errorf("error destroying container fs: %v", err) + err = fmt.Errorf("error destroying container fs: %v", err) + log.Warningf("%v", err) + errs = append(errs, err.Error()) } if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) + err = fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) + log.Warningf("%v", err) + errs = append(errs, err.Error()) } // "If any poststop hook fails, the runtime MUST log a warning, but the @@ -655,7 +671,11 @@ func (c *Container) Destroy() error { } c.changeStatus(Stopped) - return nil + + if len(errs) == 0 { + return nil + } + return fmt.Errorf(strings.Join(errs, "\n")) } // save saves the container metadata to a file. -- cgit v1.2.3 From 2421006426445a1827422c2dbdd6fc6a47087147 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 17 Dec 2018 11:37:38 -0800 Subject: Implement mlock(), kind of. Currently mlock() and friends do nothing whatsoever. However, mlocking is directly application-visible in a number of ways; for example, madvise(MADV_DONTNEED) and msync(MS_INVALIDATE) both fail on mlocked regions. 
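As a concrete illustration of that application-visible behavior, here is a hypothetical test sketch (not part of this change; the real tests live in the new test/syscalls/linux/mlock.cc) written against the matchers from test/util/test_util.h, assuming RLIMIT_MEMLOCK permits locking one page. The errno values mirror what the Decommit and MSync changes below return: EINVAL for MADV_DONTNEED and EBUSY for MS_INVALIDATE on an mlocked range.

#include <errno.h>
#include <sys/mman.h>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "test/util/test_util.h"

namespace gvisor {
namespace testing {
namespace {

TEST(MlockVisibility, DontneedAndInvalidateAreRefused) {
  void* addr = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  ASSERT_NE(addr, MAP_FAILED);
  ASSERT_THAT(mlock(addr, kPageSize), SyscallSucceeds());

  // Both operations are rejected while the range is locked.
  EXPECT_THAT(madvise(addr, kPageSize, MADV_DONTNEED),
              SyscallFailsWithErrno(EINVAL));
  EXPECT_THAT(msync(addr, kPageSize, MS_INVALIDATE),
              SyscallFailsWithErrno(EBUSY));

  // After unlocking, both succeed again.
  ASSERT_THAT(munlock(addr, kPageSize), SyscallSucceeds());
  EXPECT_THAT(madvise(addr, kPageSize, MADV_DONTNEED), SyscallSucceeds());
  EXPECT_THAT(munmap(addr, kPageSize), SyscallSucceeds());
}

}  // namespace
}  // namespace testing
}  // namespace gvisor
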
We handle this inconsistently: MADV_DONTNEED is too important to not work, but MS_INVALIDATE is rejected. Change MM to track mlocked regions in a manner consistent with Linux. It still will not actually pin pages into host physical memory, but: - mlock() will now cause sentry memory management to precommit mlocked pages. - MADV_DONTNEED and MS_INVALIDATE will interact with mlocked pages as described above. PiperOrigin-RevId: 225861605 Change-Id: Iee187204979ac9a4d15d0e037c152c0902c8d0ee --- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/mm.go | 12 + pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/memmap/memmap.go | 37 +++ pkg/sentry/mm/BUILD | 1 + pkg/sentry/mm/address_space.go | 12 +- pkg/sentry/mm/lifecycle.go | 24 +- pkg/sentry/mm/mm.go | 24 +- pkg/sentry/mm/syscalls.go | 423 +++++++++++++++++++++++++++----- pkg/sentry/mm/vma.go | 38 +++ pkg/sentry/syscalls/linux/linux64.go | 15 +- pkg/sentry/syscalls/linux/sys_mmap.go | 106 +++++--- pkg/sentry/syscalls/linux/sys_rlimit.go | 1 + runsc/boot/limits.go | 4 +- test/syscalls/linux/BUILD | 15 ++ test/syscalls/linux/mlock.cc | 344 ++++++++++++++++++++++++++ test/syscalls/linux/msync.cc | 20 +- 18 files changed, 947 insertions(+), 135 deletions(-) create mode 100644 test/syscalls/linux/mlock.cc (limited to 'runsc') diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index b2e51b9bd..e0aa5b31d 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -60,7 +60,7 @@ const ( DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. - DefaultMemlockLimit = 64 * 1094 + DefaultMemlockLimit = 64 * 1024 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index 3fcdf8235..eda8d9788 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -49,6 +49,18 @@ const ( MREMAP_FIXED = 1 << 1 ) +// Flags for mlock2(2). +const ( + MLOCK_ONFAULT = 0x01 +) + +// Flags for mlockall(2). +const ( + MCL_CURRENT = 1 + MCL_FUTURE = 2 + MCL_ONFAULT = 4 +) + // Advice for madvise(2). const ( MADV_NORMAL = 0 diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index ba0b7d4fd..eeca01876 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryPagesLocked + MemoryLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 511db6733..295f9c398 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryPagesLocked, + linux.RLIMIT_MEMLOCK: MemoryLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 28e2bed9b..cf20b11e3 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,6 +243,40 @@ type MappingIdentity interface { Msync(ctx context.Context, mr MappableRange) error } +// MLockMode specifies the memory locking behavior of a memory mapping. +type MLockMode int + +// Note that the ordering of MLockModes is significant; see +// mm.MemoryManager.defMLockMode. +const ( + // MLockNone specifies that a mapping has no memory locking behavior. + // + // This must be the zero value for MLockMode. 
+ MLockNone MLockMode = iota + + // MLockEager specifies that a mapping is memory-locked, as by mlock() or + // similar. Pages in the mapping should be made, and kept, resident in + // physical memory as soon as possible. + // + // As of this writing, MLockEager does not cause memory-locking to be + // requested from the host; it only affects the sentry's memory management + // behavior. + // + // MLockEager is analogous to Linux's VM_LOCKED. + MLockEager + + // MLockLazy specifies that a mapping is memory-locked, as by mlock() or + // similar. Pages in the mapping should be kept resident in physical memory + // once they have been made resident due to e.g. a page fault. + // + // As of this writing, MLockLazy does not cause memory-locking to be + // requested from the host; in fact, it has virtually no effect, except for + // interactions between mlocked pages and other syscalls. + // + // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. + MLockLazy +) + // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -303,6 +337,9 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool + // MLockMode specifies the memory locking behavior of the mapping. + MLockMode MLockMode + // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 744e73a39..5a9185e5d 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,6 +106,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 7488f7c4a..e7aa24c69 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg.Range().Contains(ar.Start). +// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. @@ -173,7 +173,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - for { + // Since this checks ar.End and not mapAR.End, we will never map a pma that + // is not required. + for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -184,13 +186,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } - // Since this checks ar.End and not mapAR.End, we will never map a pma - // that is not required. - if ar.End <= pmaAR.End { - return nil - } pseg = pseg.NextSegment() } + return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. 
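For orientation, the modes above correspond to the following application-side calls (a rough sketch, not code from this change; it assumes <sys/syscall.h> defines SYS_mlock2, available since Linux 4.4, and mlock2() is issued via syscall(2) because older glibc has no wrapper; the flag fallbacks mirror the constants added to pkg/abi/linux/mm.go above in case the system headers predate them).

#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

// Fallbacks mirroring the values added to pkg/abi/linux/mm.go above.
#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01
#endif
#ifndef MCL_ONFAULT
#define MCL_ONFAULT 4
#endif

// mlock2(2) has no wrapper in older glibc; invoke it by syscall number.
static int Mlock2(void* addr, size_t len, int flags) {
  return syscall(SYS_mlock2, addr, len, flags);
}

static void MLockModeExamples(void* addr, size_t len) {
  mlock(addr, len);                      // MLockEager for this range
  Mlock2(addr, len, MLOCK_ONFAULT);      // MLockLazy for this range
  mlockall(MCL_CURRENT | MCL_FUTURE);    // MLockEager now and for future vmas
  mlockall(MCL_CURRENT | MCL_ONFAULT);   // MLockLazy for current vmas
  munlock(addr, len);                    // back to MLockNone for this range
  munlockall();                          // back to MLockNone everywhere
}
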
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 1613ce11d..a42e32b43 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -58,13 +59,17 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - usageAS: mm.usageAS, - brk: mm.brk, + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + brk: mm.brk, + usageAS: mm.usageAS, + // "The child does not inherit its parent's memory locks (mlock(2), + // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is + // MLockNone, both of which are zero values. vma.mlockMode is reset + // when copied below. captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -77,7 +82,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.ValuePtr() + vma := srcvseg.Value() // makes a copy of the vma vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { @@ -89,7 +94,8 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() + vma.mlockMode = memmap.MLockNone + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index b1e39e898..c0632d232 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,11 +95,6 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. - // - // usageAS is protected by mappingMu. - usageAS uint64 - // brk is the mm's brk, which is manipulated using the brk(2) system call. // The brk is initially set up by the loader which maps an executable // binary into the mm. @@ -107,6 +102,23 @@ type MemoryManager struct { // brk is protected by mappingMu. brk usermem.AddrRange + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. + usageAS uint64 + + // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != + // memmap.MLockNone. + // + // lockedAS is protected by mappingMu. + lockedAS uint64 + + // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or + // defMLockMode is greater. + // + // defMLockMode is protected by mappingMu. + defMLockMode memmap.MLockMode + // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. activeMu ssync.DowngradableRWMutex `state:"nosave"` @@ -252,6 +264,8 @@ type vma struct { // metag, none of which we currently support. 
growsDown bool `state:"manual"` + mlockMode memmap.MLockMode + // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index daaae4da1..383703ec3 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,6 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -128,16 +129,24 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() + if opts.MLockMode < mm.defMLockMode { + opts.MLockMode = mm.defMLockMode + } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } + // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new + // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears + // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in + // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => + // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit: + case opts.Precommit || opts.MLockMode == memmap.MLockEager: // Get pmas and map with precommit as requested. - mm.populateAndUnlock(ctx, vseg, ar, true) + mm.populateVMAAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -146,7 +155,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. - mm.populateAndUnlock(ctx, vseg, ar, false) + mm.populateVMAAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -155,31 +164,29 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// Preconditions: mm.mappingMu must be locked for writing. +// populateVMA obtains pmas for addresses in ar in the given vma, and maps them +// into mm.as if it is active. // -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. - mm.mappingMu.Unlock() return } mm.activeMu.Lock() + // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get a new pma, we can't actually map it if we don't have an + // Even if we get new pmas, we can't actually map them if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() - mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. - mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). 
If it matters, we'll get it again when @@ -197,6 +204,45 @@ func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator mm.activeMu.RUnlock() } +// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally +// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is +// preferable to populateVMA since it unlocks mm.mappingMu before performing +// expensive operations that don't require it to be locked. +// +// Preconditions: mm.mappingMu must be locked for writing. +// vseg.Range().IsSupersetOf(ar). +// +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + // See populateVMA above for commentary. + if !vseg.ValuePtr().effectivePerms.Any() { + mm.mappingMu.Unlock() + return + } + + mm.activeMu.Lock() + + if mm.as == nil { + mm.activeMu.Unlock() + mm.mappingMu.Unlock() + return + } + + // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it + // isn't needed at all for mapASLocked. + mm.mappingMu.DowngradeLock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return + } + + mm.activeMu.DowngradeLock() + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -236,6 +282,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, + MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -334,6 +381,19 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. + if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, + // mremap in Linux does not check mm/mlock.c:can_do_mlock() and + // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and + // !CAP_IPC_LOCK. + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { + return 0, syserror.EAGAIN + } + } + } + if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -360,7 +420,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -371,9 +431,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, + MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, ar, true) + } return oldAddr, nil } // In-place growth failed. 
In the MRemapMayMove case, fall through to @@ -462,8 +526,14 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - mm.vmas.Add(newAR, vma) + vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS += uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS += uint64(newAR.Length()) + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + } return newAR.Start, nil } @@ -485,8 +555,11 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - mm.vmas.Add(newAR, vma) + vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -501,6 +574,10 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + return newAR.Start, nil } @@ -611,9 +688,10 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - defer mm.mappingMu.Unlock() + // Can't defer mm.mappingMu.Unlock(); see below. if addr < mm.brk.Start { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -623,21 +701,24 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -646,17 +727,221 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - Hint: "[heap]", + // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes + // mm->def_flags. + MLockMode: mm.defMLockMode, + Hint: "[heap]", }) if err != nil { + mm.mappingMu.Unlock() return mm.brk.End, err } + if mm.defMLockMode == memmap.MLockEager { + mm.populateVMAAndUnlock(ctx, vseg, ar, true) + } else { + mm.mappingMu.Unlock() + } + + default: + // Nothing to do. + mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } +// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), +// depending on mode. +func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { + // Linux allows this to overflow. 
+ la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() + ar, ok := addr.RoundDown().ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + + // Check this after RLIMIT_MEMLOCK for consistency with Linux. + if ar.Length() == 0 { + mm.mappingMu.Unlock() + return nil + } + + // Apply the new mlock mode to vmas. + var unmapped bool + vseg := mm.vmas.FindSegment(ar.Start) + for { + if !vseg.Ok() { + unmapped = true + break + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = mode + if mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + if ar.End <= vseg.End() { + break + } + vseg, _ = vseg.NextNonEmpty() + } + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + if unmapped { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + + if mode == memmap.MLockEager { + // Ensure that we have usable pmas. Since we didn't return ENOMEM + // above, ar must be fully covered by vmas, so we can just use + // NextSegment below. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this + // case, which is converted to ENOMEM by mlock. + mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) + if err != nil { + mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + // Linux: mm/mlock.c:__mlock_posix_error_return() + if err == syserror.EFAULT { + return syserror.ENOMEM + } + if err == syserror.ENOMEM { + return syserror.EAGAIN + } + return err + } + } + + // Map pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) + mm.activeMu.RUnlock() + if err != nil { + return err + } + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + + return nil +} + +// MLockAllOpts holds options to MLockAll. +type MLockAllOpts struct { + // If Current is true, change the memory-locking behavior of all mappings + // to Mode. If Future is true, upgrade the memory-locking behavior of all + // future mappings to Mode. At least one of Current or Future must be true. + Current bool + Future bool + Mode memmap.MLockMode +} + +// MLockAll implements the semantics of Linux's mlockall()/munlockall(), +// depending on opts. +func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { + if !opts.Current && !opts.Future { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. 
+ + if opts.Current { + if opts.Mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if uint64(mm.vmas.Span()) > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = opts.Mode + if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + } + } + + if opts.Future { + mm.defMLockMode = opts.Mode + } + + if opts.Current && opts.Mode == memmap.MLockEager { + // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() + // ignores the return value of __mm_populate(), so all errors below are + // ignored. + // + // Try to get usable pmas. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vseg.ValuePtr().effectivePerms.Any() { + mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) + } + } + + // Map all pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) + mm.activeMu.RUnlock() + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + return nil +} + // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -680,46 +965,49 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for pseg.Ok() && pseg.Start() < ar.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - vseg = vseg.seekNextLowerBound(psegAR.Start) - if checkInvariants { - if !vseg.Ok() { - panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) - } - if psegAR.Start < vseg.Start() { - panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) - } + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + if vma.mlockMode != memmap.MLockNone { + return syserror.EINVAL + } + vsegAR := vseg.Range().Intersect(ar) + // pseg should already correspond to either this vma or a later one, + // since there can't be a pma without a corresponding vma. 
+ if checkInvariants { + if pseg.Ok() && pseg.End() <= vsegAR.Start { + panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) } - if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue + } + for pseg.Ok() && pseg.Start() < vsegAR.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. } - // If an error occurs, fall through to the general - // invalidation case below. } + pseg = mm.pmas.Isolate(pseg, vsegAR) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + pseg = mm.pmas.Remove(pseg).NextSegment() } - pseg = mm.pmas.Isolate(pseg, ar) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). - mm.unmapASLocked(ar) - didUnmapAS = true - } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) - } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - - pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -732,9 +1020,28 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// Sync implements the semantics of Linux's msync(MS_SYNC). -func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { - ar, ok := addr.ToRange(length) +// MSyncOpts holds options to MSync. +type MSyncOpts struct { + // Sync has the semantics of MS_SYNC. + Sync bool + + // Invalidate has the semantics of MS_INVALIDATE. + Invalidate bool +} + +// MSync implements the semantics of Linux's msync(). +func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { + if addr != addr.RoundDown() { + return syserror.EINVAL + } + if length == 0 { + return nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) if !ok { return syserror.ENOMEM } @@ -759,10 +1066,14 @@ func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uin } lastEnd = vseg.End() vma := vseg.ValuePtr() + if opts.Invalidate && vma.mlockMode != memmap.MLockNone { + mm.mappingMu.RUnlock() + return syserror.EBUSY + } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. 
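MSync now performs the argument validation that previously lived in the syscall wrapper: an unaligned address is EINVAL, a zero length is a no-op, and the length is rounded up to whole pages before the vma walk. A standalone sketch of those entry checks, assuming a fixed 4 KiB page size purely for illustration:

package main

import (
	"errors"
	"fmt"
)

const pageSize = 4096 // assumed page size for the example

var (
	errInval = errors.New("EINVAL")
	errNomem = errors.New("ENOMEM")
)

// checkMsyncRange mirrors MSync's entry checks: unaligned addresses are
// rejected, zero lengths succeed trivially, and the length is rounded up to a
// page boundary (with overflow reported as ENOMEM).
func checkMsyncRange(addr, length uint64) (start, end uint64, err error) {
	if addr%pageSize != 0 {
		return 0, 0, errInval
	}
	if length == 0 {
		return addr, addr, nil
	}
	la := (length + pageSize - 1) &^ (pageSize - 1)
	if la < length { // rounding up overflowed
		return 0, 0, errNomem
	}
	end = addr + la
	if end < addr { // range wraps the address space
		return 0, 0, errNomem
	}
	return addr, end, nil
}

func main() {
	start, end, err := checkMsyncRange(0x1000, 5000)
	fmt.Println(start, end, err) // 4096 12288 <nil>
}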
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 5c2c802f6..28ba9f2f5 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,8 +17,10 @@ package mm import ( "fmt" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -53,6 +55,23 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } + if opts.MLockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM + } + newLockedAS := mm.lockedAS + opts.Length + if opts.Unmap { + newLockedAS -= mm.mlockedBytesRangeLocked(ar) + } + if newLockedAS > mlockLimit { + return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN + } + } + } + // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -85,10 +104,14 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, + mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length + if opts.MLockMode != memmap.MLockNone { + mm.lockedAS += opts.Length + } return vseg, ar, nil } @@ -201,6 +224,17 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { + var total uint64 + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if vseg.ValuePtr().mlockMode != memmap.MLockNone { + total += uint64(vseg.Range().Intersect(ar).Length()) + } + } + return total +} + // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). 
It returns: // @@ -338,6 +372,9 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS -= uint64(vmaAR.Length()) + } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -368,6 +405,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || + vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 2aab948da..cc5ebb955 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: syscalls.Error(nil), // Mlock, TODO - 150: syscalls.Error(nil), // Munlock, TODO - 151: syscalls.Error(nil), // Mlockall, TODO - 152: syscalls.Error(nil), // Munlockall, TODO + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: Mlock, + 150: Munlock, + 151: Mlockall, + 152: Munlockall, 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,8 +373,9 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - // Syscalls after 325 are backports from 4.6. - 325: syscalls.Error(nil), // Mlock2, TODO + 325: Mlock2, + // Syscalls after 325 are "backports" from versions of Linux after 4.4. + // 326: CopyFileRange, 327: Preadv2, // 328: Pwritev2, // Pwritev2, TODO }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 145f7846c..8732861e0 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,6 +69,9 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -384,16 +387,6 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() - if addr != addr.RoundDown() { - return 0, nil, syserror.EINVAL - } - if length == 0 { - return 0, nil, nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return 0, nil, syserror.ENOMEM - } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -406,39 +399,72 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } + err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ + Sync: sync, + Invalidate: flags&linux.MS_INVALIDATE != 0, + }) + // MSync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. 
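With table entries 149-152 and 325 now routed to real implementations, guest applications get working mlock-family calls instead of stubs. A short host-side illustration of the application-visible behavior these entries serve, written against the standard library syscall package (Linux only):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// Map one anonymous read/write page, roughly what the C++ tests' MmapAnon
	// helper does.
	b, err := syscall.Mmap(-1, 0, 4096,
		syscall.PROT_READ|syscall.PROT_WRITE,
		syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS)
	if err != nil {
		panic(err)
	}
	defer syscall.Munmap(b)

	// mlock()/munlock() now reach MemoryManager.MLock rather than returning
	// success without doing anything.
	if err := syscall.Mlock(b); err != nil {
		// Typically EPERM or ENOMEM when RLIMIT_MEMLOCK is zero or too small.
		fmt.Println("mlock:", err)
		return
	}
	fmt.Println("locked one page")
	fmt.Println("munlock:", syscall.Munlock(b))
}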
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Mlock implements linux syscall mlock(2). +func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) +} - // MS_INVALIDATE "asks to invalidate other mappings of the same file (so - // that they can be updated with the fresh values just written)". This is a - // no-op given that shared memory exists. However, MS_INVALIDATE can also - // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, - // and a memory lock exists for the specified address range." Given that - // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since - // some user program could be using it for synchronization. - if flags&linux.MS_INVALIDATE != 0 { +// Mlock2 implements linux syscall mlock2(2). +func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if flags&^(linux.MLOCK_ONFAULT) != 0 { return 0, nil, syserror.EINVAL } - // MS_SYNC "requests an update and waits for it to complete." - if sync { - err := t.MemoryManager().Sync(t, addr, uint64(la)) - // Sync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) - } - // MS_ASYNC "specifies that an update be scheduled, but the call returns - // immediately". As long as dirty pages are tracked and eventually written - // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC - // is in fact a no-op, since the kernel properly tracks dirty pages and - // flushes them to storage as necessary.") - // - // However: "ENOMEM: The indicated memory (or part of it) was not mapped." - // This applies even for MS_ASYNC. - ar, ok := addr.ToRange(uint64(la)) - if !ok { - return 0, nil, syserror.ENOMEM + + mode := memmap.MLockEager + if flags&linux.MLOCK_ONFAULT != 0 { + mode = memmap.MLockLazy } - mapped := t.MemoryManager().VirtualMemorySizeRange(ar) - if mapped != uint64(la) { - return 0, nil, syserror.ENOMEM + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) +} + +// Munlock implements linux syscall munlock(2). +func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) +} + +// Mlockall implements linux syscall mlockall(2). +func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + + if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL } - return 0, nil, nil + + mode := memmap.MLockEager + if flags&linux.MCL_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: flags&linux.MCL_CURRENT != 0, + Future: flags&linux.MCL_FUTURE != 0, + Mode: mode, + }) +} + +// Munlockall implements linux syscall munlockall(2). 
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: true, + Future: true, + Mode: memmap.MLockNone, + }) } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 2f16e1791..b0b216045 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,6 +90,7 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, + limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 8ecda6d0e..e3e716bf9 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_DATA": limits.Data, "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, + "RLIMIT_MEMLOCK": limits.MemoryLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, "RLIMIT_NOFILE": limits.NumberOfFiles, @@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) - ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 784997c18..aca55f492 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1019,6 +1019,21 @@ cc_binary( ], ) +cc_binary( + name = "mlock_test", + testonly = 1, + srcs = ["mlock.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "mmap_test", testonly = 1, diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc new file mode 100644 index 000000000..a0d876c2e --- /dev/null +++ b/test/syscalls/linux/mlock.cc @@ -0,0 +1,344 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
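createLimitSet seeds RLIMIT_MEMLOCK at the Linux default of 64 KiB, so locking more than a handful of pages requires either CAP_IPC_LOCK or a larger limit in the container spec. The tests below adjust the soft limit with ScopedSetSoftRlimit; a rough Go equivalent for a host process is sketched here, with RLIMIT_MEMLOCK's resource number (8 on Linux) hard-coded because the stdlib syscall package does not export it:

package main

import (
	"fmt"
	"syscall"
)

// rlimitMemlock is RLIMIT_MEMLOCK's resource number on Linux; it is spelled
// out here because package syscall does not export the constant.
const rlimitMemlock = 8

func main() {
	var rl syscall.Rlimit
	if err := syscall.Getrlimit(rlimitMemlock, &rl); err != nil {
		panic(err)
	}
	fmt.Printf("RLIMIT_MEMLOCK soft=%d hard=%d\n", rl.Cur, rl.Max)

	// An unprivileged process may raise the soft limit up to the hard limit;
	// raising the hard limit itself requires CAP_SYS_RESOURCE.
	rl.Cur = rl.Max
	if err := syscall.Setrlimit(rlimitMemlock, &rl); err != nil {
		fmt.Println("setrlimit:", err)
	}
}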
+
+#include <errno.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/test_util.h"
+
+using ::testing::_;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<bool> CanMlock() {
+  struct rlimit rlim;
+  if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
+    return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)");
+  }
+  if (rlim.rlim_cur != 0) {
+    return true;
+  }
+  return HaveCapability(CAP_IPC_LOCK);
+}
+
+// Returns true if the page containing addr is mlocked.
+bool IsPageMlocked(uintptr_t addr) {
+  // This relies on msync(MS_INVALIDATE) interacting correctly with mlocked
+  // pages, which is tested for by the MsyncInvalidate case below.
+  int const rv = msync(reinterpret_cast<void*>(addr & ~(kPageSize - 1)),
+                       kPageSize, MS_ASYNC | MS_INVALIDATE);
+  if (rv == 0) {
+    return false;
+  }
+  // This uses TEST_PCHECK_MSG since it's used in subprocesses.
+  TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno");
+  return true;
+}
+
+PosixErrorOr<Cleanup> ScopedSetSoftRlimit(int resource, rlim_t newval) {
+  struct rlimit old_rlim;
+  if (getrlimit(resource, &old_rlim) != 0) {
+    return PosixError(errno, "getrlimit failed");
+  }
+  struct rlimit new_rlim = old_rlim;
+  new_rlim.rlim_cur = newval;
+  if (setrlimit(resource, &new_rlim) != 0) {
+    return PosixError(errno, "setrlimit failed");
+  }
+  return Cleanup([resource, old_rlim] {
+    TEST_PCHECK(setrlimit(resource, &old_rlim) == 0);
+  });
+}
+
+TEST(MlockTest, Basic) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+  auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+  EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+  ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+  EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+}
+
+TEST(MlockTest, ProtNone) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+  auto const mapping =
+      ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
+  EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+  ASSERT_THAT(mlock(mapping.ptr(), mapping.len()),
+              SyscallFailsWithErrno(ENOMEM));
+  // ENOMEM is returned because mlock can't populate the page, but it's still
+  // considered locked.
+ EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(MlockTest, MadviseDontneed) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MlockTest, MsyncInvalidate) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE), + SyscallFailsWithErrno(EBUSY)); + EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE), + SyscallFailsWithErrno(EBUSY)); +} + +TEST(MlockTest, Fork) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + EXPECT_THAT( + InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(MlockTest, RlimitMemlockZero) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(EPERM)); +} + +TEST(MlockTest, RlimitMemlockInsufficient) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(ENOMEM)); +} + +TEST(MunlockTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +TEST(MunlockTest, NotLocked) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +// There is currently no test for mlockall(MCL_CURRENT) because the default +// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke +// mlockall(MCL_CURRENT). 
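IsPageMlocked leans on the msync(MS_ASYNC | MS_INVALIDATE) => EBUSY behavior that the MsyncInvalidate test pins down. The same probe can be expressed against the raw syscall interface; the Go sketch below assumes linux/amd64 and takes the page size from os.Getpagesize.

package main

import (
	"fmt"
	"os"
	"syscall"
	"unsafe"
)

// isPageMlocked reports whether the page containing b[0] is mlocked, using the
// same msync(MS_ASYNC|MS_INVALIDATE)-returns-EBUSY probe as the tests.
func isPageMlocked(b []byte) bool {
	pageSize := uintptr(os.Getpagesize())
	addr := uintptr(unsafe.Pointer(&b[0])) &^ (pageSize - 1)
	_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, addr, pageSize,
		uintptr(syscall.MS_ASYNC|syscall.MS_INVALIDATE))
	return errno == syscall.EBUSY
}

func main() {
	b, err := syscall.Mmap(-1, 0, os.Getpagesize(),
		syscall.PROT_READ|syscall.PROT_WRITE,
		syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS)
	if err != nil {
		panic(err)
	}
	defer syscall.Munmap(b)

	fmt.Println(isPageMlocked(b)) // false
	if err := syscall.Mlock(b); err == nil {
		fmt.Println(isPageMlocked(b)) // true
	}
}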
+ +TEST(MlockallTest, Future) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + + // Run this test in a separate (single-threaded) subprocess to ensure that a + // background thread doesn't try to mmap a large amount of memory, fail due + // to hitting RLIMIT_MEMLOCK, and explode the process violently. + EXPECT_THAT(InForkedProcess([] { + auto const mapping = + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE) + .ValueOrDie(); + TEST_CHECK(!IsPageMlocked(mapping.addr())); + TEST_PCHECK(mlockall(MCL_FUTURE) == 0); + // Ensure that mlockall(MCL_FUTURE) is turned off before the end + // of the test, as otherwise mmaps may fail unexpectedly. + Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); }); + auto const mapping2 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + TEST_CHECK(IsPageMlocked(mapping2.addr())); + // Fire munlockall() and check that it disables + // mlockall(MCL_FUTURE). + do_munlockall.Release()(); + auto const mapping3 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + TEST_CHECK(!IsPageMlocked(mapping2.addr())); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(MunlockallTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(munlockall(), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +#ifndef SYS_mlock2 +#ifdef __x86_64__ +#define SYS_mlock2 325 +#endif +#endif + +#ifndef MLOCK_ONFAULT +#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h +#endif + +#ifdef SYS_mlock2 + +int mlock2(void const* addr, size_t len, int flags) { + return syscall(SYS_mlock2, addr, len, flags); +} + +TEST(Mlock2Test, NoFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(Mlock2Test, MlockOnfault) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT), + SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(Mlock2Test, UnknownFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0), + SyscallFailsWithErrno(EINVAL)); +} + +#endif // defined(SYS_mlock2) + +TEST(MapLockedTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +TEST(MapLockedTest, RlimitMemlockZero) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + 
ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + EXPECT_THAT( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), + PosixErrorIs(EPERM, _)); +} + +TEST(MapLockedTest, RlimitMemlockInsufficient) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); + EXPECT_THAT( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), + PosixErrorIs(EAGAIN, _)); +} + +TEST(MremapLockedTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + if (addr == MAP_FAILED) { + FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")"; + } + mapping.release(); + mapping.reset(addr, 2 * mapping.len()); + EXPECT_TRUE(IsPageMlocked(reinterpret_cast(addr))); +} + +TEST(MremapLockedTest, RlimitMemlockZero) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) + << "addr = " << addr << ", errno = " << errno; +} + +TEST(MremapLockedTest, RlimitMemlockInsufficient) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE( + ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len())); + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) + << "addr = " << addr << ", errno = " << errno; +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 0ddc621aa..72d90dc78 100644 --- a/test/syscalls/linux/msync.cc +++ b/test/syscalls/linux/msync.cc @@ -43,14 +43,13 @@ class MsyncParameterizedTest : public ::testing::TestWithParam { protected: int msync_flags() const { return std::get<0>(GetParam()); } - PosixErrorOr GetMapping() const { - auto rv = std::get<1>(GetParam())(); - return rv; - } + PosixErrorOr GetMapping() const { return std::get<1>(GetParam())(); } }; -// All valid msync(2) flag combinations (not including MS_INVALIDATE, which -// gVisor doesn't implement). +// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux +// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with +// semantics that are (currently) equivalent to specifying MS_ASYNC." 
- +// msync(2)) constexpr std::initializer_list kMsyncFlags = {MS_SYNC, MS_ASYNC, 0}; // Returns functions that return mappings that should be successfully @@ -134,6 +133,15 @@ TEST_P(MsyncFullParamTest, UnalignedAddressFails) { SyscallFailsWithErrno(EINVAL)); } +TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) { + auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); + EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE), + SyscallSucceeds()); +} + +// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires +// probing for mlock support. + INSTANTIATE_TEST_CASE_P( All, MsyncFullParamTest, ::testing::Combine(::testing::ValuesIn(kMsyncFlags), -- cgit v1.2.3 From b62591e6a813ec19a1fd74943584c4fead81f670 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 17 Dec 2018 17:34:09 -0800 Subject: Expose internal testing flag Never to used outside of runsc tests! PiperOrigin-RevId: 225919013 Change-Id: Ib3b14aa2a2564b5246fb3f8933d95e01027ed186 --- runsc/boot/config.go | 7 ++++++- runsc/main.go | 3 +++ runsc/test/testutil/BUILD | 5 +---- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index b98e38ae9..e00d44df9 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -218,7 +218,7 @@ type Config struct { // ToFlags returns a slice of flags that correspond to the given Config. func (c *Config) ToFlags() []string { - return []string{ + f := []string{ "--root=" + c.RootDir, "--debug=" + strconv.FormatBool(c.Debug), "--log=" + c.LogFilename, @@ -237,4 +237,9 @@ func (c *Config) ToFlags() []string { "--panic-signal=" + strconv.Itoa(c.PanicSignal), "--trace-signal=" + strconv.Itoa(c.TraceSignal), } + if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { + // Only include if set since it is never to be used by users. + f = append(f, "-TESTONLY-unsafe-nonroot=true") + } + return f } diff --git a/runsc/main.go b/runsc/main.go index 013b250f7..a6ea0e9fa 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -64,6 +64,8 @@ var ( watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it. This takes precendence over -trace-signal.") traceSignal = flag.Int("trace-signal", -1, "register signal handling that logs a traceback of all goroutines. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + + testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) // gitRevision is set during linking. 
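Config.ToFlags appends -TESTONLY-unsafe-nonroot=true only when the option is actually set, so ordinary invocations never serialize it. A minimal sketch of that emit-only-when-set pattern, using a cut-down stand-in for the real Config struct:

package main

import (
	"fmt"
	"strconv"
)

// config is a stand-in with just enough fields to show the pattern.
type config struct {
	Debug                 bool
	TestOnlyUnsafeNonroot bool // never set outside runsc tests
}

// toFlags emits ordinary options unconditionally and the test-only flag only
// when it is enabled, mirroring Config.ToFlags.
func (c *config) toFlags() []string {
	f := []string{"--debug=" + strconv.FormatBool(c.Debug)}
	if c.TestOnlyUnsafeNonroot {
		f = append(f, "-TESTONLY-unsafe-nonroot=true")
	}
	return f
}

func main() {
	fmt.Println((&config{Debug: true}).toFlags())
	fmt.Println((&config{Debug: true, TestOnlyUnsafeNonroot: true}).toFlags())
}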
@@ -146,6 +148,7 @@ func main() { WatchdogAction: wa, PanicSignal: *panicSignal, TraceSignal: *traceSignal, + TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 826b7bf0b..8c3919320 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -11,10 +11,7 @@ go_library( "testutil_race.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", - visibility = [ - "//runsc:__subpackages__", - "//test:__subpackages__", - ], + visibility = ["//:sandbox"], deps = [ "//runsc/boot", "//runsc/specutils", -- cgit v1.2.3 From 86c9bd254749ebf65270aa60f728d9c847ac02d4 Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 19 Dec 2018 13:29:10 -0800 Subject: Automated rollback of changelist 225861605 PiperOrigin-RevId: 226224230 Change-Id: Id24c7d3733722fd41d5fe74ef64e0ce8c68f0b12 --- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/mm.go | 12 - pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/memmap/memmap.go | 37 --- pkg/sentry/mm/BUILD | 1 - pkg/sentry/mm/address_space.go | 12 +- pkg/sentry/mm/lifecycle.go | 24 +- pkg/sentry/mm/mm.go | 24 +- pkg/sentry/mm/syscalls.go | 423 +++++--------------------------- pkg/sentry/mm/vma.go | 38 --- pkg/sentry/syscalls/linux/linux64.go | 15 +- pkg/sentry/syscalls/linux/sys_mmap.go | 106 +++----- pkg/sentry/syscalls/linux/sys_rlimit.go | 1 - runsc/boot/limits.go | 4 +- test/syscalls/linux/BUILD | 15 -- test/syscalls/linux/mlock.cc | 344 -------------------------- test/syscalls/linux/msync.cc | 20 +- 18 files changed, 135 insertions(+), 947 deletions(-) delete mode 100644 test/syscalls/linux/mlock.cc (limited to 'runsc') diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e0aa5b31d..b2e51b9bd 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -60,7 +60,7 @@ const ( DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. - DefaultMemlockLimit = 64 * 1024 + DefaultMemlockLimit = 64 * 1094 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index eda8d9788..3fcdf8235 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -49,18 +49,6 @@ const ( MREMAP_FIXED = 1 << 1 ) -// Flags for mlock2(2). -const ( - MLOCK_ONFAULT = 0x01 -) - -// Flags for mlockall(2). -const ( - MCL_CURRENT = 1 - MCL_FUTURE = 2 - MCL_ONFAULT = 4 -) - // Advice for madvise(2). 
const ( MADV_NORMAL = 0 diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index eeca01876..ba0b7d4fd 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryLocked + MemoryPagesLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 295f9c398..511db6733 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryLocked, + linux.RLIMIT_MEMLOCK: MemoryPagesLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index cf20b11e3..28e2bed9b 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,40 +243,6 @@ type MappingIdentity interface { Msync(ctx context.Context, mr MappableRange) error } -// MLockMode specifies the memory locking behavior of a memory mapping. -type MLockMode int - -// Note that the ordering of MLockModes is significant; see -// mm.MemoryManager.defMLockMode. -const ( - // MLockNone specifies that a mapping has no memory locking behavior. - // - // This must be the zero value for MLockMode. - MLockNone MLockMode = iota - - // MLockEager specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be made, and kept, resident in - // physical memory as soon as possible. - // - // As of this writing, MLockEager does not cause memory-locking to be - // requested from the host; it only affects the sentry's memory management - // behavior. - // - // MLockEager is analogous to Linux's VM_LOCKED. - MLockEager - - // MLockLazy specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be kept resident in physical memory - // once they have been made resident due to e.g. a page fault. - // - // As of this writing, MLockLazy does not cause memory-locking to be - // requested from the host; in fact, it has virtually no effect, except for - // interactions between mlocked pages and other syscalls. - // - // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. - MLockLazy -) - // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -337,9 +303,6 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool - // MLockMode specifies the memory locking behavior of the mapping. - MLockMode MLockMode - // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 5a9185e5d..744e73a39 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,7 +106,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", - "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index e7aa24c69..7488f7c4a 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. 
mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +// ar must be page-aligned. pseg.Range().Contains(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. @@ -173,9 +173,7 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - // Since this checks ar.End and not mapAR.End, we will never map a pma that - // is not required. - for pseg.Ok() && pseg.Start() < ar.End { + for { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -186,9 +184,13 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } + // Since this checks ar.End and not mapAR.End, we will never map a pma + // that is not required. + if ar.End <= pmaAR.End { + return nil + } pseg = pseg.NextSegment() } - return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a42e32b43..1613ce11d 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -59,17 +58,13 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - brk: mm.brk, - usageAS: mm.usageAS, - // "The child does not inherit its parent's memory locks (mlock(2), - // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is - // MLockNone, both of which are zero values. vma.mlockMode is reset - // when copied below. + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + usageAS: mm.usageAS, + brk: mm.brk, captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -82,7 +77,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.Value() // makes a copy of the vma + vma := srcvseg.ValuePtr() vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { @@ -94,8 +89,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - vma.mlockMode = memmap.MLockNone - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index c0632d232..b1e39e898 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,29 +95,17 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet - // brk is the mm's brk, which is manipulated using the brk(2) system call. 
- // The brk is initially set up by the loader which maps an executable - // binary into the mm. - // - // brk is protected by mappingMu. - brk usermem.AddrRange - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. // // usageAS is protected by mappingMu. usageAS uint64 - // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != - // memmap.MLockNone. - // - // lockedAS is protected by mappingMu. - lockedAS uint64 - - // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or - // defMLockMode is greater. + // brk is the mm's brk, which is manipulated using the brk(2) system call. + // The brk is initially set up by the loader which maps an executable + // binary into the mm. // - // defMLockMode is protected by mappingMu. - defMLockMode memmap.MLockMode + // brk is protected by mappingMu. + brk usermem.AddrRange // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. @@ -264,8 +252,6 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` - mlockMode memmap.MLockMode - // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 383703ec3..daaae4da1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,7 +20,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -129,24 +128,16 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() - if opts.MLockMode < mm.defMLockMode { - opts.MLockMode = mm.defMLockMode - } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } - // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new - // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears - // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in - // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => - // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit || opts.MLockMode == memmap.MLockEager: + case opts.Precommit: // Get pmas and map with precommit as requested. - mm.populateVMAAndUnlock(ctx, vseg, ar, true) + mm.populateAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -155,7 +146,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. - mm.populateVMAAndUnlock(ctx, vseg, ar, false) + mm.populateAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -164,29 +155,31 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// populateVMA obtains pmas for addresses in ar in the given vma, and maps them -// into mm.as if it is active. +// Preconditions: mm.mappingMu must be locked for writing. // -// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). 
-func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. + mm.mappingMu.Unlock() return } mm.activeMu.Lock() - // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get new pmas, we can't actually map them if we don't have an + // Even if we get a new pma, we can't actually map it if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() + mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. + mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when @@ -204,45 +197,6 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u mm.activeMu.RUnlock() } -// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally -// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is -// preferable to populateVMA since it unlocks mm.mappingMu before performing -// expensive operations that don't require it to be locked. -// -// Preconditions: mm.mappingMu must be locked for writing. -// vseg.Range().IsSupersetOf(ar). -// -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { - // See populateVMA above for commentary. - if !vseg.ValuePtr().effectivePerms.Any() { - mm.mappingMu.Unlock() - return - } - - mm.activeMu.Lock() - - if mm.as == nil { - mm.activeMu.Unlock() - mm.mappingMu.Unlock() - return - } - - // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it - // isn't needed at all for mapASLocked. - mm.mappingMu.DowngradeLock() - pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() - if err != nil { - mm.activeMu.Unlock() - return - } - - mm.activeMu.DowngradeLock() - mm.mapASLocked(pseg, ar, precommit) - mm.activeMu.RUnlock() -} - // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -282,7 +236,6 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, - MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -381,19 +334,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. - if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, - // mremap in Linux does not check mm/mlock.c:can_do_mlock() and - // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and - // !CAP_IPC_LOCK. 
- mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { - return 0, syserror.EAGAIN - } - } - } - if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -420,7 +360,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -431,13 +371,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, - MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, ar, true) - } return oldAddr, nil } // In-place growth failed. In the MRemapMayMove case, fall through to @@ -526,14 +462,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS += uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS += uint64(newAR.Length()) - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - } return newAR.Start, nil } @@ -555,11 +485,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -574,10 +501,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - return newAR.Start, nil } @@ -688,10 +611,9 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. + defer mm.mappingMu.Unlock() if addr < mm.brk.Start { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -701,24 +623,21 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. 
if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { - mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) - mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -727,221 +646,17 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes - // mm->def_flags. - MLockMode: mm.defMLockMode, - Hint: "[heap]", + Hint: "[heap]", }) if err != nil { - mm.mappingMu.Unlock() return mm.brk.End, err } - if mm.defMLockMode == memmap.MLockEager { - mm.populateVMAAndUnlock(ctx, vseg, ar, true) - } else { - mm.mappingMu.Unlock() - } - - default: - // Nothing to do. - mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } -// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), -// depending on mode. -func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { - // Linux allows this to overflow. - la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() - ar, ok := addr.RoundDown().ToRange(uint64(la)) - if !ok { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - - // Check this after RLIMIT_MEMLOCK for consistency with Linux. - if ar.Length() == 0 { - mm.mappingMu.Unlock() - return nil - } - - // Apply the new mlock mode to vmas. - var unmapped bool - vseg := mm.vmas.FindSegment(ar.Start) - for { - if !vseg.Ok() { - unmapped = true - break - } - vseg = mm.vmas.Isolate(vseg, ar) - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = mode - if mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - if ar.End <= vseg.End() { - break - } - vseg, _ = vseg.NextNonEmpty() - } - mm.vmas.MergeRange(ar) - mm.vmas.MergeAdjacent(ar) - if unmapped { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - - if mode == memmap.MLockEager { - // Ensure that we have usable pmas. Since we didn't return ENOMEM - // above, ar must be fully covered by vmas, so we can just use - // NextSegment below. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if !vseg.ValuePtr().effectivePerms.Any() { - // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this - // case, which is converted to ENOMEM by mlock. 
- mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - return syserror.ENOMEM - } - _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) - if err != nil { - mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - // Linux: mm/mlock.c:__mlock_posix_error_return() - if err == syserror.EFAULT { - return syserror.ENOMEM - } - if err == syserror.ENOMEM { - return syserror.EAGAIN - } - return err - } - } - - // Map pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) - mm.activeMu.RUnlock() - if err != nil { - return err - } - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - - return nil -} - -// MLockAllOpts holds options to MLockAll. -type MLockAllOpts struct { - // If Current is true, change the memory-locking behavior of all mappings - // to Mode. If Future is true, upgrade the memory-locking behavior of all - // future mappings to Mode. At least one of Current or Future must be true. - Current bool - Future bool - Mode memmap.MLockMode -} - -// MLockAll implements the semantics of Linux's mlockall()/munlockall(), -// depending on opts. -func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { - if !opts.Current && !opts.Future { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if opts.Current { - if opts.Mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if uint64(mm.vmas.Span()) > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = opts.Mode - if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - } - } - - if opts.Future { - mm.defMLockMode = opts.Mode - } - - if opts.Current && opts.Mode == memmap.MLockEager { - // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() - // ignores the return value of __mm_populate(), so all errors below are - // ignored. - // - // Try to get usable pmas. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - if vseg.ValuePtr().effectivePerms.Any() { - mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) - } - } - - // Map all pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) - mm.activeMu.RUnlock() - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - return nil -} - // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). 
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -965,49 +680,46 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) + vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - if vma.mlockMode != memmap.MLockNone { - return syserror.EINVAL - } - vsegAR := vseg.Range().Intersect(ar) - // pseg should already correspond to either this vma or a later one, - // since there can't be a pma without a corresponding vma. - if checkInvariants { - if pseg.Ok() && pseg.End() <= vsegAR.Start { - panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) - } - } - for pseg.Ok() && pseg.Start() < vsegAR.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue - } - // If an error occurs, fall through to the general - // invalidation case below. + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(psegAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) + } + if psegAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) } } - pseg = mm.pmas.Isolate(pseg, vsegAR) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). - mm.unmapASLocked(ar) - didUnmapAS = true - } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) + if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - pseg = mm.pmas.Remove(pseg).NextSegment() } + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + + pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -1020,28 +732,9 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// MSyncOpts holds options to MSync. -type MSyncOpts struct { - // Sync has the semantics of MS_SYNC. - Sync bool - - // Invalidate has the semantics of MS_INVALIDATE. - Invalidate bool -} - -// MSync implements the semantics of Linux's msync(). 
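Decommit, shown above, backs madvise(MADV_DONTNEED): private pages in the range are dropped and later reads observe zeroes. A small host-level illustration of that surface in Go, using golang.org/x/sys/unix rather than any sentry API:

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	// Map one anonymous, private page.
	page, err := unix.Mmap(-1, 0, os.Getpagesize(),
		unix.PROT_READ|unix.PROT_WRITE, unix.MAP_ANON|unix.MAP_PRIVATE)
	if err != nil {
		panic(err)
	}
	defer unix.Munmap(page)

	page[0] = 42

	// MADV_DONTNEED drops the private contents; the next read sees zeroes.
	if err := unix.Madvise(page, unix.MADV_DONTNEED); err != nil {
		panic(err)
	}
	fmt.Println(page[0]) // prints 0
}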
-func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { - if addr != addr.RoundDown() { - return syserror.EINVAL - } - if length == 0 { - return nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return syserror.ENOMEM - } - ar, ok := addr.ToRange(uint64(la)) +// Sync implements the semantics of Linux's msync(MS_SYNC). +func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) if !ok { return syserror.ENOMEM } @@ -1066,14 +759,10 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui } lastEnd = vseg.End() vma := vseg.ValuePtr() - if opts.Invalidate && vma.mlockMode != memmap.MLockNone { - mm.mappingMu.RUnlock() - return syserror.EBUSY - } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 28ba9f2f5..5c2c802f6 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,10 +17,8 @@ package mm import ( "fmt" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -55,23 +53,6 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } - if opts.MLockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM - } - newLockedAS := mm.lockedAS + opts.Length - if opts.Unmap { - newLockedAS -= mm.mlockedBytesRangeLocked(ar) - } - if newLockedAS > mlockLimit { - return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN - } - } - } - // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -104,14 +85,10 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, - mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length - if opts.MLockMode != memmap.MLockNone { - mm.lockedAS += opts.Length - } return vseg, ar, nil } @@ -224,17 +201,6 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } -// Preconditions: mm.mappingMu must be locked. 
-func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { - var total uint64 - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if vseg.ValuePtr().mlockMode != memmap.MLockNone { - total += uint64(vseg.Range().Intersect(ar).Length()) - } - } - return total -} - // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). It returns: // @@ -372,9 +338,6 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS -= uint64(vmaAR.Length()) - } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -405,7 +368,6 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || - vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index e855590e6..7a5c93f9b 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: Mlock, - 150: Munlock, - 151: Mlockall, - 152: Munlockall, + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: syscalls.Error(nil), // Mlock, TODO + 150: syscalls.Error(nil), // Munlock, TODO + 151: syscalls.Error(nil), // Mlockall, TODO + 152: syscalls.Error(nil), // Munlockall, TODO 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,9 +373,8 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - 325: Mlock2, - // Syscalls after 325 are "backports" from versions of Linux after 4.4. - // 326: CopyFileRange, + // Syscalls after 325 are backports from 4.6. + 325: syscalls.Error(nil), // Mlock2, TODO 327: Preadv2, 328: Pwritev2, }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8732861e0..145f7846c 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,9 +69,6 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } - if linux.MAP_LOCKED&flags != 0 { - opts.MLockMode = memmap.MLockEager - } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -387,6 +384,16 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... 
However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -399,72 +406,39 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } - err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ - Sync: sync, - Invalidate: flags&linux.MS_INVALIDATE != 0, - }) - // MSync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) -} - -// Mlock implements linux syscall mlock(2). -func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) -} -// Mlock2 implements linux syscall mlock2(2). -func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - flags := args[2].Int() - - if flags&^(linux.MLOCK_ONFAULT) != 0 { + // MS_INVALIDATE "asks to invalidate other mappings of the same file (so + // that they can be updated with the fresh values just written)". This is a + // no-op given that shared memory exists. However, MS_INVALIDATE can also + // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, + // and a memory lock exists for the specified address range." Given that + // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since + // some user program could be using it for synchronization. + if flags&linux.MS_INVALIDATE != 0 { return 0, nil, syserror.EINVAL } - - mode := memmap.MLockEager - if flags&linux.MLOCK_ONFAULT != 0 { - mode = memmap.MLockLazy - } - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) -} - -// Munlock implements linux syscall munlock(2). -func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) -} - -// Mlockall implements linux syscall mlockall(2). -func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - flags := args[0].Int() - - if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + // MS_SYNC "requests an update and waits for it to complete." + if sync { + err := t.MemoryManager().Sync(t, addr, uint64(la)) + // Sync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + // MS_ASYNC "specifies that an update be scheduled, but the call returns + // immediately". As long as dirty pages are tracked and eventually written + // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC + // is in fact a no-op, since the kernel properly tracks dirty pages and + // flushes them to storage as necessary.") + // + // However: "ENOMEM: The indicated memory (or part of it) was not mapped." + // This applies even for MS_ASYNC. 
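The rewritten Msync handler accepts exactly one of MS_SYNC and MS_ASYNC (or neither), and rejects MS_INVALIDATE because mlock is stubbed out. A condensed, hypothetical helper showing just that flag validation with the corresponding golang.org/x/sys/unix constants:

package msyncflags

import "golang.org/x/sys/unix"

// checkMsyncFlags is an illustrative condensation of the validation above.
func checkMsyncFlags(flags int) error {
	// Reject anything other than the three defined msync flags.
	if flags&^(unix.MS_SYNC|unix.MS_ASYNC|unix.MS_INVALIDATE) != 0 {
		return unix.EINVAL
	}
	// MS_SYNC and MS_ASYNC are mutually exclusive.
	if flags&unix.MS_SYNC != 0 && flags&unix.MS_ASYNC != 0 {
		return unix.EINVAL
	}
	// With mlock stubbed out, MS_INVALIDATE cannot provide its EBUSY-based
	// detection of locked pages, so it is refused rather than silently ignored.
	if flags&unix.MS_INVALIDATE != 0 {
		return unix.EINVAL
	}
	return nil
}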
+ ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM } - - mode := memmap.MLockEager - if flags&linux.MCL_ONFAULT != 0 { - mode = memmap.MLockLazy + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM } - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: flags&linux.MCL_CURRENT != 0, - Future: flags&linux.MCL_FUTURE != 0, - Mode: mode, - }) -} - -// Munlockall implements linux syscall munlockall(2). -func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: true, - Future: true, - Mode: memmap.MLockNone, - }) + return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index b0b216045..2f16e1791 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,7 +90,6 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, - limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index e3e716bf9..8ecda6d0e 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_DATA": limits.Data, "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_MEMLOCK": limits.MemoryLocked, + "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, "RLIMIT_NOFILE": limits.NumberOfFiles, @@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) - ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index f13e32daa..c0b8246b5 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1019,21 +1019,6 @@ cc_binary( ], ) -cc_binary( - name = "mlock_test", - testonly = 1, - srcs = ["mlock.cc"], - linkstatic = 1, - deps = [ - "//test/util:capability_util", - "//test/util:cleanup", - "//test/util:memory_util", - "//test/util:multiprocess_util", - "//test/util:test_util", - "@com_google_googletest//:gtest", - ], -) - cc_binary( name = "mmap_test", testonly = 1, diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc deleted file mode 100644 index a0d876c2e..000000000 --- a/test/syscalls/linux/mlock.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include - -#include "test/util/capability_util.h" -#include "test/util/cleanup.h" -#include "test/util/memory_util.h" -#include "test/util/multiprocess_util.h" -#include "test/util/test_util.h" - -using ::testing::_; - -namespace gvisor { -namespace testing { - -namespace { - -PosixErrorOr CanMlock() { - struct rlimit rlim; - if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { - return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)"); - } - if (rlim.rlim_cur != 0) { - return true; - } - return HaveCapability(CAP_IPC_LOCK); -} - -// Returns true if the page containing addr is mlocked. -bool IsPageMlocked(uintptr_t addr) { - // This relies on msync(MS_INVALIDATE) interacting correctly with mlocked - // pages, which is tested for by the MsyncInvalidate case below. - int const rv = msync(reinterpret_cast(addr & ~(kPageSize - 1)), - kPageSize, MS_ASYNC | MS_INVALIDATE); - if (rv == 0) { - return false; - } - // This uses TEST_PCHECK_MSG since it's used in subprocesses. - TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno"); - return true; -} - -PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval) { - struct rlimit old_rlim; - if (getrlimit(resource, &old_rlim) != 0) { - return PosixError(errno, "getrlimit failed"); - } - struct rlimit new_rlim = old_rlim; - new_rlim.rlim_cur = newval; - if (setrlimit(resource, &new_rlim) != 0) { - return PosixError(errno, "setrlimit failed"); - } - return Cleanup([resource, old_rlim] { - TEST_PCHECK(setrlimit(resource, &old_rlim) == 0); - }); -} - -TEST(MlockTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(MlockTest, ProtNone) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = - ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(ENOMEM)); - // ENOMEM is returned because mlock can't populate the page, but it's still - // considered locked. 
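The deleted test's IsPageMlocked helper relies on msync(MS_ASYNC | MS_INVALIDATE) failing with EBUSY on locked pages. The same probe expressed as a Go sketch, assuming the kernel behavior the C++ helper depends on:

package mlockprobe

import "golang.org/x/sys/unix"

// isLocked reports whether the pages backing b appear to be mlocked, using
// the same probe as the deleted helper: msync(MS_ASYNC|MS_INVALIDATE) fails
// with EBUSY on locked pages and succeeds otherwise.
func isLocked(b []byte) (bool, error) {
	err := unix.Msync(b, unix.MS_ASYNC|unix.MS_INVALIDATE)
	if err == nil {
		return false, nil
	}
	if err == unix.EBUSY {
		return true, nil
	}
	return false, err
}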
- EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(MlockTest, MadviseDontneed) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED), - SyscallFailsWithErrno(EINVAL)); -} - -TEST(MlockTest, MsyncInvalidate) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE), - SyscallFailsWithErrno(EBUSY)); - EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE), - SyscallFailsWithErrno(EBUSY)); -} - -TEST(MlockTest, Fork) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - EXPECT_THAT( - InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }), - IsPosixErrorOkAndHolds(0)); -} - -TEST(MlockTest, RlimitMemlockZero) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(EPERM)); -} - -TEST(MlockTest, RlimitMemlockInsufficient) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(ENOMEM)); -} - -TEST(MunlockTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -TEST(MunlockTest, NotLocked) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -// There is currently no test for mlockall(MCL_CURRENT) because the default -// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke -// mlockall(MCL_CURRENT). 
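ScopedSetSoftRlimit in the deleted test lowers only the soft RLIMIT_MEMLOCK and restores it afterwards. A rough Go equivalent of that scoped adjustment (illustrative only, not part of the test suite):

package memlock

import "golang.org/x/sys/unix"

// withSoftMemlockLimit runs fn with the soft RLIMIT_MEMLOCK set to cur bytes
// and restores the original limits before returning (best effort).
func withSoftMemlockLimit(cur uint64, fn func() error) error {
	var old unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &old); err != nil {
		return err
	}
	newLimit := old
	newLimit.Cur = cur
	if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &newLimit); err != nil {
		return err
	}
	defer unix.Setrlimit(unix.RLIMIT_MEMLOCK, &old)
	return fn()
}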
- -TEST(MlockallTest, Future) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - - // Run this test in a separate (single-threaded) subprocess to ensure that a - // background thread doesn't try to mmap a large amount of memory, fail due - // to hitting RLIMIT_MEMLOCK, and explode the process violently. - EXPECT_THAT(InForkedProcess([] { - auto const mapping = - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE) - .ValueOrDie(); - TEST_CHECK(!IsPageMlocked(mapping.addr())); - TEST_PCHECK(mlockall(MCL_FUTURE) == 0); - // Ensure that mlockall(MCL_FUTURE) is turned off before the end - // of the test, as otherwise mmaps may fail unexpectedly. - Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); }); - auto const mapping2 = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - TEST_CHECK(IsPageMlocked(mapping2.addr())); - // Fire munlockall() and check that it disables - // mlockall(MCL_FUTURE). - do_munlockall.Release()(); - auto const mapping3 = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - TEST_CHECK(!IsPageMlocked(mapping2.addr())); - }), - IsPosixErrorOkAndHolds(0)); -} - -TEST(MunlockallTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(munlockall(), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -#ifndef SYS_mlock2 -#ifdef __x86_64__ -#define SYS_mlock2 325 -#endif -#endif - -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h -#endif - -#ifdef SYS_mlock2 - -int mlock2(void const* addr, size_t len, int flags) { - return syscall(SYS_mlock2, addr, len, flags); -} - -TEST(Mlock2Test, NoFlags) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(Mlock2Test, MlockOnfault) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT), - SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(Mlock2Test, UnknownFlags) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0), - SyscallFailsWithErrno(EINVAL)); -} - -#endif // defined(SYS_mlock2) - -TEST(MapLockedTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -TEST(MapLockedTest, RlimitMemlockZero) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - 
ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - EXPECT_THAT( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), - PosixErrorIs(EPERM, _)); -} - -TEST(MapLockedTest, RlimitMemlockInsufficient) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); - EXPECT_THAT( - MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), - PosixErrorIs(EAGAIN, _)); -} - -TEST(MremapLockedTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - if (addr == MAP_FAILED) { - FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")"; - } - mapping.release(); - mapping.reset(addr, 2 * mapping.len()); - EXPECT_TRUE(IsPageMlocked(reinterpret_cast(addr))); -} - -TEST(MremapLockedTest, RlimitMemlockZero) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) - << "addr = " << addr << ", errno = " << errno; -} - -TEST(MremapLockedTest, RlimitMemlockInsufficient) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE( - ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len())); - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) - << "addr = " << addr << ", errno = " << errno; -} - -} // namespace - -} // namespace testing -} // namespace gvisor diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 72d90dc78..0ddc621aa 100644 --- a/test/syscalls/linux/msync.cc +++ b/test/syscalls/linux/msync.cc @@ -43,13 +43,14 @@ class MsyncParameterizedTest : public ::testing::TestWithParam { protected: int msync_flags() const { return std::get<0>(GetParam()); } - PosixErrorOr GetMapping() const { return std::get<1>(GetParam())(); } + PosixErrorOr GetMapping() const { + auto rv = std::get<1>(GetParam())(); + return rv; + } }; -// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux -// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with -// semantics that are (currently) equivalent to specifying MS_ASYNC." - -// msync(2)) +// All valid msync(2) flag combinations (not including MS_INVALIDATE, which +// gVisor doesn't implement). 
constexpr std::initializer_list kMsyncFlags = {MS_SYNC, MS_ASYNC, 0}; // Returns functions that return mappings that should be successfully @@ -133,15 +134,6 @@ TEST_P(MsyncFullParamTest, UnalignedAddressFails) { SyscallFailsWithErrno(EINVAL)); } -TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) { - auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); - EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE), - SyscallSucceeds()); -} - -// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires -// probing for mlock support. - INSTANTIATE_TEST_CASE_P( All, MsyncFullParamTest, ::testing::Combine(::testing::ValuesIn(kMsyncFlags), -- cgit v1.2.3 From 194ef586fcb1bec049ee8777c2e5f70997de7a87 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 20 Dec 2018 13:27:25 -0800 Subject: Rename limits.MemoryPagesLocked to limits.MemoryLocked. "RLIMIT_MEMLOCK: This is the maximum number of bytes of memory that may be locked into RAM." - getrlimit(2) PiperOrigin-RevId: 226384346 Change-Id: Iefac4a1bb69f7714dc813b5b871226a8344dc800 --- pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/linux.go | 2 +- runsc/boot/limits.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index ba0b7d4fd..eeca01876 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryPagesLocked + MemoryLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 511db6733..295f9c398 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryPagesLocked, + linux.RLIMIT_MEMLOCK: MemoryLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 8ecda6d0e..e3e716bf9 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_DATA": limits.Data, "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, + "RLIMIT_MEMLOCK": limits.MemoryLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, "RLIMIT_NOFILE": limits.NumberOfFiles, @@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) - ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) -- cgit v1.2.3 From a891afad6d7e3b09bafdccb4cc4b9fc4e577620e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Dec 2018 13:47:19 -0800 Subject: Simplify synchronization between runsc and sandbox process Make 'runsc create' join cgroup before creating sandbox process. 
This removes the need to synchronize platform creation and ensure that sandbox process is charged to the right cgroup from the start. PiperOrigin-RevId: 227166451 Change-Id: Ieb4b18e6ca0daf7b331dc897699ca419bc5ee3a2 --- runsc/boot/controller.go | 32 ++++-------------- runsc/boot/loader.go | 38 +++++++++------------- runsc/cgroup/cgroup.go | 5 +++ runsc/cmd/boot.go | 23 ++++--------- runsc/sandbox/sandbox.go | 74 +++++++++++++----------------------------- runsc/test/root/cgroup_test.go | 32 ++++++++++++++++++ 6 files changed, 89 insertions(+), 115 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 05d4f3a5b..36e9d2c6b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -81,9 +81,6 @@ const ( // and return its ExitStatus. ContainerWait = "containerManager.Wait" - // ContainerWaitForLoader blocks until the container's loader has been created. - ContainerWaitForLoader = "containerManager.WaitForLoader" - // ContainerWaitPID is used to wait on a process with a certain PID in // the sandbox and return its ExitStatus. ContainerWaitPID = "containerManager.WaitPID" @@ -115,21 +112,22 @@ type controller struct { manager *containerManager } -// newController creates a new controller and starts it listening. -func newController(fd int, k *kernel.Kernel, w *watchdog.Watchdog) (*controller, error) { +// newController creates a new controller. The caller must call +// controller.srv.StartServing() to start the controller. +func newController(fd int, l *Loader) (*controller, error) { srv, err := server.CreateFromFD(fd) if err != nil { return nil, err } manager := &containerManager{ - startChan: make(chan struct{}), - startResultChan: make(chan error), - loaderCreatedChan: make(chan struct{}), + startChan: make(chan struct{}), + startResultChan: make(chan error), + l: l, } srv.Register(manager) - if eps, ok := k.NetworkStack().(*epsocket.Stack); ok { + if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok { net := &Network{ Stack: eps.Stack, } @@ -138,10 +136,6 @@ func newController(fd int, k *kernel.Kernel, w *watchdog.Watchdog) (*controller, srv.Register(&debug{}) - if err := srv.StartServing(); err != nil { - return nil, err - } - return &controller{ srv: srv, manager: manager, @@ -161,11 +155,6 @@ type containerManager struct { // l is the loader that creates containers and sandboxes. l *Loader - - // loaderCreatedChan is used to signal when the loader has been created. - // After a loader is created, a notify method is called that writes to - // this channel. - loaderCreatedChan chan struct{} } // StartRoot will start the root container process. @@ -291,13 +280,6 @@ func (cm *containerManager) Pause(_, _ *struct{}) error { return nil } -// WaitForLoader blocks until the container's loader has been created. -func (cm *containerManager) WaitForLoader(_, _ *struct{}) error { - log.Debugf("containerManager.WaitForLoader") - <-cm.loaderCreatedChan - return nil -} - // RestoreOpts contains options related to restoring a container's file system. type RestoreOpts struct { // FilePayload contains the state file to be restored, followed by the diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a9c549790..3c6892446 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -277,20 +277,6 @@ func New(args Args) (*Loader, error) { // Create a watchdog. watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) - // Create the control server using the provided FD. 
- // - // This must be done *after* we have initialized the kernel since the - // controller is used to configure the kernel's network stack. - // - // This should also be *before* we create the process, since a - // misconfigured process will cause an error, and we want the control - // server up before that so that we don't time out trying to connect to - // it. - ctrl, err := newController(args.ControllerFD, k, watchdog) - if err != nil { - return nil, fmt.Errorf("error creating control server: %v", err) - } - procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create init process for root container: %v", err) @@ -303,7 +289,6 @@ func New(args Args) (*Loader, error) { eid := execID{cid: args.ID} l := &Loader{ k: k, - ctrl: ctrl, conf: args.Conf, console: args.Console, watchdog: watchdog, @@ -348,7 +333,22 @@ func New(args Args) (*Loader, error) { } }) - ctrl.manager.l = l + // Create the control server using the provided FD. + // + // This must be done *after* we have initialized the kernel since the + // controller is used to configure the kernel's network stack. + ctrl, err := newController(args.ControllerFD, l) + if err != nil { + return nil, fmt.Errorf("creating control server: %v", err) + } + l.ctrl = ctrl + + // Only start serving after Loader is set to controller and controller is set + // to Loader, because they are both used in the urpc methods. + if err := ctrl.srv.StartServing(); err != nil { + return nil, fmt.Errorf("starting control server: %v", err) + } + return l, nil } @@ -745,12 +745,6 @@ func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan } -// NotifyLoaderCreated sends a signal to the container manager that this -// loader has been created. -func (l *Loader) NotifyLoaderCreated() { - l.ctrl.manager.loaderCreatedChan <- struct{}{} -} - // WaitExit waits for the root container to exit, and returns its exit status. func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 15071387b..2887f3d7f 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -241,6 +241,11 @@ func (c *Cgroup) Uninstall() error { return nil } +// Join adds the current process to the all controllers. +func (c *Cgroup) Join() error { + return c.Add(0) +} + // Add adds given process to all controllers. func (c *Cgroup) Add(pid int) error { for key := range controllers { diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 192df7d3c..bb3435284 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -159,14 +159,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("setCapsAndCallSelf must never return success") } - // Wait until this process has been moved into cgroups. - startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") - defer startSyncFile.Close() - buf := make([]byte, 1) - if r, err := startSyncFile.Read(buf); err != nil || r != 1 { - Fatalf("Unable to read from the start-sync descriptor: %v", err) - } - // Create the loader. bootArgs := boot.Args{ ID: f.Arg(0), @@ -186,21 +178,20 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("error creating loader: %v", err) } - // Fatalf exits the process and doesn't run defers. 'l' must be destroyed - // explicitly! + // Fatalf exits the process and doesn't run defers. + // 'l' must be destroyed explicitly after this point! - // Notify the parent process the controller has been created. 
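The loader change above wires the Loader and controller to each other before StartServing is called, since the urpc methods use both. The same register-and-wire-before-serving ordering can be illustrated with the standard library's net/rpc, used here purely as a stand-in for the control server (Manager and Nothing are placeholder types):

package main

import (
	"net"
	"net/rpc"
)

// Nothing and Manager are placeholders for the registered control objects.
type Nothing struct{}

type Manager struct {
	wired bool
}

func (m *Manager) Ping(args *Nothing, reply *Nothing) error { return nil }

func main() {
	srv := rpc.NewServer()
	mgr := &Manager{}

	// Register the object and finish wiring all state it depends on before
	// the server starts handling requests.
	if err := srv.Register(mgr); err != nil {
		panic(err)
	}
	mgr.wired = true

	lis, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		panic(err)
	}
	defer lis.Close()

	// Serve only after setup is complete; Accept blocks handling connections.
	srv.Accept(lis)
}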
+ // Notify the parent process the sandbox has booted (and that the controller + // is up). + startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") + buf := make([]byte, 1) if w, err := startSyncFile.Write(buf); err != nil || w != 1 { l.Destroy() Fatalf("Unable to write into the start-sync descriptor: %v", err) } - // startSyncFile is closed here to be sure that starting with this point - // the runsc process will not write anything into it. + // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits. startSyncFile.Close() - // Notify other processes the loader has been created. - l.NotifyLoaderCreated() - // Wait for the start signal from runsc. l.WaitForStartSignal() diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 0798aef9b..195cd4d6f 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -65,7 +65,8 @@ type Sandbox struct { } // Create creates the sandbox process. The caller must call Destroy() on the -// sandbox. +// sandbox. If spec specified a cgroup, the current process will have joined +// the cgroup upon return. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} // The Cleanup object cleans up partially created sandboxes when an error occurs. @@ -78,55 +79,41 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // If there is cgroup config, install it before creating sandbox process. if err := s.Cgroup.Install(spec.Linux.Resources); err != nil { - return nil, fmt.Errorf("error configuring cgroup: %v", err) + return nil, fmt.Errorf("configuring cgroup: %v", err) + } + + // Make this process join the cgroup to ensure the sandbox (and all its + // children processes) are part of the cgroup from the start. Don't bother + // moving out because the caller is about to exit anyways. + if err := cg.Join(); err != nil { + return nil, fmt.Errorf("joining cgroup: %v", err) } } - // Create a socket pair to synchronize runsc and sandbox processes. - // It is used for the following: - // * to notify the sandbox process when it has been moved into cgroups. - // * to wait for the controller socket. - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0) - if err != nil { - return nil, fmt.Errorf("error creating a start-sync socket pair %q: %v", s.ID, err) + // Create pipe to synchronize when sandbox process has been booted. + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) } - startSyncFile := os.NewFile(uintptr(fds[0]), "start-sync socket") - defer startSyncFile.Close() + clientSyncFile := os.NewFile(uintptr(fds[0]), "client sandbox sync") + defer clientSyncFile.Close() - sandboxSyncFile := os.NewFile(uintptr(fds[1]), "sandbox start-sync socket") + sandboxSyncFile := os.NewFile(uintptr(fds[1]), "sandbox sync") // Create the sandbox process. - err = s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, sandboxSyncFile) - // sandboxSyncFile has to be closed to be able to detect - // when the sandbox process exits unexpectedly. + err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, sandboxSyncFile) + // sandboxSyncFile has to be closed to be able to detect when the sandbox + // process exits unexpectedly. 
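Both halves of the new start-sync protocol appear above: the parent creates a pipe, hands the write end to the sandbox process, and blocks on a one-byte read until the child reports that it has booted. A self-contained sketch of that handshake, with a shell command standing in for the sandbox process:

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// Pipe used only to signal readiness: the child writes a single byte.
	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}

	// The child sees the write end as fd 3 (ExtraFiles[0]). A real sandbox
	// would write the byte once its control server is up; a shell command
	// stands in for it here.
	cmd := exec.Command("sh", "-c", "printf x >&3")
	cmd.ExtraFiles = []*os.File{w}
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	// Close the parent's copy of the write end so that a child that dies
	// before signaling produces EOF instead of a hang.
	w.Close()

	buf := make([]byte, 1)
	if n, err := r.Read(buf); err != nil || n != 1 {
		panic(fmt.Sprintf("child never signaled readiness: n=%d err=%v", n, err))
	}
	fmt.Println("child booted")
	cmd.Wait()
}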
sandboxSyncFile.Close() if err != nil { return nil, err } - if s.Cgroup != nil { - if err := s.Cgroup.Add(s.Pid); err != nil { - return nil, fmt.Errorf("error adding sandbox to cgroup: %v", err) - } - } - + // Wait until the sandbox has booted. b := make([]byte, 1) - // Notify the sandbox process it has been moved into cgroups. - if l, err := startSyncFile.Write(b); err != nil || l != 1 { - return nil, fmt.Errorf("error writing into the start-sync descriptor: %v", err) - } - // Wait until the sandbox process has initialized the controller socket. - if l, err := startSyncFile.Read(b); err != nil || l != 1 { + if l, err := clientSyncFile.Read(b); err != nil || l != 1 { return nil, fmt.Errorf("error reading from the start-sync descriptor: %v", err) } - // startSyncFile is closed here to be sure that starting with this point - // the sandbox process will not write anything into it. - startSyncFile.Close() - - // Wait for the control server to come up. - if err := s.waitForCreated(); err != nil { - return nil, err - } c.Release() return s, nil @@ -612,23 +599,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund return nil } -// waitForCreated waits for the sandbox subprocess control server to be -// running and for the loader to have been created, at which point the sandbox -// is in Created state. -func (s *Sandbox) waitForCreated() error { - log.Debugf("Waiting for sandbox %q creation", s.ID) - conn, err := s.sandboxConnect() - if err != nil { - return err - } - defer conn.Close() - - if err := conn.Call(boot.ContainerWaitForLoader, nil, nil); err != nil { - return fmt.Errorf("err waiting on loader on sandbox %q, err: %v", s.ID, err) - } - return nil -} - // Wait waits for the containerized process to exit, and returns its WaitStatus. func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go index fdb94ff64..0eabf9561 100644 --- a/runsc/test/root/cgroup_test.go +++ b/runsc/test/root/cgroup_test.go @@ -18,6 +18,7 @@ import ( "io/ioutil" "os" "path/filepath" + "strconv" "strings" "testing" @@ -123,6 +124,8 @@ func TestCgroup(t *testing.T) { t.Fatalf("Docker.ID() failed: %v", err) } t.Logf("cgroup ID: %s", gid) + + // Check list of attributes defined above. for _, attr := range attrs { path := filepath.Join("/sys/fs/cgroup", attr.ctrl, "docker", gid, attr.file) out, err := ioutil.ReadFile(path) @@ -137,4 +140,33 @@ func TestCgroup(t *testing.T) { t.Errorf("arg: %q, cgroup attribute %s/%s, got: %q, want: %q", attr.arg, attr.ctrl, attr.file, got, attr.want) } } + + // Check that sandbox is inside cgroup. 
+ controllers := []string{ + "blkio", + "cpu", + "cpuset", + "memory", + "net_cls", + "net_prio", + "devices", + "freezer", + "perf_event", + "pids", + "systemd", + } + pid, err := d.SandboxPid() + if err != nil { + t.Fatalf("SandboxPid: %v", err) + } + for _, ctrl := range controllers { + path := filepath.Join("/sys/fs/cgroup", ctrl, "docker", gid, "cgroup.procs") + out, err := ioutil.ReadFile(path) + if err != nil { + t.Fatalf("failed to read %q: %v", path, err) + } + if got := string(out); !strings.Contains(got, strconv.Itoa(pid)) { + t.Errorf("cgroup control %s processes, got: %q, want: %q", ctrl, got, pid) + } + } } -- cgit v1.2.3 From 33191e1cc4010693c434b24baa4d830d082c8ce6 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 2 Jan 2019 15:46:40 -0800 Subject: Automated rollback of changelist 225089593 PiperOrigin-RevId: 227595007 Change-Id: If14cc5aab869c5fd7a4ebd95929c887ab690e94c --- runsc/boot/config.go | 8 -------- runsc/boot/loader.go | 6 +----- runsc/main.go | 4 +--- 3 files changed, 2 insertions(+), 16 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index e00d44df9..400203c99 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -200,15 +200,8 @@ type Config struct { // PanicSignal registers signal handling that panics. Usually set to // SIGUSR2(12) to troubleshoot hangs. -1 disables it. - // - // PanicSignal takes precedence over TraceSignal. PanicSignal int - // TraceSignal registers signal handling that logs a traceback of all - // goroutines. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 - // disables it. - TraceSignal int - // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be @@ -235,7 +228,6 @@ func (c *Config) ToFlags() []string { "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), "--watchdog-action=" + c.WatchdogAction.String(), "--panic-signal=" + strconv.Itoa(c.PanicSignal), - "--trace-signal=" + strconv.Itoa(c.TraceSignal), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 3c6892446..71a2ab962 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -309,14 +309,10 @@ func New(args Args) (*Loader, error) { // Handle signals by forwarding them to the root container process // (except for panic signal, which should cause a panic). l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) { - // Tracing signals should cause their respective actions. + // Panic signal should cause a panic. if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) { panic("Signal-induced panic") } - if args.Conf.TraceSignal != -1 && sig == linux.Signal(args.Conf.TraceSignal) { - log.TracebackAll("Signal-induced traceback") - return - } // Otherwise forward to root container. deliveryMode := DeliverToProcess diff --git a/runsc/main.go b/runsc/main.go index a6ea0e9fa..e036abc44 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -62,8 +62,7 @@ var ( fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. 
All modifications are stored in memory inside the sandbox.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it. This takes precendence over -trace-signal.") - traceSignal = flag.Int("trace-signal", -1, "register signal handling that logs a traceback of all goroutines. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) @@ -147,7 +146,6 @@ func main() { StraceLogSize: *straceLogSize, WatchdogAction: wa, PanicSignal: *panicSignal, - TraceSignal: *traceSignal, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } if len(*straceSyscalls) != 0 { -- cgit v1.2.3 From d033a76fa6e215cb302e5383dbd7b0120de4395d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 3 Jan 2019 14:09:47 -0800 Subject: Apply chroot for --network=host too PiperOrigin-RevId: 227747566 Change-Id: Ide9df4ac1391adcd1c56e08d6570e0d149d85bc4 --- runsc/sandbox/sandbox.go | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 195cd4d6f..d84995d04 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -529,25 +529,27 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } else { return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") } + } - // If we have CAP_SYS_ADMIN, we can create an empty chroot and - // bind-mount the executable inside it. - if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") - } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { - log.Infof("Sandbox will be started in minimal chroot") - chroot, err := setUpChroot() - if err != nil { - return fmt.Errorf("error setting up chroot: %v", err) - } - s.Chroot = chroot // Remember path so it can cleaned up. - cmd.SysProcAttr.Chroot = chroot - cmd.Dir = "/" - cmd.Args[0] = "/runsc" - cmd.Path = "/runsc" - } else { - return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN and CAP_SYS_CHROOT") + // If we have CAP_SYS_ADMIN, we can create an empty chroot and + // bind-mount the executable inside it. + if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") + + } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { + log.Infof("Sandbox will be started in minimal chroot") + chroot, err := setUpChroot() + if err != nil { + return fmt.Errorf("error setting up chroot: %v", err) } + s.Chroot = chroot // Remember path so it can cleaned up. 
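With the restructuring above, the minimal chroot applies regardless of network mode: an empty directory gets the sandbox binary bind-mounted in, and the child process is then started with a chroot applied, as the SysProcAttr wiring that follows shows. A stripped-down sketch of launching a child inside a chroot from Go; it assumes the chroot directory is already populated and that the caller holds CAP_SYS_CHROOT:

package chrootstart

import (
	"os/exec"
	"syscall"
)

// startInChroot launches binPath (a path valid inside chrootDir) with
// chrootDir as the child's root. chrootDir is assumed to already contain the
// binary, for example bind-mounted in, and the caller is assumed to hold
// CAP_SYS_CHROOT.
func startInChroot(chrootDir, binPath string) (*exec.Cmd, error) {
	cmd := exec.Command(binPath) // resolved inside the chroot at exec time
	cmd.Dir = "/"
	cmd.SysProcAttr = &syscall.SysProcAttr{Chroot: chrootDir}
	if err := cmd.Start(); err != nil {
		return nil, err
	}
	return cmd, nil
}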
+ cmd.SysProcAttr.Chroot = chroot + cmd.Dir = "/" + cmd.Args[0] = "/runsc" + cmd.Path = "/runsc" + + } else { + return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN and CAP_SYS_CHROOT") } if s.Cgroup != nil { -- cgit v1.2.3 From 5ce542ecc749cb9a1e8d216c7181aeaebfbc3110 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 7 Jan 2019 23:01:10 -0800 Subject: Undo changes in case of failure to create file/dir/symlink File/dir/symlink creation is multi-step and may leave state behind in case of failure in one of the steps. Added best effort attempt to clean up. PiperOrigin-RevId: 228286612 Change-Id: Ib03c27cd3d3e4f44d0352edc6ee212a53412d7f1 --- runsc/fsgofer/BUILD | 1 + runsc/fsgofer/fsgofer.go | 36 +++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index ab12388ab..756c20ad7 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -18,6 +18,7 @@ go_library( "//pkg/log", "//pkg/p9", "//pkg/syserr", + "//runsc/specutils", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b5746447f..9955d0750 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -35,6 +35,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) const ( @@ -379,19 +380,20 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } - if err := fchown(fd, uid, gid); err != nil { + cu := specutils.MakeCleanup(func() { syscall.Close(fd) - if e := syscall.Unlinkat(l.controlFD(), name); e != nil { - log.Warningf("error unlinking file %q after failed chown: %v", name, e) + // Best effort attempt to remove the file in case of failure. + if err := syscall.Unlinkat(l.controlFD(), name); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) } + }) + defer cu.Clean() + + if err := fchown(fd, uid, gid); err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } stat, err := stat(fd) if err != nil { - syscall.Close(fd) - if e := syscall.Unlinkat(l.controlFD(), name); e != nil { - log.Warningf("error unlinking file %q after failed stat: %v", name, e) - } return nil, nil, p9.QID{}, 0, extractErrno(err) } @@ -404,6 +406,8 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid openedFile: f, mode: mode, } + + cu.Release() return newFDMaybe(c.openedFile), c, l.attachPoint.makeQID(stat), 0, nil } @@ -420,6 +424,13 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } + cu := specutils.MakeCleanup(func() { + // Best effort attempt to remove the dir in case of failure. + if err := unix.Unlinkat(l.controlFD(), name, unix.AT_REMOVEDIR); err != nil { + log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err) + } + }) + defer cu.Clean() // Open directory to change ownership and stat it. 
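Each fsgofer creation path above now registers a best-effort undo with specutils.MakeCleanup, defers Clean, and calls Release only after every step succeeds. A standalone sketch of that pattern with a simplified Cleanup type (the real helper lives in runsc/specutils, and createFile is a hypothetical caller):

package cleanup

import "os"

// Cleanup is a minimal version of the helper used above: Clean runs the
// registered function unless Release was called first.
type Cleanup struct {
	f func()
}

func MakeCleanup(f func()) Cleanup { return Cleanup{f: f} }

func (c *Cleanup) Clean() {
	if c.f != nil {
		c.f()
		c.f = nil
	}
}

func (c *Cleanup) Release() { c.f = nil }

// createFile shows the shape of the fsgofer paths: undo partial work on any
// failure, and release the cleanup only once every step has succeeded.
func createFile(path string) (*os.File, error) {
	f, err := os.Create(path)
	if err != nil {
		return nil, err
	}
	cu := MakeCleanup(func() {
		f.Close()
		os.Remove(path) // best-effort removal of the partially created file
	})
	defer cu.Clean()

	if err := f.Chmod(0644); err != nil { // stands in for the fchown/stat steps
		return nil, err
	}

	cu.Release()
	return f, nil
}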
flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags @@ -436,6 +447,8 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) if err != nil { return p9.QID{}, extractErrno(err) } + + cu.Release() return l.attachPoint.makeQID(stat), nil } @@ -759,6 +772,13 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { return p9.QID{}, extractErrno(err) } + cu := specutils.MakeCleanup(func() { + // Best effort attempt to remove the symlink in case of failure. + if err := syscall.Unlinkat(l.controlFD(), newName); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) + } + }) + defer cu.Clean() // Open symlink to change ownership and stat it. fd, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0) @@ -774,6 +794,8 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. if err != nil { return p9.QID{}, extractErrno(err) } + + cu.Release() return l.attachPoint.makeQID(stat), nil } -- cgit v1.2.3 From 0d7023d581612e1670ef36490a827e46968d6d08 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 9 Jan 2019 09:17:04 -0800 Subject: Restore to original cgroup after sandbox and gofer processes are created The original code assumed that it was safe to join and not restore cgroup, but Container.Run will not exit after calling start, making cgroup cleanup fail because there were still processes inside the cgroup. PiperOrigin-RevId: 228529199 Change-Id: I12a48d9adab4bbb02f20d71ec99598c336cbfe51 --- runsc/cgroup/cgroup.go | 65 +++++++++++++++++++++++++++++++++++--------- runsc/container/BUILD | 1 + runsc/container/container.go | 62 +++++++++++++++++++++++++++++------------- runsc/sandbox/sandbox.go | 37 ++++--------------------- runsc/specutils/specutils.go | 8 +++--- 5 files changed, 106 insertions(+), 67 deletions(-) (limited to 'runsc') diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 2887f3d7f..65a0b6d7a 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -17,6 +17,7 @@ package cgroup import ( + "bufio" "context" "fmt" "io/ioutil" @@ -168,12 +169,12 @@ type Cgroup struct { } // New creates a new Cgroup instance if the spec includes a cgroup path. -// Otherwise it returns nil and false. -func New(spec *specs.Spec) (*Cgroup, bool) { +// Returns nil otherwise. +func New(spec *specs.Spec) *Cgroup { if spec.Linux == nil || spec.Linux.CgroupsPath == "" { - return nil, false + return nil } - return &Cgroup{Name: spec.Linux.CgroupsPath}, true + return &Cgroup{Name: spec.Linux.CgroupsPath} } // Install creates and configures cgroups according to 'res'. If cgroup path @@ -241,19 +242,57 @@ func (c *Cgroup) Uninstall() error { return nil } -// Join adds the current process to the all controllers. -func (c *Cgroup) Join() error { - return c.Add(0) -} +// Join adds the current process to the all controllers. Returns function that +// restores cgroup to the original state. +func (c *Cgroup) Join() (func(), error) { + // First save the current state so it can be restored. 
+ undo := func() {} + f, err := os.Open("/proc/self/cgroup") + if err != nil { + return undo, err + } + defer f.Close() + + var undoPaths []string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + // Format: ID:controller1,controller2:path + // Example: 2:cpu,cpuacct:/user.slice + tokens := strings.Split(scanner.Text(), ":") + if len(tokens) != 3 { + return undo, fmt.Errorf("formatting cgroups file, line: %q", scanner.Text()) + } + for _, ctrlr := range strings.Split(tokens[1], ",") { + // Skip controllers we don't handle. + if _, ok := controllers[ctrlr]; ok { + undoPaths = append(undoPaths, filepath.Join(cgroupRoot, ctrlr, tokens[2])) + break + } + } + } + if err := scanner.Err(); err != nil { + return undo, err + } -// Add adds given process to all controllers. -func (c *Cgroup) Add(pid int) error { + // Replace empty undo with the real thing before changes are made to cgroups. + undo = func() { + for _, path := range undoPaths { + log.Debugf("Restoring cgroup %q", path) + if err := setValue(path, "cgroup.procs", "0"); err != nil { + log.Warningf("Error restoring cgroup %q: %v", path, err) + } + } + } + + // Now join the cgroups. for key := range controllers { - if err := setValue(c.makePath(key), "cgroup.procs", strconv.Itoa(pid)); err != nil { - return err + path := c.makePath(key) + log.Debugf("Joining cgroup %q", path) + if err := setValue(path, "cgroup.procs", "0"); err != nil { + return undo, err } } - return nil + return undo, nil } // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 28ec81d3f..d9534cbcc 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/log", "//pkg/sentry/control", "//runsc/boot", + "//runsc/cgroup", "//runsc/sandbox", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/container/container.go b/runsc/container/container.go index 07924d23a..dc9ef86fa 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -36,6 +36,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cgroup" "gvisor.googlesource.com/gvisor/runsc/sandbox" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -286,18 +287,26 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, fmt.Errorf("writing clean spec: %v", err) } - ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) - if err != nil { - return nil, err + // Create and join cgroup before processes are created to ensure they are + // part of the cgroup from the start (and all tneir children processes). + cg := cgroup.New(spec) + if cg != nil { + // If there is cgroup config, install it before creating sandbox process. + if err := cg.Install(spec.Linux.Resources); err != nil { + return nil, fmt.Errorf("configuring cgroup: %v", err) + } } + if err := runInCgroup(cg, func() error { + ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) + if err != nil { + return err + } - // Start a new sandbox for this container. Any errors after this point - // must destroy the container. - c.Sandbox, err = sandbox.Create(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles) - if err != nil { - return nil, err - } - if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + // Start a new sandbox for this container. 
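A worked example of the parsing done above, under the assumption that the package's cgroupRoot constant is the conventional /sys/fs/cgroup (its value is not visible in this excerpt):

func restorePathFor(line string) string {
    // line is one /proc/self/cgroup record, e.g. "3:cpu,cpuacct:/user.slice".
    tokens := strings.Split(line, ":")       // ["3", "cpu,cpuacct", "/user.slice"]
    ctrl := strings.Split(tokens[1], ",")[0] // "cpu"; the loop above stops at the first known controller
    return filepath.Join("/sys/fs/cgroup", ctrl, tokens[2]) // "/sys/fs/cgroup/cpu/user.slice"
}
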
Any errors after this point + // must destroy the container. + c.Sandbox, err = sandbox.New(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles, cg) + return err + }); err != nil { return nil, err } } else { @@ -381,15 +390,16 @@ func (c *Container) Start(conf *boot.Config) error { return fmt.Errorf("writing clean spec: %v", err) } - // Create the gofer process. - ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) - if err != nil { - return err - } - if err := c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles); err != nil { - return err - } - if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + // Join cgroup to strt gofer process to ensure it's part of the cgroup from + // the start (and all tneir children processes). + if err := runInCgroup(c.Sandbox.Cgroup, func() error { + // Create the gofer process. + ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + if err != nil { + return err + } + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) + }); err != nil { return err } } @@ -883,3 +893,17 @@ func lockContainerMetadata(containerRootDir string) (func() error, error) { } return l.Unlock, nil } + +// runInCgroup executes fn inside the specified cgroup. If cg is nil, execute +// it in the current context. +func runInCgroup(cg *cgroup.Cgroup, fn func() error) error { + if cg == nil { + return fn() + } + restore, err := cg.Join() + defer restore() + if err != nil { + return err + } + return fn() +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index d84995d04..9e95a11b4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -64,32 +64,15 @@ type Sandbox struct { Cgroup *cgroup.Cgroup `json:"cgroup"` } -// Create creates the sandbox process. The caller must call Destroy() on the -// sandbox. If spec specified a cgroup, the current process will have joined -// the cgroup upon return. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File) (*Sandbox, error) { - s := &Sandbox{ID: id} - // The Cleanup object cleans up partially created sandboxes when an error occurs. - // Any errors occurring during cleanup itself are ignored. +// New creates the sandbox process. The caller must call Destroy() on the +// sandbox. +func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, cg *cgroup.Cgroup) (*Sandbox, error) { + s := &Sandbox{ID: id, Cgroup: cg} + // The Cleanup object cleans up partially created sandboxes when an error + // occurs. Any errors occurring during cleanup itself are ignored. c := specutils.MakeCleanup(func() { _ = s.destroy() }) defer c.Clean() - if cg, ok := cgroup.New(spec); ok { - s.Cgroup = cg - - // If there is cgroup config, install it before creating sandbox process. - if err := s.Cgroup.Install(spec.Linux.Resources); err != nil { - return nil, fmt.Errorf("configuring cgroup: %v", err) - } - - // Make this process join the cgroup to ensure the sandbox (and all its - // children processes) are part of the cgroup from the start. Don't bother - // moving out because the caller is about to exit anyways. - if err := cg.Join(); err != nil { - return nil, fmt.Errorf("joining cgroup: %v", err) - } - } - // Create pipe to synchronize when sandbox process has been booted. 
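The comment just below refers to the boot-synchronization pipe. Its full use is outside this excerpt, so the following is only a hedged sketch of the general handshake, not the runsc implementation: the parent keeps the read end, hands the write end to the child, and unblocks when the child closes it after booting.

// startAndWaitForBoot is illustrative only: it starts cmd with the write end
// of a pipe as an inherited FD and blocks until the child closes that end.
func startAndWaitForBoot(cmd *exec.Cmd) error {
    fds := make([]int, 2)
    if err := syscall.Pipe(fds); err != nil {
        return err
    }
    cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fds[1]), "boot-sync"))
    if err := cmd.Start(); err != nil {
        return err
    }
    syscall.Close(fds[1]) // Keep only the child's copy of the write end.
    defer syscall.Close(fds[0])
    buf := make([]byte, 1)
    _, err := syscall.Read(fds[0], buf) // Returns 0 bytes once the child closes its end.
    return err
}
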
fds := make([]int, 2) if err := syscall.Pipe(fds); err != nil { @@ -871,14 +854,6 @@ func (s *Sandbox) waitForStopped() error { return backoff.Retry(op, b) } -// AddGoferToCgroup adds the gofer process to the sandbox's cgroup. -func (s *Sandbox) AddGoferToCgroup(pid int) error { - if s.Cgroup != nil { - return s.Cgroup.Add(pid) - } - return nil -} - // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 055076475..7b0dcf231 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -470,8 +470,7 @@ func ContainsStr(strs []string, str string) bool { // c.Release() // on success, aborts closing the file and return it. // return f type Cleanup struct { - clean func() - released bool + clean func() } // MakeCleanup creates a new Cleanup object. @@ -481,13 +480,14 @@ func MakeCleanup(f func()) Cleanup { // Clean calls the cleanup function. func (c *Cleanup) Clean() { - if !c.released { + if c.clean != nil { c.clean() + c.clean = nil } } // Release releases the cleanup from its duties, i.e. cleanup function is not // called after this point. func (c *Cleanup) Release() { - c.released = true + c.clean = nil } -- cgit v1.2.3 From f8c8f241540fa79b47090ce4808c2c0cfbe44a12 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 11 Jan 2019 10:31:21 -0800 Subject: runsc: Collect zombies of sandbox and gofer processes And we need to wait a gofer process before cgroup.Uninstall, because it is running in the sandbox cgroups. PiperOrigin-RevId: 228904020 Change-Id: Iaf8826d5b9626db32d4057a1c505a8d7daaeb8f9 --- runsc/container/container.go | 41 +++++++++++++++++++++++++++++---- runsc/container/container_test.go | 34 ++++++++++++++------------- runsc/container/multi_container_test.go | 16 ++++++------- runsc/sandbox/sandbox.go | 30 ++++++++++++++++++------ 4 files changed, 85 insertions(+), 36 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index dc9ef86fa..544e7a250 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -119,6 +119,12 @@ type Container struct { // be 0 if the gofer has been killed. GoferPid int `json:"goferPid"` + // goferIsChild is set if a gofer process is a child of the current process. + // + // This field isn't saved to json, because only a creator of a gofer + // process will have it as a child process. + goferIsChild bool + // Sandbox is the sandbox this container is running in. It's set when the // container is created and reset when the sandbox is destroyed. Sandbox *sandbox.Sandbox `json:"sandbox"` @@ -708,11 +714,14 @@ func (c *Container) save() error { // root containers), and waits for the container or sandbox and the gofer // to stop. If any of them doesn't stop before timeout, an error is returned. func (c *Container) stop() error { + var cgroup *cgroup.Cgroup + if c.Sandbox != nil { log.Debugf("Destroying container %q", c.ID) if err := c.Sandbox.DestroyContainer(c.ID); err != nil { return fmt.Errorf("error destroying container %q: %v", c.ID, err) } + cgroup = c.Sandbox.Cgroup // Only set sandbox to nil after it has been told to destroy the container. 
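The distinction this commit introduces, sketched on its own: a gofer or sandbox that the current process forked must be reaped with a non-blocking wait so its zombie is collected, while a PID inherited from saved state can only be probed with signal 0.

func stillRunning(pid int, isChild bool) (bool, error) {
    if isChild {
        wpid, err := syscall.Wait4(pid, nil, syscall.WNOHANG, nil)
        if err != nil {
            return false, err
        }
        return wpid == 0, nil // 0 means no exit status was available yet.
    }
    // Not our child: kill(pid, 0) succeeds for as long as the process exists.
    return syscall.Kill(pid, 0) == nil, nil
}
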
c.Sandbox = nil } @@ -725,7 +734,18 @@ func (c *Container) stop() error { log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err) } } - return c.waitForStopped() + + if err := c.waitForStopped(); err != nil { + return err + } + + // Gofer is running in cgroups, so Cgroup.Uninstall has to be called after it. + if cgroup != nil { + if err := cgroup.Uninstall(); err != nil { + return err + } + } + return nil } func (c *Container) waitForStopped() error { @@ -738,12 +758,24 @@ func (c *Container) waitForStopped() error { return fmt.Errorf("container is still running") } } - if c.GoferPid != 0 { - if err := syscall.Kill(c.GoferPid, 0); err == nil { + if c.GoferPid == 0 { + return nil + } + if c.goferIsChild { + // The gofer process is a child of the current process, + // so we can wait it and collect its zombie. + wpid, err := syscall.Wait4(int(c.GoferPid), nil, syscall.WNOHANG, nil) + if err != nil { + return fmt.Errorf("error waiting the gofer process: %v", err) + } + if wpid == 0 { return fmt.Errorf("gofer is still running") } - c.GoferPid = 0 + + } else if err := syscall.Kill(c.GoferPid, 0); err == nil { + return fmt.Errorf("gofer is still running") } + c.GoferPid = 0 return nil } return backoff.Retry(op, b) @@ -816,6 +848,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund } log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid + c.goferIsChild = true return sandEnds, nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 45a36e583..affb51fab 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -39,9 +39,6 @@ import ( "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) -// childReaper reaps child processes. -var childReaper *testutil.Reaper - // waitForProcessList waits for the given process list to show up in the container. func waitForProcessList(cont *Container, want []*control.Process) error { cb := func() error { @@ -75,6 +72,18 @@ func waitForProcessCount(cont *Container, want int) error { return testutil.Poll(cb, 30*time.Second) } +func blockUntilWaitable(pid int) error { + _, _, err := testutil.RetryEintr(func() (uintptr, uintptr, error) { + var err error + _, _, err1 := syscall.Syscall6(syscall.SYS_WAITID, 1, uintptr(pid), 0, syscall.WEXITED|syscall.WNOWAIT, 0, 0) + if err1 != 0 { + err = err1 + } + return 0, 0, err + }) + return err +} + // procListsEqual is used to check whether 2 Process lists are equal for all // implemented fields. func procListsEqual(got, want []*control.Process) bool { @@ -256,6 +265,11 @@ func configs(opts ...configOption) []*boot.Config { // It verifies after each step that the container can be loaded from disk, and // has the correct status. func TestLifecycle(t *testing.T) { + // Start the child reaper. + childReaper := &testutil.Reaper{} + childReaper.Start() + defer childReaper.Stop() + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) // The container will just sleep for a long time. 
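blockUntilWaitable above issues a raw waitid(2) with WEXITED|WNOWAIT, which blocks until the process is waitable but deliberately leaves the exit status unconsumed for whoever reaps it later. The same call written out directly, with the EINTR retry made explicit (P_PID is the literal 1 passed above):

func waitUntilWaitable(pid int) error {
    for {
        _, _, errno := syscall.Syscall6(syscall.SYS_WAITID,
            1 /* P_PID */, uintptr(pid), 0,
            syscall.WEXITED|syscall.WNOWAIT, 0, 0)
        if errno == 0 {
            return nil
        }
        if errno != syscall.EINTR {
            return errno
        }
    }
}
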
We will kill it before @@ -1505,10 +1519,7 @@ func TestGoferExits(t *testing.T) { t.Fatalf("error killing sandbox process: %v", err) } - _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { - cpid, err := syscall.Wait4(c.GoferPid, nil, 0, nil) - return uintptr(cpid), 0, err - }) + err = blockUntilWaitable(c.GoferPid) if err != nil && err != syscall.ECHILD { t.Errorf("error waiting for gofer to exit: %v", err) } @@ -1576,10 +1587,6 @@ func TestUserLog(t *testing.T) { } func TestWaitOnExitedSandbox(t *testing.T) { - // Disable the childReaper for this test. - childReaper.Stop() - defer childReaper.Start() - for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) @@ -1712,10 +1719,5 @@ func TestMain(m *testing.M) { } testutil.RunAsRoot() - // Start the child reaper. - childReaper = &testutil.Reaper{} - childReaper.Start() - defer childReaper.Stop() - os.Exit(m.Run()) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index e431f5aec..6b3c41a9b 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -359,7 +359,7 @@ func TestMultiContainerSignal(t *testing.T) { cpid, err := syscall.Wait4(goferPid, nil, 0, nil) return uintptr(cpid), 0, err }) - if err != nil && err != syscall.ECHILD { + if err != syscall.ECHILD { t.Errorf("error waiting for gofer to exit: %v", err) } // Make sure process 1 is still running. @@ -379,18 +379,12 @@ func TestMultiContainerSignal(t *testing.T) { } // Ensure that container's gofer and sandbox process are no more. - _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { - cpid, err := syscall.Wait4(containers[0].GoferPid, nil, 0, nil) - return uintptr(cpid), 0, err - }) + err = blockUntilWaitable(containers[0].GoferPid) if err != nil && err != syscall.ECHILD { t.Errorf("error waiting for gofer to exit: %v", err) } - _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { - cpid, err := syscall.Wait4(containers[0].Sandbox.Pid, nil, 0, nil) - return uintptr(cpid), 0, err - }) + err = blockUntilWaitable(containers[0].Sandbox.Pid) if err != nil && err != syscall.ECHILD { t.Errorf("error waiting for sandbox to exit: %v", err) } @@ -399,6 +393,10 @@ func TestMultiContainerSignal(t *testing.T) { if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil { t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) } + + if err := containers[0].Destroy(); err != nil { + t.Errorf("failed to destroy container: %v", err) + } } } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 9e95a11b4..fe55ddab8 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -62,6 +62,12 @@ type Sandbox struct { // Cgroup has the cgroup configuration for the sandbox. Cgroup *cgroup.Cgroup `json:"cgroup"` + + // child is set if a sandbox process is a child of the current process. + // + // This field isn't saved to json, because only a creator of sandbox + // will have it as a child process. + child bool } // New creates the sandbox process. The caller must call Destroy() on the @@ -70,7 +76,10 @@ func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke s := &Sandbox{ID: id, Cgroup: cg} // The Cleanup object cleans up partially created sandboxes when an error // occurs. Any errors occurring during cleanup itself are ignored. 
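The tests above lean on testutil.Reaper, whose implementation is not part of this patch; a rough stand-in for what such a reaper does (an assumption, not the real type) is to collect any exited child on SIGCHLD so zombies do not accumulate while tests fork gofers and sandboxes:

func reapChildren(stop <-chan struct{}) {
    sigs := make(chan os.Signal, 1)
    signal.Notify(sigs, syscall.SIGCHLD)
    defer signal.Stop(sigs)
    for {
        select {
        case <-sigs:
            for {
                pid, err := syscall.Wait4(-1, nil, syscall.WNOHANG, nil)
                if err != nil || pid <= 0 {
                    break // No more exited children right now.
                }
            }
        case <-stop:
            return
        }
    }
}
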
- c := specutils.MakeCleanup(func() { _ = s.destroy() }) + c := specutils.MakeCleanup(func() { + err := s.destroy() + log.Warningf("error destroying sandbox: %v", err) + }) defer c.Clean() // Create pipe to synchronize when sandbox process has been booted. @@ -578,6 +587,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund if err := specutils.StartInNS(cmd, nss); err != nil { return err } + s.child = true s.Pid = cmd.Process.Pid log.Infof("Sandbox started, PID: %d", s.Pid) @@ -666,11 +676,6 @@ func (s *Sandbox) destroy() error { } } - if s.Cgroup != nil { - if err := s.Cgroup.Uninstall(); err != nil { - return err - } - } if s.Chroot != "" { if err := tearDownChroot(s.Chroot); err != nil { return err @@ -846,7 +851,18 @@ func (s *Sandbox) waitForStopped() error { defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if s.IsRunning() { + if s.child && s.Pid != 0 { + // The sandbox process is a child of the current process, + // so we can wait it and collect its zombie. + wpid, err := syscall.Wait4(int(s.Pid), nil, syscall.WNOHANG, nil) + if err != nil { + return fmt.Errorf("error waiting the sandbox process: %v", err) + } + if wpid == 0 { + return fmt.Errorf("sandbox is still running") + } + s.Pid = 0 + } else if s.IsRunning() { return fmt.Errorf("sandbox is still running") } return nil -- cgit v1.2.3 From a46b6d453d198b96949342a81750114bfa5a5429 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 14 Jan 2019 14:07:05 -0800 Subject: runsc: set up a minimal chroot from the sandbox process In this case, new mounts are not created in the host mount namspaces, so tearDownChroot isn't needed, because chroot will be destroyed with a sandbox mount namespace. In additional, pivot_root can't be called instead of chroot. PiperOrigin-RevId: 229250871 Change-Id: I765bdb587d0b8287a6a8efda8747639d37c7e7b6 --- runsc/cmd/BUILD | 1 + runsc/cmd/boot.go | 31 +++++++++++++- runsc/cmd/chroot.go | 95 +++++++++++++++++++++++++++++++++++++++++ runsc/cmd/cmd.go | 26 ++++++++++- runsc/sandbox/BUILD | 1 - runsc/sandbox/chroot.go | 97 ------------------------------------------ runsc/sandbox/sandbox.go | 86 +++++++++++++++++++------------------ runsc/test/root/chroot_test.go | 16 +++---- 8 files changed, 201 insertions(+), 152 deletions(-) create mode 100644 runsc/cmd/chroot.go delete mode 100644 runsc/sandbox/chroot.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 394bb0e1f..a908172af 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -8,6 +8,7 @@ go_library( "boot.go", "capability.go", "checkpoint.go", + "chroot.go", "cmd.go", "create.go", "debug.go", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index bb3435284..7ca2744bd 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -60,6 +60,9 @@ type Boot struct { // to the process. applyCaps bool + // setUpChroot is set to true if the sandbox is started in an empty root. + setUpRoot bool + // cpuNum number of CPUs to create inside the sandbox. 
cpuNum int @@ -99,6 +102,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") + f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") @@ -116,6 +120,31 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Ensure that if there is a panic, all goroutine stacks are printed. debug.SetTraceback("all") + if b.setUpRoot { + if err := setUpChroot(); err != nil { + Fatalf("error setting up chroot: %v", err) + } + + specutils.ExePath = "/runsc" + if !b.applyCaps { + // Remove --setup-root arg to call myself. + var args []string + for _, arg := range os.Args { + if !strings.Contains(arg, "setup-root") { + args = append(args, arg) + } + } + // Note that we've already read the spec from the spec FD, and + // we will read it again after the exec call. This works + // because the ReadSpecFromFile function seeks to the beginning + // of the file before reading. + if err := callSelfAsNobody(args); err != nil { + Fatalf("%v", err) + } + panic("callSelfAsNobody must never return success") + } + } + // Get the spec from the specFD. specFile := os.NewFile(uintptr(b.specFD), "spec file") defer specFile.Close() @@ -144,7 +173,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Remove --apply-caps arg to call myself. var args []string for _, arg := range os.Args { - if !strings.Contains(arg, "apply-caps") { + if !strings.Contains(arg, "setup-root") && !strings.Contains(arg, "apply-caps") { args = append(args, arg) } } diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go new file mode 100644 index 000000000..b53085934 --- /dev/null +++ b/runsc/cmd/chroot.go @@ -0,0 +1,95 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// chrootBinPath is the location inside the chroot where the runsc binary will +// be mounted. +const chrootBinPath = "/runsc" + +// mountInChroot creates the destination mount point in the given chroot and +// mounts the source. 
+func mountInChroot(chroot, src, dst, typ string, flags uint32) error { + chrootDst := filepath.Join(chroot, dst) + log.Infof("Mounting %q at %q", src, chrootDst) + + if err := specutils.Mount(src, chrootDst, typ, flags); err != nil { + return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) + } + return nil +} + +// setUpChroot creates an empty directory with runsc mounted at /runsc and proc +// mounted at /proc. +func setUpChroot() error { + // We are a new mount namespace, so we can use /tmp as a directory to + // construct a new root. + chroot := os.TempDir() + + log.Infof("Setting up sandbox chroot in %q", chroot) + + // Convert all shared mounts into slave to be sure that nothing will be + // propagated outside of our namespace. + if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("error converting mounts: %v", err) + } + + if err := syscall.Mount("runsc-root", chroot, "tmpfs", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC, ""); err != nil { + return fmt.Errorf("error mounting tmpfs in choot: %v", err) + } + + if err := mountInChroot(chroot, "/proc", "/proc", "bind", syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_REC); err != nil { + return fmt.Errorf("error mounting proc in chroot: %v", err) + } + + if err := mountInChroot(chroot, specutils.ExePath, chrootBinPath, "bind", syscall.MS_BIND|syscall.MS_RDONLY); err != nil { + return fmt.Errorf("error mounting runsc in chroot: %v", err) + } + + if err := os.Chdir(chroot); err != nil { + return fmt.Errorf("error changing working directory: %v", err) + } + + if err := syscall.Mount("", chroot, "", syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("error remounting chroot in read-only: %v", err) + } + // pivot_root(new_root, put_old) moves the root filesystem (old_root) + // of the calling process to the directory put_old and makes new_root + // the new root filesystem of the calling process. + // + // pivot_root(".", ".") makes a mount of the working directory the new + // root filesystem, so it will be moved in "/" and then the old_root + // will be moved to "/" too. The parent mount of the old_root will be + // new_root, so after umounting the old_root, we will see only + // the new_root in "/". + if err := syscall.PivotRoot(".", "."); err != nil { + return fmt.Errorf("error changing root filesystem: %v", err) + } + + if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil { + return fmt.Errorf("error umounting the old root file system: %v", err) + } + + return nil +} diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index a1c3491a3..fbfc18fc9 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -86,6 +86,28 @@ func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error { } log.Infof("Execve %q again, bye!", binPath) - syscall.Exec(binPath, args, []string{}) - panic("unreachable") + err = syscall.Exec(binPath, args, []string{}) + return fmt.Errorf("error executing %s: %v", binPath, err) +} + +// callSelfAsNobody sets UID and GID to nobody and then execve's itself again. +func callSelfAsNobody(args []string) error { + // Keep thread locked while user/group are changed. 
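callSelfAsNobody, whose body follows, combines three details that are easy to miss: the raw setgid/setuid syscalls change only the calling thread's IDs, the group ID is changed before the user ID (after dropping the UID the process might no longer be permitted to change its GID), and the execve has to happen from that same locked thread. A condensed sketch of the shape, not the exact runsc function:

func execSelfAs(uid, gid int, argv []string) error {
    runtime.LockOSThread()
    defer runtime.UnlockOSThread()
    if _, _, errno := syscall.RawSyscall(syscall.SYS_SETGID, uintptr(gid), 0, 0); errno != 0 {
        return fmt.Errorf("setgid: %v", errno)
    }
    if _, _, errno := syscall.RawSyscall(syscall.SYS_SETUID, uintptr(uid), 0, 0); errno != 0 {
        return fmt.Errorf("setuid: %v", errno)
    }
    return syscall.Exec(argv[0], argv, nil) // Only returns on error.
}
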
+ runtime.LockOSThread() + defer runtime.UnlockOSThread() + + const nobody = 65534 + + if _, _, err := syscall.RawSyscall(syscall.SYS_SETGID, uintptr(nobody), 0, 0); err != 0 { + return fmt.Errorf("error setting uid: %v", err) + } + if _, _, err := syscall.RawSyscall(syscall.SYS_SETUID, uintptr(nobody), 0, 0); err != 0 { + return fmt.Errorf("error setting gid: %v", err) + } + + binPath := "/runsc" + + log.Infof("Execve %q again, bye!", binPath) + err := syscall.Exec(binPath, args, []string{}) + return fmt.Errorf("error executing %s: %v", binPath, err) } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index d6043bcf7..899fd99de 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -5,7 +5,6 @@ package(licenses = ["notice"]) # Apache 2.0 go_library( name = "sandbox", srcs = [ - "chroot.go", "network.go", "sandbox.go", ], diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go deleted file mode 100644 index 354049871..000000000 --- a/runsc/sandbox/chroot.go +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sandbox - -import ( - "fmt" - "io/ioutil" - "os" - "path/filepath" - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/runsc/specutils" -) - -// chrootBinPath is the location inside the chroot where the runsc binary will -// be mounted. -const chrootBinPath = "/runsc" - -// mountInChroot creates the destination mount point in the given chroot and -// mounts the source. -func mountInChroot(chroot, src, dst, typ string, flags uint32) error { - chrootDst := filepath.Join(chroot, dst) - log.Infof("Mounting %q at %q", src, chrootDst) - - if err := specutils.Mount(src, chrootDst, typ, flags); err != nil { - return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) - } - return nil -} - -// setUpChroot creates an empty directory with runsc mounted at /runsc and proc -// mounted at /proc. -func setUpChroot() (string, error) { - // Create the chroot directory and make it accessible to all users. - chroot, err := ioutil.TempDir("", "runsc-sandbox-chroot-") - if err != nil { - return "", fmt.Errorf("TempDir() failed: %v", err) - } - if err := os.Chmod(chroot, 0777); err != nil { - return "", fmt.Errorf("Chmod(%q) failed: %v", chroot, err) - } - log.Infof("Setting up sandbox chroot in %q", chroot) - - // Mount /proc. - if err := mountInChroot(chroot, "proc", "/proc", "proc", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC); err != nil { - return "", fmt.Errorf("error mounting proc in chroot: %v", err) - } - - // Mount runsc at /runsc in the chroot. - binPath, err := specutils.BinPath() - if err != nil { - return "", err - } - if err := mountInChroot(chroot, binPath, chrootBinPath, "bind", syscall.MS_BIND|syscall.MS_RDONLY); err != nil { - return "", fmt.Errorf("error mounting runsc in chroot: %v", err) - } - - return chroot, nil -} - -// tearDownChroot unmounts /proc and /runsc from the chroot before deleting the -// directory. 
-func tearDownChroot(chroot string) error { - log.Debugf("Removing chroot mounts %q", chroot) - - // Unmount /proc. - proc := filepath.Join(chroot, "proc") - if err := syscall.Unmount(proc, 0); err != nil { - return fmt.Errorf("error unmounting %q: %v", proc, err) - } - - // Unmount /runsc. - exe := filepath.Join(chroot, chrootBinPath) - if err := syscall.Unmount(exe, 0); err != nil { - return fmt.Errorf("error unmounting %q: %v", exe, err) - } - - // Remove chroot directory. - if err := os.RemoveAll(chroot); err != nil { - return fmt.Errorf("error removing %q: %v", chroot, err) - } - - return nil -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index fe55ddab8..411200793 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -56,10 +56,6 @@ type Sandbox struct { // is not running. Pid int `json:"pid"` - // Chroot is the path to the chroot directory that the sandbox process - // is running in. - Chroot string `json:"chroot"` - // Cgroup has the cgroup configuration for the sandbox. Cgroup *cgroup.Cgroup `json:"cgroup"` @@ -491,6 +487,17 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // rules. cmd.Args = append(cmd.Args, "--apply-caps=true") + // If we have CAP_SYS_ADMIN, we can create an empty chroot and + // bind-mount the executable inside it. + if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") + + } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) { + log.Infof("Sandbox will be started in minimal chroot") + cmd.Args = append(cmd.Args, "--setup-root") + } else { + return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") + } } else { log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) @@ -499,50 +506,53 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // as user nobody. if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) + log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { // Map nobody in the new namespace to nobody in the parent namespace. + // + // A sandbox process will construct an empty + // root for itself, so it has to have the CAP_SYS_ADMIN + // capability. + // + // FIXME: The current implementations of + // os/exec doesn't allow to set ambient capabilities if + // a process is started in a new user namespace. As a + // workaround, we start the sandbox process with the 0 + // UID and then it constructs a chroot and sets UID to + // nobody. 
https://github.com/golang/go/issues/2315 const nobody = 65534 - cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{{ - ContainerID: int(nobody), - HostID: int(nobody), - Size: int(1), - }} - cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{{ - ContainerID: int(nobody), - HostID: int(nobody), - Size: int(1), - }} + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: int(0), + HostID: int(nobody - 1), + Size: int(1), + }, + { + ContainerID: int(nobody), + HostID: int(nobody), + Size: int(1), + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: int(nobody), + HostID: int(nobody), + Size: int(1), + }, + } // Set credentials to run as user and group nobody. cmd.SysProcAttr.Credential = &syscall.Credential{ - Uid: nobody, + Uid: 0, Gid: nobody, } + cmd.Args = append(cmd.Args, "--setup-root") } else { return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") } } - // If we have CAP_SYS_ADMIN, we can create an empty chroot and - // bind-mount the executable inside it. - if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") - - } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) { - log.Infof("Sandbox will be started in minimal chroot") - chroot, err := setUpChroot() - if err != nil { - return fmt.Errorf("error setting up chroot: %v", err) - } - s.Chroot = chroot // Remember path so it can cleaned up. - cmd.SysProcAttr.Chroot = chroot - cmd.Dir = "/" - cmd.Args[0] = "/runsc" - cmd.Path = "/runsc" - - } else { - return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN and CAP_SYS_CHROOT") - } + cmd.Args[0] = "runsc-sandbox" if s.Cgroup != nil { cpuNum, err := s.Cgroup.NumCPU() @@ -676,12 +686,6 @@ func (s *Sandbox) destroy() error { } } - if s.Chroot != "" { - if err := tearDownChroot(s.Chroot); err != nil { - return err - } - } - return nil } diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 9f705c860..04124703d 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -52,12 +52,13 @@ func TestChroot(t *testing.T) { } // Check that sandbox is chroot'ed. 
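The mapping block above is the standard os/exec user-namespace setup; a self-contained sketch of the same shape follows. The clone flags and exact IDs here are illustrative only (runsc builds its namespaces through specutils.StartInNS rather than Cloneflags):

func sandboxCmdInUserNS() *exec.Cmd {
    const nobody = 65534
    cmd := exec.Command("/proc/self/exe", "boot", "--setup-root")
    cmd.SysProcAttr = &syscall.SysProcAttr{
        Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
        UidMappings: []syscall.SysProcIDMap{
            {ContainerID: 0, HostID: nobody - 1, Size: 1},  // temporary in-namespace root
            {ContainerID: nobody, HostID: nobody, Size: 1}, // nobody stays nobody
        },
        GidMappings: []syscall.SysProcIDMap{
            {ContainerID: nobody, HostID: nobody, Size: 1},
        },
        // Start as UID 0 inside the namespace so the child can set up its
        // chroot; the child then drops to nobody before doing anything else.
        Credential: &syscall.Credential{Uid: 0, Gid: nobody},
    }
    return cmd
}
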
- chroot, err := filepath.EvalSymlinks(filepath.Join("/proc", strconv.Itoa(pid), "root")) + procRoot := filepath.Join("/proc", strconv.Itoa(pid), "root") + chroot, err := filepath.EvalSymlinks(procRoot) if err != nil { t.Fatalf("error resolving /proc//root symlink: %v", err) } - if want := "/tmp/runsc-sandbox-chroot-"; !strings.HasPrefix(chroot, want) { - t.Errorf("sandbox is not chroot'd, it should be inside: %q, got: %q", want, chroot) + if chroot != "/" { + t.Errorf("sandbox is not chroot'd, it should be inside: /, got: %q", chroot) } path, err := filepath.EvalSymlinks(filepath.Join("/proc", strconv.Itoa(pid), "cwd")) @@ -68,12 +69,12 @@ func TestChroot(t *testing.T) { t.Errorf("sandbox current dir is wrong, want: %q, got: %q", chroot, path) } - fi, err := ioutil.ReadDir(chroot) + fi, err := ioutil.ReadDir(procRoot) if err != nil { t.Fatalf("error listing %q: %v", chroot, err) } if want, got := 2, len(fi); want != got { - t.Fatalf("chroot dir got %d entries, want %d", want, got) + t.Fatalf("chroot dir got %d entries, want %d", got, want) } // chroot dir is prepared by runsc and should contains only the executable @@ -85,11 +86,6 @@ func TestChroot(t *testing.T) { } d.CleanUp() - - // Check that chroot directory was cleaned up. - if _, err := os.Stat(chroot); err == nil || !os.IsNotExist(err) { - t.Errorf("chroot directory %q was not deleted: %v", chroot, err) - } } func TestChrootGofer(t *testing.T) { -- cgit v1.2.3 From dc8450b5676d4c4ac9bcfa23cabd862e0060527d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 14 Jan 2019 20:33:29 -0800 Subject: Remove fs.Handle, ramfs.Entry, and all the DeprecatedFileOperations. More helper structs have been added to the fsutil package to make it easier to implement fs.InodeOperations and fs.FileOperations. 
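The overall shape after this change, using the helper names that appear in the diffs below: an inode implementation embeds fsutil mixins for the behavior it does not support and keeps its metadata in InodeSimpleAttributes. A hypothetical minimal example, not a type from this patch:

type exampleDevice struct {
    fsutil.InodeGenericChecker       `state:"nosave"`
    fsutil.InodeNoExtendedAttributes `state:"nosave"`
    fsutil.InodeNoopRelease          `state:"nosave"`
    fsutil.InodeNoopTruncate         `state:"nosave"`
    fsutil.InodeNoopWriteOut         `state:"nosave"`
    fsutil.InodeNotDirectory         `state:"nosave"`
    fsutil.InodeNotMappable          `state:"nosave"`
    fsutil.InodeNotSocket            `state:"nosave"`
    fsutil.InodeNotSymlink           `state:"nosave"`
    fsutil.InodeVirtual              `state:"nosave"`

    fsutil.InodeSimpleAttributes
}

// GetFile is the only piece left to write by hand for a device like this.
func newExampleDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *exampleDevice {
    return &exampleDevice{
        InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(
            ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC),
    }
}
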
PiperOrigin-RevId: 229305982 Change-Id: Ib6f8d3862f4216745116857913dbfa351530223b --- pkg/abi/linux/fs.go | 2 + pkg/sentry/fs/BUILD | 3 +- pkg/sentry/fs/anon/anon.go | 16 +- pkg/sentry/fs/ashmem/BUILD | 1 + pkg/sentry/fs/ashmem/area.go | 16 +- pkg/sentry/fs/ashmem/device.go | 133 +-------- pkg/sentry/fs/attr.go | 47 +++ pkg/sentry/fs/binder/BUILD | 2 +- pkg/sentry/fs/binder/binder.go | 135 +-------- pkg/sentry/fs/dev/BUILD | 1 + pkg/sentry/fs/dev/dev.go | 29 +- pkg/sentry/fs/dev/fs.go | 2 + pkg/sentry/fs/dev/full.go | 55 ++-- pkg/sentry/fs/dev/null.go | 88 ++++-- pkg/sentry/fs/dev/random.go | 55 ++-- pkg/sentry/fs/dirent.go | 8 +- pkg/sentry/fs/fdpipe/pipe.go | 14 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 7 +- pkg/sentry/fs/filetest/filetest.go | 16 +- pkg/sentry/fs/fsutil/BUILD | 15 - pkg/sentry/fs/fsutil/file.go | 226 ++++++++++----- pkg/sentry/fs/fsutil/fsutil.go | 2 - pkg/sentry/fs/fsutil/handle.go | 128 --------- pkg/sentry/fs/fsutil/handle_test.go | 227 --------------- pkg/sentry/fs/fsutil/inode.go | 409 ++++++++++++++------------ pkg/sentry/fs/fsutil/inode_cached_test.go | 14 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/fs.go | 14 +- pkg/sentry/fs/gofer/inode.go | 1 - pkg/sentry/fs/host/file.go | 4 +- pkg/sentry/fs/host/fs.go | 2 + pkg/sentry/fs/host/inode.go | 1 - pkg/sentry/fs/inode.go | 29 +- pkg/sentry/fs/inode_operations.go | 80 ------ pkg/sentry/fs/inode_overlay.go | 13 - pkg/sentry/fs/inode_overlay_test.go | 62 +++- pkg/sentry/fs/mock.go | 11 - pkg/sentry/fs/mount.go | 14 +- pkg/sentry/fs/mounts_test.go | 15 +- pkg/sentry/fs/proc/BUILD | 4 +- pkg/sentry/fs/proc/cpuinfo.go | 41 +-- pkg/sentry/fs/proc/exec_args.go | 57 +++- pkg/sentry/fs/proc/fds.go | 138 +++++---- pkg/sentry/fs/proc/file.go | 58 ---- pkg/sentry/fs/proc/inode.go | 96 +++++++ pkg/sentry/fs/proc/net.go | 55 ++-- pkg/sentry/fs/proc/proc.go | 152 +++++----- pkg/sentry/fs/proc/rpcinet_proc.go | 246 ++++++++-------- pkg/sentry/fs/proc/seqfile/BUILD | 7 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 133 ++++++--- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 45 +-- pkg/sentry/fs/proc/sys.go | 116 +++++--- pkg/sentry/fs/proc/sys_net.go | 325 +++++++++++++-------- pkg/sentry/fs/proc/sys_net_state.go | 17 +- pkg/sentry/fs/proc/sys_net_test.go | 28 +- pkg/sentry/fs/proc/task.go | 264 +++++++++++------ pkg/sentry/fs/proc/uid_gid_map.go | 17 +- pkg/sentry/fs/proc/uptime.go | 40 ++- pkg/sentry/fs/ramfs/BUILD | 6 +- pkg/sentry/fs/ramfs/dir.go | 223 +++++++++++---- pkg/sentry/fs/ramfs/file.go | 150 ---------- pkg/sentry/fs/ramfs/ramfs.go | 441 ----------------------------- pkg/sentry/fs/ramfs/socket.go | 48 +++- pkg/sentry/fs/ramfs/symlink.go | 67 +++-- pkg/sentry/fs/ramfs/test/BUILD | 16 -- pkg/sentry/fs/ramfs/test/test.go | 46 --- pkg/sentry/fs/ramfs/tree.go | 3 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/sys/BUILD | 3 +- pkg/sentry/fs/sys/devices.go | 51 ++-- pkg/sentry/fs/sys/fs.go | 2 + pkg/sentry/fs/sys/sys.go | 10 +- pkg/sentry/fs/timerfd/timerfd.go | 12 +- pkg/sentry/fs/tmpfs/BUILD | 2 + pkg/sentry/fs/tmpfs/file_regular.go | 14 +- pkg/sentry/fs/tmpfs/file_test.go | 6 +- pkg/sentry/fs/tmpfs/fs.go | 2 + pkg/sentry/fs/tmpfs/inode_file.go | 112 ++++---- pkg/sentry/fs/tmpfs/tmpfs.go | 164 +++++++++-- pkg/sentry/fs/tty/BUILD | 2 - pkg/sentry/fs/tty/dir.go | 108 ++----- pkg/sentry/fs/tty/inode.go | 145 ---------- pkg/sentry/fs/tty/master.go | 23 +- pkg/sentry/fs/tty/slave.go | 25 +- pkg/sentry/kernel/epoll/epoll.go | 12 +- pkg/sentry/kernel/eventfd/eventfd.go | 14 +- 
pkg/sentry/kernel/pipe/node.go | 40 +-- pkg/sentry/kernel/pipe/node_test.go | 36 +-- pkg/sentry/kernel/pipe/pipe.go | 31 +- pkg/sentry/kernel/pipe/reader_writer.go | 10 +- pkg/sentry/loader/vdso.go | 48 ++-- pkg/sentry/socket/epsocket/epsocket.go | 10 +- pkg/sentry/socket/hostinet/socket.go | 10 +- pkg/sentry/socket/netlink/socket.go | 10 +- pkg/sentry/socket/rpcinet/socket.go | 10 +- pkg/sentry/socket/socket.go | 18 +- pkg/sentry/socket/unix/unix.go | 10 +- runsc/boot/fs.go | 4 +- test/syscalls/linux/proc.cc | 5 + 100 files changed, 2547 insertions(+), 3144 deletions(-) delete mode 100644 pkg/sentry/fs/fsutil/handle.go delete mode 100644 pkg/sentry/fs/fsutil/handle_test.go delete mode 100644 pkg/sentry/fs/proc/file.go create mode 100644 pkg/sentry/fs/proc/inode.go delete mode 100644 pkg/sentry/fs/ramfs/file.go delete mode 100644 pkg/sentry/fs/ramfs/ramfs.go delete mode 100644 pkg/sentry/fs/ramfs/test/BUILD delete mode 100644 pkg/sentry/fs/ramfs/test/test.go delete mode 100644 pkg/sentry/fs/tty/inode.go (limited to 'runsc') diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 0b1c9f3db..a9f2ba132 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -22,8 +22,10 @@ const ( DEVPTS_SUPER_MAGIC = 0x00001cd1 OVERLAYFS_SUPER_MAGIC = 0x794c7630 PIPEFS_MAGIC = 0x50495045 + PROC_SUPER_MAGIC = 0x9fa0 RAMFS_MAGIC = 0x09041934 SOCKFS_MAGIC = 0x534F434B + SYSFS_MAGIC = 0x62656572 TMPFS_MAGIC = 0x01021994 V9FS_MAGIC = 0x01021997 ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 0fe2b14bf..6f368b0da 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -94,7 +94,8 @@ go_test( deps = [ ":fs", "//pkg/sentry/context", - "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index 743cf511f..a5e8c4f0d 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -28,16 +28,12 @@ import ( // with any real filesystem. Some types depend on completely pseudo // "anon" inodes (eventfds, epollfds, etc). 
func NewInode(ctx context.Context) *fs.Inode { - return fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.ANON_INODE_FS_MAGIC, - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + iops := &fsutil.SimpleFileInode{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, linux.ANON_INODE_FS_MAGIC), + } + return fs.NewInode(iops, fs.NewPseudoMountSource(), fs.StableAttr{ Type: fs.Anonymous, DeviceID: PseudoDevice.DeviceID(), InodeID: PseudoDevice.NextIno(), diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index 44ef82e64..2463111a8 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index d7dd2c084..7c1b11464 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -28,6 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) const ( @@ -42,9 +43,10 @@ const ( // // +stateify savable type Area struct { - fsutil.NoFsync `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` ad *Device @@ -98,11 +100,6 @@ func (a *Area) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, return 0, syserror.ENOSYS } -// Flush implements fs.FileOperations.Flush. -func (a *Area) Flush(ctx context.Context, file *fs.File) error { - return nil -} - // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { a.mu.Lock() @@ -122,8 +119,7 @@ func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MM return syserror.ENOMEM } tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}, k) - // This is not backed by a real filesystem, so we pass in nil. - tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{}) + tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewPseudoMountSource(), fs.StableAttr{}) dirent := fs.NewDirent(tmpfsInode, namePrefix+"/"+a.name) tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}) // Drop the extra reference on the Dirent. 
diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index 962da141b..5369d1b0d 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -16,49 +16,40 @@ package ashmem import ( - "sync" - + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" ) // Device implements fs.InodeOperations. // // +stateify savable type Device struct { - fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` - mu sync.Mutex `state:"nosave"` - unstable fs.UnstableAttr + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*Device)(nil) + // NewDevice creates and intializes a Device structure. func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { return &Device{ - unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: fp, - Links: 1, - }), + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, linux.ANON_INODE_FS_MAGIC), } } -// Release implements fs.InodeOperations.Release. -func (ad *Device) Release(context.Context) {} - // GetFile implements fs.InodeOperations.GetFile. func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, d, flags, &Area{ @@ -67,105 +58,3 @@ func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) perms: usermem.AnyAccess, }), nil } - -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (ad *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - ad.mu.Lock() - defer ad.mu.Unlock() - return ad.unstable, nil -} - -// Check implements fs.InodeOperations.Check. -func (ad *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions. -func (ad *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool { - ad.mu.Lock() - defer ad.mu.Unlock() - ad.unstable.Perms = fp - ad.unstable.StatusChangeTime = time.NowFromContext(ctx) - return true -} - -// SetOwner implements fs.InodeOperations.SetOwner. -func (ad *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - ad.mu.Lock() - defer ad.mu.Unlock() - if owner.UID.Ok() { - ad.unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - ad.unstable.Owner.GID = owner.GID - } - return nil -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. 
-func (ad *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - ad.mu.Lock() - defer ad.mu.Unlock() - - now := time.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - ad.unstable.AccessTime = now - } else { - ad.unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - ad.unstable.ModificationTime = now - } else { - ad.unstable.ModificationTime = ts.MTime - } - } - ad.unstable.StatusChangeTime = now - return nil -} - -// Truncate implements fs.InodeOperations.WriteOut. -// -// Ignored by ashmem. -func (ad *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return nil -} - -// AddLink implements fs.InodeOperations.AddLink. -// -// Ashmem doesn't support links, no-op. -func (ad *Device) AddLink() {} - -// DropLink implements fs.InodeOperations.DropLink. -// -// Ashmem doesn't support links, no-op. -func (ad *Device) DropLink() {} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (ad *Device) NotifyStatusChange(ctx context.Context) { - ad.mu.Lock() - defer ad.mu.Unlock() - now := time.NowFromContext(ctx) - ad.unstable.ModificationTime = now - ad.unstable.StatusChangeTime = now -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -// -// Ashmem is virtual. -func (ad *Device) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. -// -// Ashmem doesn't support querying for filesystem info. -func (ad *Device) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS -} diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 59e060e3c..3523b068a 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -180,6 +180,53 @@ type UnstableAttr struct { Links uint64 } +// SetOwner sets the owner and group if they are valid. +// +// This method is NOT thread-safe. Callers must prevent concurrent calls. +func (ua *UnstableAttr) SetOwner(ctx context.Context, owner FileOwner) { + if owner.UID.Ok() { + ua.Owner.UID = owner.UID + } + if owner.GID.Ok() { + ua.Owner.GID = owner.GID + } + ua.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// SetPermissions sets the permissions. +// +// This method is NOT thread-safe. Callers must prevent concurrent calls. +func (ua *UnstableAttr) SetPermissions(ctx context.Context, p FilePermissions) { + ua.Perms = p + ua.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// SetTimestamps sets the timestamps according to the TimeSpec. +// +// This method is NOT thread-safe. Callers must prevent concurrent calls. +func (ua *UnstableAttr) SetTimestamps(ctx context.Context, ts TimeSpec) { + if ts.ATimeOmit && ts.MTimeOmit { + return + } + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + ua.AccessTime = now + } else { + ua.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + ua.ModificationTime = now + } else { + ua.ModificationTime = ts.MTime + } + } + ua.StatusChangeTime = now +} + // WithCurrentTime returns u with AccessTime == ModificationTime == current time. 
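The new UnstableAttr setters are documented as not thread-safe, so an inode implementation is expected to wrap them in its own lock; a partial sketch (not code from this patch) of what that delegation looks like:

type lockedAttrs struct {
    mu       sync.Mutex
    unstable fs.UnstableAttr
}

func (a *lockedAttrs) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error {
    a.mu.Lock()
    defer a.mu.Unlock()
    a.unstable.SetOwner(ctx, owner) // Helper added to attr.go by this commit.
    return nil
}
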
func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr { t := ktime.NowFromContext(ctx) diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index a077b91d2..27155819e 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -16,11 +16,11 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index e642c7f22..19cd55e65 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -24,12 +24,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) const ( @@ -43,34 +43,29 @@ const ( // // +stateify savable type Device struct { + fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` - // mu protects unstable. - mu sync.Mutex `state:"nosave"` - unstable fs.UnstableAttr + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*Device)(nil) + // NewDevice creates and intializes a Device structure. func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { return &Device{ - unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: fp, - Links: 1, - }), + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, 0), } } -// Release implements fs.InodeOperations.Release. -func (bd *Device) Release(context.Context) {} - // GetFile implements fs.InodeOperations.GetFile. // // TODO: Add functionality to GetFile: Additional fields will be @@ -85,115 +80,13 @@ func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) }), nil } -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (bd *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - bd.mu.Lock() - defer bd.mu.Unlock() - return bd.unstable, nil -} - -// Check implements fs.InodeOperations.Check. -func (bd *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions. -func (bd *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool { - bd.mu.Lock() - defer bd.mu.Unlock() - bd.unstable.Perms = fp - bd.unstable.StatusChangeTime = time.NowFromContext(ctx) - return true -} - -// SetOwner implements fs.InodeOperations.SetOwner. 
-func (bd *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - bd.mu.Lock() - defer bd.mu.Unlock() - if owner.UID.Ok() { - bd.unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - bd.unstable.Owner.GID = owner.GID - } - return nil -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (bd *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - bd.mu.Lock() - defer bd.mu.Unlock() - - now := time.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - bd.unstable.AccessTime = now - } else { - bd.unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - bd.unstable.ModificationTime = now - } else { - bd.unstable.ModificationTime = ts.MTime - } - } - bd.unstable.StatusChangeTime = now - return nil -} - -// Truncate implements fs.InodeOperations.WriteOut. -// -// Ignored for a character device, such as Binder. -func (bd *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return nil -} - -// AddLink implements fs.InodeOperations.AddLink. -// -// Binder doesn't support links, no-op. -func (bd *Device) AddLink() {} - -// DropLink implements fs.InodeOperations.DropLink. -// -// Binder doesn't support links, no-op. -func (bd *Device) DropLink() {} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (bd *Device) NotifyStatusChange(ctx context.Context) { - bd.mu.Lock() - defer bd.mu.Unlock() - now := time.NowFromContext(ctx) - bd.unstable.ModificationTime = now - bd.unstable.StatusChangeTime = now -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -// -// Binder is virtual. -func (bd *Device) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. -// -// Binder doesn't support querying for filesystem info. -func (bd *Device) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS -} - // Proc implements fs.FileOperations and fs.IoctlGetter. // // +stateify savable type Proc struct { - fsutil.NoFsync `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` bd *Device task *kernel.Task diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index b17b5202c..b9cfae05f 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -32,5 +32,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 3e127bf04..f8e8099f7 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -16,6 +16,8 @@ package dev import ( + "math" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ashmem" @@ -26,13 +28,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// Dev is the root node. 
-// -// +stateify savable -type Dev struct { - ramfs.Dir -} - func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), @@ -43,8 +38,7 @@ func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode } func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - iops := &ramfs.Dir{} - iops.InitDir(ctx, map[string]*fs.Inode{}, fs.RootOwner, fs.FilePermsFromMode(0555)) + iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), @@ -54,8 +48,7 @@ func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { } func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.Inode { - iops := &ramfs.Symlink{} - iops.InitSymlink(ctx, fs.RootOwner, target) + iops := ramfs.NewSymlink(ctx, fs.RootOwner, target) return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), @@ -66,8 +59,6 @@ func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.In // New returns the root node of a device filesystem. func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEnabled bool) *fs.Inode { - d := &Dev{} - contents := map[string]*fs.Inode{ "fd": newSymlink(ctx, "/proc/self/fd", msrc), "stdin": newSymlink(ctx, "/proc/self/fd/0", msrc), @@ -114,11 +105,19 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn contents["ashmem"] = newCharacterDevice(ashmem, msrc) } - d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return fs.NewInode(d, msrc, fs.StableAttr{ + iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), BlockSize: usermem.PageSize, Type: fs.Directory, }) } + +// readZeros implements fs.FileOperations.Read with infinite null bytes. +type readZeros struct{} + +// Read implements fs.FileOperations.Read. +func (readZeros) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + return dst.ZeroOut(ctx, math.MaxInt64) +} diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index d96f4f423..abfe689f0 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -33,6 +33,8 @@ const ashmemEnabledKey = "ashmem_enabled" // +stateify savable type filesystem struct{} +var _ fs.Filesystem = (*filesystem)(nil) + func init() { fs.RegisterFilesystem(&filesystem{}) } diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index eeda646ab..cbdd40161 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -15,41 +15,64 @@ package dev import ( - "math" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // fullDevice is used to implement /dev/full. 
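readZeros above fills the destination with dst.ZeroOut(ctx, math.MaxInt64), the in-sentry analogue of /dev/zero, and the "var _" lines add compile-time interface checks. A minimal standard-library sketch of the same always-ready, always-zero contract, assuming nothing beyond package io (names are illustrative only):

package main

import (
	"fmt"
	"io"
)

// zeroReader yields an endless stream of NUL bytes, like reading /dev/zero.
type zeroReader struct{}

// Read zero-fills p and never reports EOF.
func (zeroReader) Read(p []byte) (int, error) {
	for i := range p {
		p[i] = 0
	}
	return len(p), nil
}

// Compile-time interface check, mirroring the assertions in the hunk above.
var _ io.Reader = zeroReader{}

func main() {
	buf := make([]byte, 8)
	n, err := io.ReadFull(zeroReader{}, buf)
	fmt.Println(n, err, buf) // 8 <nil> [0 0 0 0 0 0 0 0]
}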
// // +stateify savable type fullDevice struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*fullDevice)(nil) + func newFullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *fullDevice { - f := &fullDevice{} - f.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + f := &fullDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } return f } -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev by -// returining ENOSPC. -func (f *fullDevice) DeprecatedPwritev(_ context.Context, _ usermem.IOSequence, _ int64) (int64, error) { - return 0, syserror.ENOSPC +// GetFile implements fs.InodeOperations.GetFile. +func (f *fullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + return fs.NewFile(ctx, dirent, flags, &fullFileOperations{}), nil } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (f *fullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, _ int64) (int64, error) { - return dst.ZeroOut(ctx, math.MaxInt64) +// +stateify savable +type fullFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + readZeros `state:"nosave"` } -// Truncate should be simply ignored for character devices on linux. -func (f *fullDevice) Truncate(context.Context, *fs.Inode, int64) error { - return nil +var _ fs.FileOperations = (*fullFileOperations)(nil) + +// Write implements FileOperations.Write. 
+func (fullFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.ENOSPC } diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 68090f353..73fd09058 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -15,78 +15,104 @@ package dev import ( - "io" - "math" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // +stateify savable type nullDevice struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*nullDevice)(nil) + func newNullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *nullDevice { - n := &nullDevice{} - n.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + n := &nullDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } return n } -// DeprecatedPreadv reads data from the device. -func (n *nullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - return 0, io.EOF -} +// GetFile implements fs.FileOperations.GetFile. +func (n *nullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true -// DeprecatedPwritev discards writes. -func (n *nullDevice) DeprecatedPwritev(_ context.Context, src usermem.IOSequence, offset int64) (int64, error) { - return src.NumBytes(), nil + return fs.NewFile(ctx, dirent, flags, &nullFileOperations{}), nil } -// Truncate should be simply ignored for character devices on linux. 
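fullFileOperations pairs readZeros for reads with a Write that always fails with ENOSPC, which is the /dev/full contract on Linux. A self-contained sketch of that write-side behaviour using syscall.ENOSPC; the fullWriter type is made up for the example:

package main

import (
	"fmt"
	"syscall"
)

// fullWriter mimics /dev/full: it accepts no data and always reports a
// full disk.
type fullWriter struct{}

// Write rejects every attempt with ENOSPC, regardless of length.
func (fullWriter) Write(p []byte) (int, error) {
	return 0, syscall.ENOSPC
}

func main() {
	_, err := fullWriter{}.Write([]byte("anything"))
	fmt.Println(err == syscall.ENOSPC, err) // true no space left on device
}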
-func (n *nullDevice) Truncate(context.Context, *fs.Inode, int64) error { - return nil +// +stateify savable +type nullFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRead `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` } +var _ fs.FileOperations = (*nullFileOperations)(nil) + // +stateify savable type zeroDevice struct { nullDevice } +var _ fs.InodeOperations = (*zeroDevice)(nil) + func newZeroDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *zeroDevice { - zd := &zeroDevice{} - zd.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + zd := &zeroDevice{ + nullDevice: nullDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + }, + } return zd } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (zd *zeroDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - return dst.ZeroOut(ctx, math.MaxInt64) -} - -// GetFile overrides ramfs.Entry.GetFile and returns a zeroFile instead. +// GetFile implements fs.FileOperations.GetFile. func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - // Allow pread(2) and pwrite(2) on this file. flags.Pread = true flags.Pwrite = true - return fs.NewFile(ctx, dirent, flags, &zeroFileOperations{ - FileOperations: &fsutil.Handle{HandleOperations: dirent.Inode.HandleOps()}, - }), nil + return fs.NewFile(ctx, dirent, flags, &zeroFileOperations{}), nil } // +stateify savable type zeroFileOperations struct { - fs.FileOperations + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + readZeros `state:"nosave"` } +var _ fs.FileOperations = (*zeroFileOperations)(nil) + // ConfigureMMap implements fs.FileOperations.ConfigureMMap. 
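nullFileOperations composes FileNoopWrite (writes report full success and are discarded) with FileNoopRead (reads return zero bytes with no error), matching /dev/null. In plain Go the write half is io.Discard, and an empty read is conventionally signalled with EOF instead of a bare zero-byte result; a sketch of both halves with invented names:

package main

import (
	"fmt"
	"io"
)

// nullDevice behaves like /dev/null: writes vanish but report success,
// reads produce no data.
type nullDevice struct{}

// Write pretends to consume all of p.
func (nullDevice) Write(p []byte) (int, error) { return len(p), nil }

// Read returns immediately with nothing; EOF is the idiomatic io.Reader
// way to say so.
func (nullDevice) Read(p []byte) (int, error) { return 0, io.EOF }

func main() {
	var d nullDevice
	n, _ := d.Write([]byte("dropped"))
	m, err := d.Read(make([]byte, 4))
	fmt.Println(n, m, err) // 7 0 EOF
	// io.Discard is the stock stand-in for the write side.
	fmt.Println(io.Discard.Write([]byte("also dropped"))) // 12 <nil>
}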
func (*zeroFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { m, err := mm.NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 33e4913e4..837b7793a 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -19,37 +19,58 @@ import ( "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // +stateify savable type randomDevice struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*randomDevice)(nil) + func newRandomDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *randomDevice { - r := &randomDevice{} - r.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + r := &randomDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } return r } -// DeprecatedPreadv reads random data. -func (*randomDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) +// GetFile implements fs.InodeOperations.GetFile. +func (randomDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &randomFileOperations{}), nil } -// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. -func (*randomDevice) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - // On Linux, "Writing to /dev/random or /dev/urandom will update the - // entropy pool with the data written, but this will not result in a higher - // entropy count" - random(4). We don't need to support this, but we do - // need to support the write, so just make it a no-op a la /dev/null. - return src.NumBytes(), nil +// +stateify savable +type randomFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` } -// Truncate should be simply ignored for character devices on linux. -func (r *randomDevice) Truncate(context.Context, *fs.Inode, int64) error { - return nil +var _ fs.FileOperations = (*randomFileOperations)(nil) + +// Read implements fs.FileOperations.Read. 
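randomFileOperations.Read streams bytes from the sentry's rand.Reader into the destination through safemem.FromIOReader. Outside the sentry the equivalent is a copy from crypto/rand.Reader; a small standard-library sketch with no gVisor APIs involved:

package main

import (
	"crypto/rand"
	"fmt"
	"io"
)

func main() {
	// Pull 16 random bytes, the userspace analogue of reading /dev/urandom.
	buf := make([]byte, 16)
	if _, err := io.ReadFull(rand.Reader, buf); err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", buf)
}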
+func (randomFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index c4918a11b..d6a19dc81 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -837,8 +837,8 @@ func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perm }) } -// getDotAttrs returns the DentAttrs corresponding to "." and ".." directories. -func (d *Dirent) getDotAttrs(root *Dirent) (DentAttr, DentAttr) { +// GetDotAttrs returns the DentAttrs corresponding to "." and ".." directories. +func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) { // Get '.'. sattr := d.Inode.StableAttr dot := DentAttr{ @@ -870,7 +870,7 @@ func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int6 // Collect attrs for "." and "..". attrs := make(map[string]DentAttr) names := []string{".", ".."} - attrs["."], attrs[".."] = d.getDotAttrs(root) + attrs["."], attrs[".."] = d.GetDotAttrs(root) // Get info from all children. d.mu.Lock() @@ -965,7 +965,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, } // Collect attrs for "." and "..". - dot, dotdot := d.getDotAttrs(root) + dot, dotdot := d.GetDotAttrs(root) // Emit "." and ".." if the offset is low enough. if offset == 0 { diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index e3b830747..b4d11cb45 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -37,13 +37,13 @@ import ( // // +stateify savable type pipeOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` - waiter.Queue `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + waiter.Queue `state:"nosave"` // flags are the flags used to open the pipe. flags fs.FileFlags `state:".(fs.FileFlags)"` diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index 28e8e233d..81c6e2b5d 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -91,7 +91,7 @@ type FileOperations interface { Flush(ctx context.Context, file *File) error // ConfigureMMap mutates opts to implement mmap(2) for the file. Most - // implementations can either embed fsutil.NoMMap (if they don't support + // implementations can either embed fsutil.FileNoMMap (if they don't support // memory mapping) or call fsutil.GenericConfigureMMap with the appropriate // memmap.Mappable. 
ConfigureMMap(ctx context.Context, file *File, opts *memmap.MMapOpts) error diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index f121cbdda..a4ac58763 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -20,7 +20,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" ) @@ -135,7 +136,7 @@ func TestReaddirRevalidation(t *testing.T) { // Get a handle to the dirent in the upper filesystem so that we can // modify it without going through the dirent. - upperDir := upper.InodeOperations.(*dir).InodeOperations.(*ramfstest.Dir) + upperDir := upper.InodeOperations.(*dir).InodeOperations.(*ramfs.Dir) // Check that overlay returns the files from both upper and lower. openDir, err := overlay.GetFile(ctx, fs.NewDirent(overlay, "stub"), fs.FileFlags{Read: true}) @@ -155,7 +156,7 @@ func TestReaddirRevalidation(t *testing.T) { if err := upperDir.Remove(ctx, upper, "a"); err != nil { t.Fatalf("error removing child: %v", err) } - upperDir.AddChild(ctx, "c", fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), + upperDir.AddChild(ctx, "c", fs.NewInode(fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermissions{}, 0), upper.MountSource, fs.StableAttr{Type: fs.RegularFile})) // Seek to beginning of the directory and do the readdir again. diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 65ca196d9..40d84d9f2 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -31,14 +31,14 @@ import ( // TestFileOperations is an implementation of the File interface. It provides all // required methods. type TestFileOperations struct { - fsutil.NoopRelease `state:"nosave"` - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } // NewTestFile creates and initializes a new test file. 
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6834e1272..4965e1a5f 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -67,7 +67,6 @@ go_library( "frame_ref_set.go", "frame_ref_set_impl.go", "fsutil.go", - "handle.go", "host_file_mapper.go", "host_file_mapper_state.go", "host_file_mapper_unsafe.go", @@ -96,20 +95,6 @@ go_library( ], ) -go_test( - name = "fsutil_x_test", - size = "small", - srcs = ["handle_test.go"], - deps = [ - ":fsutil", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", - "//pkg/sentry/fs", - "//pkg/sentry/fs/ramfs/test", - "//pkg/sentry/usermem", - ], -) - go_test( name = "fsutil_test", size = "small", diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 46db2e51c..0970f782b 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -24,12 +24,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// NoopRelease implements FileOperations.Release for files that have no +// FileNoopRelease implements fs.FileOperations.Release for files that have no // resources to release. -type NoopRelease struct{} +type FileNoopRelease struct{} // Release is a no-op. -func (NoopRelease) Release() {} +func (FileNoopRelease) Release() {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. @@ -127,71 +127,81 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, return current, syserror.EINVAL } -// GenericSeek implements FileOperations.Seek for files that use a generic -// seek implementation. -type GenericSeek struct{} +// FileGenericSeek implements fs.FileOperations.Seek for files that use a +// generic seek implementation. +type FileGenericSeek struct{} // Seek implements fs.FileOperations.Seek. -func (GenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { +func (FileGenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { return SeekWithDirCursor(ctx, file, whence, offset, nil) } -// ZeroSeek implements FileOperations.Seek for files that maintain a constant -// zero-value offset and require a no-op Seek. -type ZeroSeek struct{} +// FileZeroSeek implements fs.FileOperations.Seek for files that maintain a +// constant zero-value offset and require a no-op Seek. +type FileZeroSeek struct{} -// Seek implements FileOperations.Seek. -func (ZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { +// Seek implements fs.FileOperations.Seek. +func (FileZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { return 0, nil } -// PipeSeek implements FileOperations.Seek and can be used for files that behave -// like pipes (seeking is not supported). -type PipeSeek struct{} +// FileNoSeek implements fs.FileOperations.Seek to return EINVAL. +type FileNoSeek struct{} + +// Seek implements fs.FileOperations.Seek. +func (FileNoSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { + return 0, syserror.EINVAL +} -// Seek implements FileOperations.Seek. -func (PipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { +// FilePipeSeek implements fs.FileOperations.Seek and can be used for files +// that behave like pipes (seeking is not supported). +type FilePipeSeek struct{} + +// Seek implements fs.FileOperations.Seek. 
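FileGenericSeek defers to SeekWithDirCursor, which resolves SEEK_SET, SEEK_CUR and SEEK_END against the current offset and the file size and rejects negative results. A plain-Go sketch of that arithmetic over an in-memory file; the names and the EINVAL stand-in error are illustrative only:

package main

import (
	"errors"
	"fmt"
	"io"
)

var errInvalid = errors.New("invalid seek") // stands in for EINVAL

// sliceFile is a seekable view over an in-memory byte slice.
type sliceFile struct {
	data []byte
	off  int64
}

// Seek implements io.Seeker with the usual whence handling.
func (f *sliceFile) Seek(offset int64, whence int) (int64, error) {
	var base int64
	switch whence {
	case io.SeekStart:
		base = 0
	case io.SeekCurrent:
		base = f.off
	case io.SeekEnd:
		base = int64(len(f.data))
	default:
		return 0, errInvalid
	}
	n := base + offset
	if n < 0 {
		return 0, errInvalid // negative offsets are rejected, current offset kept
	}
	f.off = n
	return n, nil
}

func main() {
	f := &sliceFile{data: make([]byte, 20)}
	fmt.Println(f.Seek(10, io.SeekStart))   // 10 <nil>
	fmt.Println(f.Seek(-1, io.SeekCurrent)) // 9 <nil>
	fmt.Println(f.Seek(2, io.SeekEnd))      // 22 <nil>  (seeking past EOF is allowed)
	fmt.Println(f.Seek(-5, io.SeekStart))   // 0 invalid seek
}

The regular-file cases in the deleted handle_test.go table follow the same expectations, for example SEEK_END plus 2 on a 20-byte file landing at 22.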
+func (FilePipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { return 0, syserror.ESPIPE } -// NotDirReaddir implements FileOperations.Readdir for non-directories. -type NotDirReaddir struct{} +// FileNotDirReaddir implements fs.FileOperations.Readdir for non-directories. +type FileNotDirReaddir struct{} -// Readdir implements FileOperations.NotDirReaddir. -func (NotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { +// Readdir implements fs.FileOperations.FileNotDirReaddir. +func (FileNotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { return 0, syserror.ENOTDIR } -// NoFsync implements FileOperations.Fsync for files that don't support syncing. -type NoFsync struct{} +// FileNoFsync implements fs.FileOperations.Fsync for files that don't support +// syncing. +type FileNoFsync struct{} -// Fsync implements FileOperations.Fsync. -func (NoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { +// Fsync implements fs.FileOperations.Fsync. +func (FileNoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { return syserror.EINVAL } -// NoopFsync implements FileOperations.Fsync for files that don't need to synced. -type NoopFsync struct{} +// FileNoopFsync implements fs.FileOperations.Fsync for files that don't need +// to synced. +type FileNoopFsync struct{} -// Fsync implements FileOperations.Fsync. -func (NoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { +// Fsync implements fs.FileOperations.Fsync. +func (FileNoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { return nil } -// NoopFlush implements FileOperations.Flush as a no-op. -type NoopFlush struct{} +// FileNoopFlush implements fs.FileOperations.Flush as a no-op. +type FileNoopFlush struct{} -// Flush implements FileOperations.Flush. -func (NoopFlush) Flush(context.Context, *fs.File) error { +// Flush implements fs.FileOperations.Flush. +func (FileNoopFlush) Flush(context.Context, *fs.File) error { return nil } -// NoMMap implements fs.FileOperations.Mappable for files that cannot +// FileNoMMap implements fs.FileOperations.Mappable for files that cannot // be memory mapped. -type NoMMap struct{} +type FileNoMMap struct{} // ConfigureMMap implements fs.FileOperations.ConfigureMMap. -func (NoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { +func (FileNoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { return syserror.ENODEV } @@ -204,26 +214,43 @@ func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpt return nil } -// NoIoctl implements fs.FileOperations.Ioctl for files that don't implement -// the ioctl syscall. -type NoIoctl struct{} +// FileNoIoctl implements fs.FileOperations.Ioctl for files that don't +// implement the ioctl syscall. +type FileNoIoctl struct{} // Ioctl implements fs.FileOperations.Ioctl. -func (NoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (FileNoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { return 0, syserror.ENOTTY } -// DirFileOperations implements FileOperations for directories. +// DirFileOperations implements most of fs.FileOperations for directories, +// except for Readdir which the embedding type must implement. 
+type DirFileOperations struct { + waiter.AlwaysReady + FileGenericSeek + FileNoFsync + FileNoIoctl + FileNoMMap + FileNoopFlush + FileNoopRelease +} + +// Read implements fs.FileOperations.Read +func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements fs.FileOperations.Write. +func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// StaticDirFileOperations implements fs.FileOperations for directories with +// static children. // // +stateify savable -type DirFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - NoopRelease `state:"nosave"` - GenericSeek `state:"nosave"` - NoFsync `state:"nosave"` - NoopFlush `state:"nosave"` - NoMMap `state:"nosave"` - NoIoctl `state:"nosave"` +type StaticDirFileOperations struct { + DirFileOperations // dentryMap is a SortedDentryMap used to implement Readdir. dentryMap *fs.SortedDentryMap @@ -233,37 +260,106 @@ type DirFileOperations struct { dirCursor string } -// NewDirFileOperations returns a new DirFileOperations that will iterate the -// given denty map. -func NewDirFileOperations(dentries *fs.SortedDentryMap) *DirFileOperations { - return &DirFileOperations{ +// NewStaticDirFileOperations returns a new StaticDirFileOperations that will +// iterate the given denty map. +func NewStaticDirFileOperations(dentries *fs.SortedDentryMap) *StaticDirFileOperations { + return &StaticDirFileOperations{ dentryMap: dentries, } } // IterateDir implements DirIterator.IterateDir. -func (dfo *DirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - n, err := fs.GenericReaddir(dirCtx, dfo.dentryMap) +func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + n, err := fs.GenericReaddir(dirCtx, sdfo.dentryMap) return offset + n, err } -// Readdir implements FileOperations.Readdir. -func (dfo *DirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { +// Readdir implements fs.FileOperations.Readdir. +func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) defer root.DecRef() dirCtx := &fs.DirCtx{ Serializer: serializer, - DirCursor: &dfo.dirCursor, + DirCursor: &sdfo.dirCursor, } - return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset()) + return fs.DirentReaddir(ctx, file.Dirent, sdfo, root, dirCtx, file.Offset()) } -// Read implements FileOperations.Read -func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR +// NoReadWriteFile is a file that does not support reading or writing. +// +// +stateify savable +type NoReadWriteFile struct { + waiter.AlwaysReady `state:"nosave"` + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoRead `state:"nosave"` + FileNoWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` } -// Write implements FileOperations.Write. 
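StaticDirFileOperations serves Readdir from a pre-sorted dentry map plus a saved cursor, so "." and ".." come first and a later call resumes where the previous one stopped. A stripped-down sketch of that offset-resumable iteration, with no fs types and invented helper names:

package main

import (
	"fmt"
	"sort"
)

// readdir emits entries starting at offset and returns the new offset,
// so a later call can pick up where this one stopped.
func readdir(names []string, offset int, emit func(string)) int {
	all := append([]string{".", ".."}, names...)
	for ; offset < len(all); offset++ {
		emit(all[offset])
	}
	return offset
}

func main() {
	children := []string{"null", "zero", "full", "random"}
	sort.Strings(children) // the dentry map is kept sorted

	emit := func(name string) { fmt.Println(name) }
	off := readdir(children, 0, emit) // ., .., full, null, random, zero
	// A second call with the saved offset emits nothing new.
	readdir(children, off, emit)
}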
-func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR +var _ fs.FileOperations = (*NoReadWriteFile)(nil) + +// FileStaticContentReader is a helper to implement fs.FileOperations.Read with +// static content. +// +// +stateify savable +type FileStaticContentReader struct { + // content is immutable. + content []byte +} + +// NewFileStaticContentReader initializes a FileStaticContentReader with the +// given content. +func NewFileStaticContentReader(b []byte) FileStaticContentReader { + return FileStaticContentReader{ + content: b, + } +} + +// Read implements fs.FileOperations.Read. +func (scr *FileStaticContentReader) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if offset >= int64(len(scr.content)) { + return 0, nil + } + n, err := dst.CopyOut(ctx, scr.content[offset:]) + return int64(n), err +} + +// FileNoopWrite implements fs.FileOperations.Write as a noop. +type FileNoopWrite struct{} + +// Write implements fs.FileOperations.Write. +func (FileNoopWrite) Write(_ context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return src.NumBytes(), nil +} + +// FileNoRead implements fs.FileOperations.Read to return EINVAL. +type FileNoRead struct{} + +// Read implements fs.FileOperations.Read. +func (FileNoRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// FileNoWrite implements fs.FileOperations.Write to return EINVAL. +type FileNoWrite struct{} + +// Write implements fs.FileOperations.Write. +func (FileNoWrite) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// FileNoopRead implement fs.FileOperations.Read as a noop. +type FileNoopRead struct{} + +// Read implements fs.FileOperations.Read. +func (FileNoopRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, nil } diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go index 3d7f3732d..319c4841b 100644 --- a/pkg/sentry/fs/fsutil/fsutil.go +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -20,7 +20,5 @@ // - For fs.Inodes that require a page cache to be memory mapped, see // inode_cache.go. // -// - For fs.Files that implement fs.HandleOps, see handle.go. -// // - For anon fs.Inodes, see anon.go. package fsutil diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go deleted file mode 100644 index 8920b72ee..000000000 --- a/pkg/sentry/fs/fsutil/handle.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
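FileStaticContentReader.Read bounds-checks the offset (negative is EINVAL, at or past the end yields no data) before copying out a slice of the immutable content. The same checks as a standalone io.ReaderAt sketch; it reports EOF where the sentry version returns a plain zero-byte read, and the names are illustrative:

package main

import (
	"errors"
	"fmt"
	"io"
)

var errInvalid = errors.New("invalid offset") // EINVAL stand-in

// staticReader serves reads from an immutable byte slice.
type staticReader struct{ content []byte }

// ReadAt mirrors the offset checks in FileStaticContentReader.Read.
func (r staticReader) ReadAt(p []byte, off int64) (int, error) {
	if off < 0 {
		return 0, errInvalid
	}
	if off >= int64(len(r.content)) {
		return 0, io.EOF
	}
	return copy(p, r.content[off:]), nil
}

func main() {
	r := staticReader{content: []byte("static file contents\n")}
	buf := make([]byte, 6)
	n, err := r.ReadAt(buf, 7)
	fmt.Printf("%q %v %d\n", buf[:n], err, n) // "file c" <nil> 6
}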
- -package fsutil - -import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// Handle implements FileOperations. -// -// FIXME: Remove Handle entirely in favor of individual fs.File -// implementations using simple generic utilities. -// -// +stateify savable -type Handle struct { - NoopRelease `state:"nosave"` - NoIoctl `state:"nosave"` - HandleOperations fs.HandleOperations - - // dirCursor is the directory cursor. - dirCursor string -} - -// NewHandle returns a File backed by the Dirent and FileFlags. -func NewHandle(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, hops fs.HandleOperations) *fs.File { - if !fs.IsPipe(dirent.Inode.StableAttr) && !fs.IsSocket(dirent.Inode.StableAttr) { - // Allow reading/writing at an arbitrary offset for non-pipes - // and non-sockets. - flags.Pread = true - flags.Pwrite = true - } - - return fs.NewFile(ctx, dirent, flags, &Handle{HandleOperations: hops}) -} - -// Readiness implements waiter.Waitable.Readiness. -func (h *Handle) Readiness(mask waiter.EventMask) waiter.EventMask { - return h.HandleOperations.Readiness(mask) -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (h *Handle) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - h.HandleOperations.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (h *Handle) EventUnregister(e *waiter.Entry) { - h.HandleOperations.EventUnregister(e) -} - -// Readdir implements FileOperations.Readdir. -func (h *Handle) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { - root := fs.RootFromContext(ctx) - defer root.DecRef() - dirCtx := &fs.DirCtx{ - Serializer: serializer, - DirCursor: &h.dirCursor, - } - n, err := fs.DirentReaddir(ctx, file.Dirent, h, root, dirCtx, file.Offset()) - return n, err -} - -// Seek implements FileOperations.Seek. -func (h *Handle) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { - return SeekWithDirCursor(ctx, file, whence, offset, &h.dirCursor) -} - -// IterateDir implements DirIterator.IterateDir. -func (h *Handle) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - return h.HandleOperations.DeprecatedReaddir(ctx, dirCtx, offset) -} - -// Read implements FileOperations.Read. -func (h *Handle) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - return h.HandleOperations.DeprecatedPreadv(ctx, dst, offset) -} - -// Write implements FileOperations.Write. -func (h *Handle) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - return h.HandleOperations.DeprecatedPwritev(ctx, src, offset) -} - -// Fsync implements FileOperations.Fsync. -func (h *Handle) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { - switch syncType { - case fs.SyncAll, fs.SyncData: - // Write out metadata. - if err := file.Dirent.Inode.WriteOut(ctx); err != nil { - return err - } - fallthrough - case fs.SyncBackingStorage: - // Use DeprecatedFsync to sync disks. - return h.HandleOperations.DeprecatedFsync() - } - panic("invalid sync type") -} - -// Flush implements FileOperations.Flush. 
-func (h *Handle) Flush(context.Context, *fs.File) error { - return h.HandleOperations.DeprecatedFlush() -} - -// ConfigureMMap implements FileOperations.ConfigureMMap. -func (h *Handle) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - mappable := file.Dirent.Inode.Mappable() - if mappable == nil { - return syserror.ENODEV - } - return GenericConfigureMMap(file, mappable, opts) -} diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go deleted file mode 100644 index 43e1a3bdf..000000000 --- a/pkg/sentry/fs/fsutil/handle_test.go +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fsutil_test - -import ( - "io" - "syscall" - "testing" - - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -type testInodeOperations struct { - fs.InodeOperations - fs.InodeType - FileSize int64 - writes uint - reads uint -} - -func (t *testInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - return fs.UnstableAttr{Size: t.FileSize}, nil -} - -// Check implements InodeOperations.Check. -func (t *testInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -func (t *testInodeOperations) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - t.reads++ - return t.InodeOperations.DeprecatedPreadv(ctx, dst, offset) -} - -func (t *testInodeOperations) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - t.writes++ - return t.InodeOperations.DeprecatedPwritev(ctx, src, offset) -} - -// testHandle returns a handle for a test node. -// -// The size of the node is fixed at 20 bytes. -func testHandle(t *testing.T, flags fs.FileFlags, nt fs.InodeType) (*fs.File, *testInodeOperations) { - ctx := contexttest.Context(t) - m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) - n := &testInodeOperations{ - InodeOperations: ramfstest.NewFile(ctx, fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}), - FileSize: 20, - } - d := fs.NewDirent(fs.NewInode(n, m, fs.StableAttr{Type: nt}), "test") - return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), n -} - -func TestHandleOps(t *testing.T) { - h, n := testHandle(t, fs.FileFlags{Read: true, Write: true}, fs.RegularFile) - defer h.DecRef() - - // Make sure a write request works. 
- if n, err := h.Writev(contexttest.Context(t), usermem.BytesIOSequence([]byte("a"))); n != 1 || err != nil { - t.Fatalf("Writev: got (%d, %v), wanted (1, nil)", n, err) - } - if n.writes != 1 { - t.Errorf("found %d writes, expected 1", n.writes) - } - - // Make sure a read request works. - dst := make([]byte, 1) - if n, err := h.Preadv(contexttest.Context(t), usermem.BytesIOSequence(dst), 0); n != 1 || (err != nil && err != io.EOF) { - t.Errorf("Preadv: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if dst[0] != 'a' { - t.Errorf("Preadv: read %q, wanted 'a'", dst[0]) - } - if n.reads != 1 { - t.Errorf("found %d reads, expected 1", n.reads) - } -} - -type seekTest struct { - whence fs.SeekWhence - offset int64 - result int64 - err error -} - -type seekSuite struct { - nodeType fs.InodeType - cases []seekTest -} - -// FIXME: This is currently missing fs.SeekEnd tests due to the -// fact that NullInodeOperations returns an error on stat. -func TestHandleSeek(t *testing.T) { - ts := []seekSuite{ - { - nodeType: fs.RegularFile, - cases: []seekTest{ - {fs.SeekSet, 0, 0, nil}, - {fs.SeekSet, 10, 10, nil}, - {fs.SeekSet, -5, 10, syscall.EINVAL}, - {fs.SeekCurrent, -1, 9, nil}, - {fs.SeekCurrent, 2, 11, nil}, - {fs.SeekCurrent, -12, 11, syscall.EINVAL}, - {fs.SeekEnd, -1, 19, nil}, - {fs.SeekEnd, 0, 20, nil}, - {fs.SeekEnd, 2, 22, nil}, - }, - }, - { - nodeType: fs.Directory, - cases: []seekTest{ - {fs.SeekSet, 0, 0, nil}, - {fs.SeekSet, 10, 0, syscall.EINVAL}, - {fs.SeekSet, -5, 0, syscall.EINVAL}, - {fs.SeekCurrent, 0, 0, nil}, - {fs.SeekCurrent, 11, 0, syscall.EINVAL}, - {fs.SeekCurrent, -6, 0, syscall.EINVAL}, - {fs.SeekEnd, 0, 0, syscall.EINVAL}, - {fs.SeekEnd, -1, 0, syscall.EINVAL}, - {fs.SeekEnd, 2, 0, syscall.EINVAL}, - }, - }, - { - nodeType: fs.Symlink, - cases: []seekTest{ - {fs.SeekSet, 5, 0, syscall.EINVAL}, - {fs.SeekSet, -5, 0, syscall.EINVAL}, - {fs.SeekSet, 0, 0, syscall.EINVAL}, - {fs.SeekCurrent, 5, 0, syscall.EINVAL}, - {fs.SeekCurrent, -5, 0, syscall.EINVAL}, - {fs.SeekCurrent, 0, 0, syscall.EINVAL}, - {fs.SeekEnd, 5, 0, syscall.EINVAL}, - {fs.SeekEnd, -5, 0, syscall.EINVAL}, - {fs.SeekEnd, 0, 0, syscall.EINVAL}, - }, - }, - { - nodeType: fs.Pipe, - cases: []seekTest{ - {fs.SeekSet, 5, 0, syscall.ESPIPE}, - {fs.SeekSet, -5, 0, syscall.ESPIPE}, - {fs.SeekSet, 0, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, - {fs.SeekEnd, 5, 0, syscall.ESPIPE}, - {fs.SeekEnd, -5, 0, syscall.ESPIPE}, - {fs.SeekEnd, 0, 0, syscall.ESPIPE}, - }, - }, - { - nodeType: fs.Socket, - cases: []seekTest{ - {fs.SeekSet, 5, 0, syscall.ESPIPE}, - {fs.SeekSet, -5, 0, syscall.ESPIPE}, - {fs.SeekSet, 0, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, - {fs.SeekEnd, 5, 0, syscall.ESPIPE}, - {fs.SeekEnd, -5, 0, syscall.ESPIPE}, - {fs.SeekEnd, 0, 0, syscall.ESPIPE}, - }, - }, - { - nodeType: fs.CharacterDevice, - cases: []seekTest{ - {fs.SeekSet, 5, 0, nil}, - {fs.SeekSet, -5, 0, nil}, - {fs.SeekSet, 0, 0, nil}, - {fs.SeekCurrent, 5, 0, nil}, - {fs.SeekCurrent, -5, 0, nil}, - {fs.SeekCurrent, 0, 0, nil}, - {fs.SeekEnd, 5, 0, nil}, - {fs.SeekEnd, -5, 0, nil}, - {fs.SeekEnd, 0, 0, nil}, - }, - }, - { - nodeType: fs.BlockDevice, - cases: []seekTest{ - {fs.SeekSet, 0, 0, nil}, - {fs.SeekSet, 10, 10, nil}, - {fs.SeekSet, -5, 10, syscall.EINVAL}, - {fs.SeekCurrent, -1, 9, nil}, - {fs.SeekCurrent, 2, 11, nil}, - 
{fs.SeekCurrent, -12, 11, syscall.EINVAL}, - {fs.SeekEnd, -1, 19, nil}, - {fs.SeekEnd, 0, 20, nil}, - {fs.SeekEnd, 2, 22, nil}, - }, - }, - } - - for _, s := range ts { - h, _ := testHandle(t, fs.FileFlags{Read: true, Write: true}, s.nodeType) - defer h.DecRef() - - for _, c := range s.cases { - // Try the given seek. - offset, err := h.Seek(contexttest.Context(t), c.whence, c.offset) - if err != c.err { - t.Errorf("seek(%s, %d) on %s had unexpected error: expected %v, got %v", c.whence, c.offset, s.nodeType, c.err, err) - } - if err == nil && offset != c.result { - t.Errorf("seek(%s, %d) on %s had bad result: expected %v, got %v", c.whence, c.offset, s.nodeType, c.result, offset) - } - } - } -} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index d4db1c2de..f1f5ec1de 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -15,213 +15,270 @@ package fsutil import ( + "sync" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// NewSimpleInodeOperations constructs fs.InodeOperations from InodeSimpleAttributes. -func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { - return &simpleInodeOperations{InodeSimpleAttributes: i} +// SimpleFileInode is a simple implementation of InodeOperations. +// +// +stateify savable +type SimpleFileInode struct { + InodeGenericChecker `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + InodeNoopRelease `state:"nosave"` + InodeNoopWriteOut `state:"nosave"` + InodeNotDirectory `state:"nosave"` + InodeNotMappable `state:"nosave"` + InodeNotOpenable `state:"nosave"` + InodeNotSocket `state:"nosave"` + InodeNotSymlink `state:"nosave"` + InodeNotTruncatable `state:"nosave"` + InodeNotVirtual `state:"nosave"` + + InodeSimpleAttributes +} + +// NewSimpleFileInode returns a new SimpleFileInode. +func NewSimpleFileInode(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) *SimpleFileInode { + return &SimpleFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, owner, perms, typ), + } } -// simpleInodeOperations is a simple implementation of Inode. +// NoReadWriteFileInode is an implementation of InodeOperations that supports +// opening files that are not readable or writeable. // // +stateify savable -type simpleInodeOperations struct { - DeprecatedFileOperations `state:"nosave"` +type NoReadWriteFileInode struct { + InodeGenericChecker `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + InodeNoopRelease `state:"nosave"` + InodeNoopWriteOut `state:"nosave"` InodeNotDirectory `state:"nosave"` + InodeNotMappable `state:"nosave"` InodeNotSocket `state:"nosave"` - InodeNotRenameable `state:"nosave"` - InodeNotOpenable `state:"nosave"` - InodeNotVirtual `state:"nosave"` InodeNotSymlink `state:"nosave"` - InodeNoExtendedAttributes `state:"nosave"` - NoMappable `state:"nosave"` - NoopWriteOut `state:"nosave"` + InodeNotTruncatable `state:"nosave"` + InodeNotVirtual `state:"nosave"` InodeSimpleAttributes } -// InodeSimpleAttributes implements a subset of the Inode interface. It provides -// read-only access to attributes. 
+// NewNoReadWriteFileInode returns a new NoReadWriteFileInode. +func NewNoReadWriteFileInode(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) *NoReadWriteFileInode { + return &NoReadWriteFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, owner, perms, typ), + } +} + +// GetFile implements fs.InodeOperations.GetFile. +func (*NoReadWriteFileInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &NoReadWriteFile{}), nil +} + +// InodeSimpleAttributes implements methods for updating in-memory unstable +// attributes. // // +stateify savable type InodeSimpleAttributes struct { - // FSType is the filesystem type reported by StatFS. + // FSType is the immutable filesystem type that will be returned by + // StatFS. FSType uint64 - // UAttr are the unstable attributes of the Inode. - UAttr fs.UnstableAttr + // mu protects unstable. + mu sync.RWMutex `state:"nosave"` + Unstable fs.UnstableAttr } -// Release implements fs.InodeOperations.Release. -func (i *InodeSimpleAttributes) Release(context.Context) {} - -// StatFS implements fs.InodeOperations.StatFS. -func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { - return fs.Info{Type: i.FSType}, nil +// NewInodeSimpleAttributes returns a new InodeSimpleAttributes. +func NewInodeSimpleAttributes(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) InodeSimpleAttributes { + return InodeSimpleAttributes{ + FSType: typ, + Unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: perms, + }), + } } // UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (i *InodeSimpleAttributes) UnstableAttr(context.Context, *fs.Inode) (fs.UnstableAttr, error) { - return i.UAttr, nil +func (i *InodeSimpleAttributes) UnstableAttr(ctx context.Context, _ *fs.Inode) (fs.UnstableAttr, error) { + i.mu.RLock() + u := i.Unstable + i.mu.RUnlock() + return u, nil } -// Check implements fs.InodeOperations.Check. -func (i *InodeSimpleAttributes) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *InodeSimpleAttributes) SetPermissions(ctx context.Context, _ *fs.Inode, p fs.FilePermissions) bool { + i.mu.Lock() + i.Unstable.SetPermissions(ctx, p) + i.mu.Unlock() + return true +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *InodeSimpleAttributes) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error { + i.mu.Lock() + i.Unstable.SetOwner(ctx, owner) + i.mu.Unlock() + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *InodeSimpleAttributes) SetTimestamps(ctx context.Context, _ *fs.Inode, ts fs.TimeSpec) error { + i.mu.Lock() + i.Unstable.SetTimestamps(ctx, ts) + i.mu.Unlock() + return nil } // AddLink implements fs.InodeOperations.AddLink. -func (*InodeSimpleAttributes) AddLink() {} +func (i *InodeSimpleAttributes) AddLink() { + i.mu.Lock() + i.Unstable.Links++ + i.mu.Unlock() +} // DropLink implements fs.InodeOperations.DropLink. -func (*InodeSimpleAttributes) DropLink() {} - -// NotifyStatusChange implements fs.fs.InodeOperations. 
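InodeSimpleAttributes now guards its unstable attributes with an RWMutex: UnstableAttr snapshots under the read lock, while SetPermissions, SetOwner, AddLink and the rest mutate under the write lock. A generic sketch of that locking shape with simplified fields and non-gVisor names:

package main

import (
	"fmt"
	"sync"
)

// attrs is a minimal stand-in for fs.UnstableAttr.
type attrs struct {
	Perms uint32
	Links uint64
}

// simpleAttributes guards its attrs the way InodeSimpleAttributes guards
// Unstable: RLock for readers, Lock for mutators.
type simpleAttributes struct {
	mu       sync.RWMutex
	unstable attrs
}

// Attr returns a consistent snapshot under the read lock.
func (s *simpleAttributes) Attr() attrs {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return s.unstable
}

// SetPerms updates the permission bits under the write lock.
func (s *simpleAttributes) SetPerms(p uint32) {
	s.mu.Lock()
	s.unstable.Perms = p
	s.mu.Unlock()
}

// AddLink bumps the link count, mirroring InodeSimpleAttributes.AddLink.
func (s *simpleAttributes) AddLink() {
	s.mu.Lock()
	s.unstable.Links++
	s.mu.Unlock()
}

func main() {
	var s simpleAttributes
	s.SetPerms(0o644)
	s.AddLink()
	fmt.Printf("%+v\n", s.Attr()) // {Perms:420 Links:1}
}

The short, defer-free critical sections in the mutators mirror the shape of the new methods above.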
-func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { - i.UAttr.StatusChangeTime = ktime.NowFromContext(ctx) +func (i *InodeSimpleAttributes) DropLink() { + i.mu.Lock() + i.Unstable.Links-- + i.mu.Unlock() } -// SetPermissions implements fs.InodeOperations.SetPermissions. -func (*InodeSimpleAttributes) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { - return false +// StatFS implements fs.InodeOperations.StatFS. +func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { + if i.FSType == 0 { + return fs.Info{}, syserror.ENOSYS + } + return fs.Info{Type: i.FSType}, nil } -// SetOwner implements fs.InodeOperations.SetOwner. -func (*InodeSimpleAttributes) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { - return syserror.EINVAL +// NotifyAccess updates the access time. +func (i *InodeSimpleAttributes) NotifyAccess(ctx context.Context) { + i.mu.Lock() + i.Unstable.AccessTime = ktime.NowFromContext(ctx) + i.mu.Unlock() } -// SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (*InodeSimpleAttributes) SetTimestamps(context.Context, *fs.Inode, fs.TimeSpec) error { - return syserror.EINVAL +// NotifyModification updates the modification time. +func (i *InodeSimpleAttributes) NotifyModification(ctx context.Context) { + i.mu.Lock() + i.Unstable.ModificationTime = ktime.NowFromContext(ctx) + i.mu.Unlock() } -// Truncate implements fs.InodeOperations.Truncate. -func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error { - return syserror.EINVAL +// NotifyStatusChange updates the status change time. +func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { + i.mu.Lock() + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) + i.mu.Unlock() } -// InMemoryAttributes implements utilities for updating in-memory unstable -// attributes and extended attributes. It is not thread-safe. -// -// Users need not initialize Xattrs to non-nil (it will be initialized -// when the first extended attribute is set. +// InodeSimpleExtendedAttributes implements +// fs.InodeOperations.{Get,Set,List}xattr. // // +stateify savable -type InMemoryAttributes struct { - Unstable fs.UnstableAttr - Xattrs map[string][]byte +type InodeSimpleExtendedAttributes struct { + // mu protects xattrs. + mu sync.RWMutex `state:"nosave"` + xattrs map[string][]byte } -// SetPermissions updates the permissions to p. -func (i *InMemoryAttributes) SetPermissions(ctx context.Context, p fs.FilePermissions) bool { - i.Unstable.Perms = p - i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) - return true +// Getxattr implements fs.InodeOperations.Getxattr. +func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) ([]byte, error) { + i.mu.RLock() + value, ok := i.xattrs[name] + i.mu.RUnlock() + if !ok { + return nil, syserror.ENOATTR + } + return value, nil } -// SetOwner updates the file owner to owner. -func (i *InMemoryAttributes) SetOwner(ctx context.Context, owner fs.FileOwner) error { - if owner.UID.Ok() { - i.Unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - i.Unstable.Owner.GID = owner.GID +// Setxattr implements fs.InodeOperations.Setxattr. +func (i *InodeSimpleExtendedAttributes) Setxattr(_ *fs.Inode, name string, value []byte) error { + i.mu.Lock() + if i.xattrs == nil { + i.xattrs = make(map[string][]byte) } + i.xattrs[name] = value + i.mu.Unlock() return nil } -// SetTimestamps sets the timestamps to ts. 
-func (i *InMemoryAttributes) SetTimestamps(ctx context.Context, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - now := ktime.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - i.Unstable.AccessTime = now - } else { - i.Unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - i.Unstable.ModificationTime = now - } else { - i.Unstable.ModificationTime = ts.MTime - } +// Listxattr implements fs.InodeOperations.Listxattr. +func (i *InodeSimpleExtendedAttributes) Listxattr(_ *fs.Inode) (map[string]struct{}, error) { + i.mu.RLock() + names := make(map[string]struct{}, len(i.xattrs)) + for name := range i.xattrs { + names[name] = struct{}{} } - i.Unstable.StatusChangeTime = now - return nil + i.mu.RUnlock() + return names, nil } -// TouchAccessTime updates access time to the current time. -func (i *InMemoryAttributes) TouchAccessTime(ctx context.Context) { - i.Unstable.AccessTime = ktime.NowFromContext(ctx) -} +// staticFile is a file with static contents. It is returned by +// InodeStaticFileGetter.GetFile. +// +// +stateify savable +type staticFile struct { + waiter.AlwaysReady `state:"nosave"` + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoopWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` -// TouchModificationTime updates modification and status change -// time to the current time. -func (i *InMemoryAttributes) TouchModificationTime(ctx context.Context) { - now := ktime.NowFromContext(ctx) - i.Unstable.ModificationTime = now - i.Unstable.StatusChangeTime = now + FileStaticContentReader } -// TouchStatusChangeTime updates status change time to the current time. -func (i *InMemoryAttributes) TouchStatusChangeTime(ctx context.Context) { - i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) -} +// InodeNoStatFS implement StatFS by retuning ENOSYS. +type InodeNoStatFS struct{} -// Getxattr returns the extended attribute at name or ENOATTR if -// it isn't set. -func (i *InMemoryAttributes) Getxattr(name string) ([]byte, error) { - if value, ok := i.Xattrs[name]; ok { - return value, nil - } - return nil, syserror.ENOATTR +// StatFS implements fs.InodeOperations.StatFS. +func (InodeNoStatFS) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS } -// Setxattr sets the extended attribute at name to value. -func (i *InMemoryAttributes) Setxattr(name string, value []byte) error { - if i.Xattrs == nil { - i.Xattrs = make(map[string][]byte) - } - i.Xattrs[name] = value - return nil +// InodeStaticFileGetter implements GetFile for a file with static contents. +// +// +stateify savable +type InodeStaticFileGetter struct { + Contents []byte } -// Listxattr returns the set of all currently set extended attributes. -func (i *InMemoryAttributes) Listxattr() (map[string]struct{}, error) { - names := make(map[string]struct{}, len(i.Xattrs)) - for name := range i.Xattrs { - names[name] = struct{}{} - } - return names, nil +// GetFile implements fs.InodeOperations.GetFile. +func (i *InodeStaticFileGetter) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &staticFile{ + FileStaticContentReader: NewFileStaticContentReader(i.Contents), + }), nil } -// NoMappable returns a nil memmap.Mappable. 
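InodeSimpleExtendedAttributes keeps its xattr map nil until the first Setxattr, serializes access with an RWMutex, and has Getxattr report missing names with ENOATTR. A self-contained sketch of the same shape using a plain map and a stand-in error:

package main

import (
	"errors"
	"fmt"
	"sync"
)

var errNoAttr = errors.New("no such attribute") // ENOATTR stand-in

// xattrs lazily allocates its map on first set, like
// InodeSimpleExtendedAttributes.
type xattrs struct {
	mu sync.RWMutex
	m  map[string][]byte
}

// Get looks a name up under the read lock.
func (x *xattrs) Get(name string) ([]byte, error) {
	x.mu.RLock()
	defer x.mu.RUnlock()
	v, ok := x.m[name]
	if !ok {
		return nil, errNoAttr
	}
	return v, nil
}

// Set stores a value, allocating the map on first use.
func (x *xattrs) Set(name string, value []byte) {
	x.mu.Lock()
	defer x.mu.Unlock()
	if x.m == nil {
		x.m = make(map[string][]byte)
	}
	x.m[name] = value
}

// List returns the set of attribute names.
func (x *xattrs) List() map[string]struct{} {
	x.mu.RLock()
	defer x.mu.RUnlock()
	names := make(map[string]struct{}, len(x.m))
	for name := range x.m {
		names[name] = struct{}{}
	}
	return names
}

func main() {
	var x xattrs
	if _, err := x.Get("user.test"); err != nil {
		fmt.Println(err) // no such attribute
	}
	x.Set("user.test", []byte("value"))
	v, _ := x.Get("user.test")
	fmt.Printf("%s %v\n", v, x.List()) // value map[user.test:{}]
}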
-type NoMappable struct{} +// InodeNotMappable returns a nil memmap.Mappable. +type InodeNotMappable struct{} // Mappable implements fs.InodeOperations.Mappable. -func (NoMappable) Mappable(*fs.Inode) memmap.Mappable { +func (InodeNotMappable) Mappable(*fs.Inode) memmap.Mappable { return nil } -// NoopWriteOut is a no-op implementation of Inode.WriteOut. -type NoopWriteOut struct{} +// InodeNoopWriteOut is a no-op implementation of fs.InodeOperations.WriteOut. +type InodeNoopWriteOut struct{} // WriteOut is a no-op. -func (NoopWriteOut) WriteOut(context.Context, *fs.Inode) error { +func (InodeNoopWriteOut) WriteOut(context.Context, *fs.Inode) error { return nil } @@ -273,6 +330,11 @@ func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) err return syserror.ENOTDIR } +// Rename implements fs.InodeOperations.Rename. +func (InodeNotDirectory) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error { + return syserror.EINVAL +} + // InodeNotSocket can be used by Inodes that are not sockets. type InodeNotSocket struct{} @@ -281,7 +343,31 @@ func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { return nil } -// InodeNotRenameable can be used by Inodes that cannot be renamed. +// InodeNotTruncatable can be used by Inodes that cannot be truncated. +type InodeNotTruncatable struct{} + +// Truncate implements fs.InodeOperations.Truncate. +func (InodeNotTruncatable) Truncate(context.Context, *fs.Inode, int64) error { + return syserror.EINVAL +} + +// InodeIsDirTruncate implements fs.InodeOperations.Truncate for directories. +type InodeIsDirTruncate struct{} + +// Truncate implements fs.InodeOperations.Truncate. +func (InodeIsDirTruncate) Truncate(context.Context, *fs.Inode, int64) error { + return syserror.EISDIR +} + +// InodeNoopTruncate implements fs.InodeOperations.Truncate as a noop. +type InodeNoopTruncate struct{} + +// Truncate implements fs.InodeOperations.Truncate. +func (InodeNoopTruncate) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// InodeNotRenameable can be used by Inodes that cannot be renamed. type InodeNotRenameable struct{} // Rename implements fs.InodeOperations.Rename. @@ -305,6 +391,14 @@ func (InodeNotVirtual) IsVirtual() bool { return false } +// InodeVirtual can be used by Inodes that are virtual. +type InodeVirtual struct{} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (InodeVirtual) IsVirtual() bool { + return true +} + // InodeNotSymlink can be used by Inodes that are not symlinks. type InodeNotSymlink struct{} @@ -337,50 +431,17 @@ func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, erro return nil, syserror.EOPNOTSUPP } -// DeprecatedFileOperations panics if any deprecated Inode method is called. -type DeprecatedFileOperations struct{} +// InodeNoopRelease implements fs.InodeOperations.Release as a noop. +type InodeNoopRelease struct{} -// Readiness implements fs.InodeOperations.Waitable.Readiness. -func (DeprecatedFileOperations) Readiness(waiter.EventMask) waiter.EventMask { - panic("not implemented") } - -// EventRegister implements fs.InodeOperations.Waitable.EventRegister. -func (DeprecatedFileOperations) EventRegister(*waiter.Entry, waiter.EventMask) { - panic("not implemented") } - -// EventUnregister implements fs.InodeOperations.Waitable.EventUnregister. -func (DeprecatedFileOperations) EventUnregister(*waiter.Entry) { - panic("not implemented") } - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv.
-func (DeprecatedFileOperations) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { - panic("not implemented") -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (DeprecatedFileOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - panic("not implemented") -} - -// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. -func (DeprecatedFileOperations) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { - panic("not implemented") -} - -// DeprecatedFsync implements fs.InodeOperations.DeprecatedFsync. -func (DeprecatedFileOperations) DeprecatedFsync() error { - panic("not implemented") -} +// Release implements fs.InodeOperations.Release. +func (InodeNoopRelease) Release(context.Context) {} -// DeprecatedFlush implements fs.InodeOperations.DeprecatedFlush. -func (DeprecatedFileOperations) DeprecatedFlush() error { - panic("not implemented") -} +// InodeGenericChecker implements fs.InodeOperations.Check with a generic +// implementation. +type InodeGenericChecker struct{} -// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. -func (DeprecatedFileOperations) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { - panic("not implemented") +// Check implements fs.InodeOperations.Check. +func (InodeGenericChecker) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) } diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index ce5201a40..9c9391511 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -261,15 +261,11 @@ func (noopMappingSpace) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateO } func anonInode(ctx context.Context) *fs.Inode { - return fs.NewInode(NewSimpleInodeOperations(InodeSimpleAttributes{ - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + return fs.NewInode(&SimpleFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, 0), + }, fs.NewPseudoMountSource(), fs.StableAttr{ Type: fs.Anonymous, BlockSize: usermem.PageSize, }) diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 6d961813d..3578b07a0 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -37,7 +37,7 @@ var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_fil // // +stateify savable type fileOperations struct { - fsutil.NoIoctl `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` waiter.AlwaysReady `state:"nosave"` // inodeOperations is the inodeOperations backing the file. It is protected diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index ed30cb1f1..2dc000c6f 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -75,11 +75,11 @@ var ( // ErrNoTransport is returned when there is no 'trans' option. ErrNoTransport = errors.New("missing required option: 'trans='") - // ErrNoReadFD is returned when there is no 'rfdno' option. - ErrNoReadFD = errors.New("missing required option: 'rfdno='") + // ErrFileNoReadFD is returned when there is no 'rfdno' option. 
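The anonInode rewrite in inode_cached_test.go above shows the intended composition style for these helpers. The following is a sketch of a complete, static, read-only InodeOperations assembled purely from the new fsutil embeddings and mounted on a pseudo mount source; staticInodeOps and newStaticInode are hypothetical names, and the exact set of embeddings is an assumption modeled on the test inodes elsewhere in this change:

package sketch

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// staticInodeOps composes the fsutil helpers into a read-only,
// non-directory InodeOperations with static contents.
type staticInodeOps struct {
	fsutil.InodeGenericChecker       `state:"nosave"`
	fsutil.InodeNoExtendedAttributes `state:"nosave"`
	fsutil.InodeNoopRelease          `state:"nosave"`
	fsutil.InodeNoopWriteOut         `state:"nosave"`
	fsutil.InodeNotDirectory         `state:"nosave"`
	fsutil.InodeNotMappable          `state:"nosave"`
	fsutil.InodeNotSocket            `state:"nosave"`
	fsutil.InodeNotSymlink           `state:"nosave"`
	fsutil.InodeNotTruncatable       `state:"nosave"`
	fsutil.InodeNotVirtual           `state:"nosave"`

	fsutil.InodeSimpleAttributes
	fsutil.InodeStaticFileGetter
}

// newStaticInode builds an Inode around staticInodeOps on a pseudo mount
// source, mirroring the anonInode helper in inode_cached_test.go.
func newStaticInode(ctx context.Context, contents []byte) *fs.Inode {
	iops := &staticInodeOps{
		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(
			ctx, fs.RootOwner, fs.FilePermsFromMode(0444), 0 /* no FSType */),
		InodeStaticFileGetter: fsutil.InodeStaticFileGetter{
			Contents: contents,
		},
	}
	return fs.NewInode(iops, fs.NewPseudoMountSource(), fs.StableAttr{
		Type:      fs.SpecialFile,
		BlockSize: usermem.PageSize,
	})
}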
+ ErrFileNoReadFD = errors.New("missing required option: 'rfdno='") - // ErrNoWriteFD is returned when there is no 'wfdno' option. - ErrNoWriteFD = errors.New("missing required option: 'wfdno='") + // ErrFileNoWriteFD is returned when there is no 'wfdno' option. + ErrFileNoWriteFD = errors.New("missing required option: 'wfdno='") ) // filesystem is a 9p client. @@ -87,6 +87,8 @@ var ( // +stateify savable type filesystem struct{} +var _ fs.Filesystem = (*filesystem)(nil) + func init() { fs.RegisterFilesystem(&filesystem{}) } @@ -160,14 +162,14 @@ func options(data string) (opts, error) { // Check for the required 'rfdno=' option. srfd, ok := options[readFDKey] if !ok { - return o, ErrNoReadFD + return o, ErrFileNoReadFD } delete(options, readFDKey) // Check for the required 'wfdno=' option. swfd, ok := options[writeFDKey] if !ok { - return o, ErrNoWriteFD + return o, ErrFileNoWriteFD } delete(options, writeFDKey) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 7c6e5b025..f0dc99fd0 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -40,7 +40,6 @@ import ( type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` // fileState implements fs.CachedFileObject. It exists // to break a circular load dependency between inodeOperations diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index bc6ee7aa4..4e84d1d6c 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -36,8 +36,8 @@ import ( // // +stateify savable type fileOperations struct { - fsutil.NoIoctl `state:"nosave"` - fsutil.NoopRelease `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` // iops are the Inode operations for this file. iops *inodeOperations `state:"wait"` diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 54cbb94f9..d2ba38449 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -58,6 +58,8 @@ type Filesystem struct { paths []string } +var _ fs.Filesystem = (*Filesystem)(nil) + // Name is the identifier of this file system. func (*Filesystem) Name() string { return FilesystemName diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 08754bd6b..6ff6c3254 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -39,7 +39,6 @@ import ( type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` // fileState implements fs.CachedFileObject. It exists // to break a circular load dependency between inodeOperations diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index fa3beb111..d32f52d55 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -356,11 +356,10 @@ func (i *Inode) AddLink() { if i.overlay != nil { // FIXME: Remove this from InodeOperations altogether. // - // This interface (including DropLink and NotifyStatusChange) - // is only used by ramfs to update metadata of children. These - // filesystems should _never_ have overlay Inodes cached as - // children. So explicitly disallow this scenario and avoid plumbing - // Dirents through to do copy up. + // This interface is only used by ramfs to update metadata of + // children. These filesystems should _never_ have overlay + // Inodes cached as children. 
So explicitly disallow this + // scenario and avoid plumbing Dirents through to do copy up. panic("overlay Inodes cached in ramfs directories are not supported") } i.InodeOperations.AddLink() @@ -375,15 +374,6 @@ func (i *Inode) DropLink() { i.InodeOperations.DropLink() } -// NotifyStatusChange calls i.InodeOperations.NotifyStatusChange. -func (i *Inode) NotifyStatusChange(ctx context.Context) { - if i.overlay != nil { - // Same as AddLink. - panic("overlay Inodes cached in ramfs directories are not supported") - } - i.InodeOperations.NotifyStatusChange(ctx) -} - // IsVirtual calls i.InodeOperations.IsVirtual. func (i *Inode) IsVirtual() bool { if i.overlay != nil { @@ -401,17 +391,6 @@ func (i *Inode) StatFS(ctx context.Context) (Info, error) { return i.InodeOperations.StatFS(ctx) } -// HandleOps extracts HandleOperations from i. -func (i *Inode) HandleOps() HandleOperations { - if i.overlay != nil { - return overlayHandleOps(i.overlay) - } - if h, ok := i.InodeOperations.(HandleOperations); ok { - return h - } - return nil -} - // CheckOwnership checks whether `ctx` owns this Inode or may act as its owner. // Compare Linux's fs/inode.c:inode_owner_or_capable(). func (i *Inode) CheckOwnership(ctx context.Context) bool { diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 77973ce79..db40b5256 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -21,8 +21,6 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/waiter" ) var ( @@ -303,83 +301,5 @@ type InodeOperations interface { // StatFS returns a filesystem Info implementation or an error. If // the filesystem does not support this operation (maybe in the future // it will), then ENOSYS should be returned. - // - // Move to MountSourceOperations. StatFS(context.Context) (Info, error) - - HandleOperations -} - -// HandleOperations are extended InodeOperations that are only implemented -// for file systems that use fs/handle.go:Handle to generate open Files. -// -// Handle is deprecated; these methods are deprecated as well. -// -// Filesystems are encouraged to implement the File interface directly -// instead of using Handle. To indicate that the below methods should never -// be called, embed DeprecatedFileOperations to satisfy this interface. -type HandleOperations interface { - waiter.Waitable - - // DeprecatedPreadv is deprecated in favor of filesystems - // implementing File.Preadv directly. - // - // DeprecatedPreadv reads up to dst.NumBytes() bytes into dst, starting at - // the given offset, and returns the number of bytes read. - // - // Preadv may return a partial read result before EOF is reached. - // - // If a symlink, Preadv reads the target value of the symlink. - // - // Preadv should not check for readable permissions. - DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) - - // DeprecatedPwritev is deprecated in favor of filesystems - // implementing File.Pwritev directly. - // - // DeprecatedPwritev writes up to src.NumBytes() bytes from src to the - // Inode, starting at the given offset and returns the number of bytes - // written. - // - // Pwritev should not check that the Inode has writable permissions. 
- DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) - - // DeprecatedReaddir is deprecated in favor of filesystems - // implementing File.Readdir directly. - // - // DeprecatedReaddir emits directory entries by calling dirCtx.EmitDir, - // beginning with the entry at offset. - // - // Entries for "." and ".." must *not* be included. - // - // If the offset returned is the same as the argument offset, then - // nothing has been serialized. This is equivalent to reaching EOF. - // In this case serializer.Written() should return 0. - // - // The order of entries to emit must be consistent between Readdir - // calls, and must start with the given offset. - // - // The caller must ensure that this operation is permitted. - DeprecatedReaddir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) - - // DeprecatedFsync is deprecated in favor of filesystems implementing - // File.Fsync directly. - // - // DeprecatedFsync syncs a file. - DeprecatedFsync() error - - // DeprecatedMappable is deprecated in favor of filesystems implementing - // File.Mappable directly. - // - // DeprecatedMappable returns a Mappable if the Inode can be mapped. - DeprecatedMappable(ctx context.Context, inode *Inode) (memmap.Mappable, bool) - - // DeprecatedFlush is deprecated in favor of filesystems implementing - // File.Flush directly. - // - // DeprecatedFlush flushes a file. - // - // Implementations may choose to free up memory or complete pending I/O - // but also may implement Flush as a no-op. - DeprecatedFlush() error } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 512a0da28..77a2623ef 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -596,19 +596,6 @@ func overlayStatFS(ctx context.Context, o *overlayEntry) (Info, error) { return i, nil } -func overlayHandleOps(o *overlayEntry) HandleOperations { - // Hot path. Avoid defers. - var hops HandleOperations - o.copyMu.RLock() - if o.upper != nil { - hops = o.upper.HandleOps() - } else { - hops = o.lower.HandleOps() - } - o.copyMu.RUnlock() - return hops -} - // NewTestOverlayDir returns an overlay Inode for tests. // // If `revalidate` is true, then the upper filesystem will require diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 9e922d008..bc91be226 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -19,7 +19,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -376,7 +377,8 @@ type dir struct { // List of negative child names. negative []string - // Whether DeprecatedReaddir has been called on this dir. + // ReaddirCalled records whether Readdir was called on a file + // corresponding to this inode. ReaddirCalled bool } @@ -390,10 +392,19 @@ func (d *dir) Getxattr(inode *fs.Inode, name string) ([]byte, error) { return nil, syserror.ENOATTR } -// DeprecatedReaddir implements InodeOperations.DeprecatedReaddir. 
-func (d *dir) DeprecatedReaddir(ctx context.Context, dirctx *fs.DirCtx, offset int) (int, error) { - d.ReaddirCalled = true - return d.InodeOperations.DeprecatedReaddir(ctx, dirctx, offset) +// GetFile implements InodeOperations.GetFile. +func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + file, err := d.InodeOperations.GetFile(ctx, dirent, flags) + if err != nil { + return nil, err + } + defer file.DecRef() + // Wrap the file's FileOperations in a dirFile. + fops := &dirFile{ + FileOperations: file.FileOperations, + inode: d, + } + return fs.NewFile(ctx, dirent, flags, fops), nil } type dirContent struct { @@ -401,12 +412,45 @@ type dirContent struct { dir bool } +type dirFile struct { + fs.FileOperations + inode *dir +} + +type inode struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeStaticFileGetter +} + +// Readdir implements fs.FileOperations.Readdir. It sets the ReaddirCalled +// field on the inode. +func (f *dirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + f.inode.ReaddirCalled = true + return f.FileOperations.Readdir(ctx, file, ser) +} + func newTestRamfsInode(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - return fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), msrc, fs.StableAttr{Type: fs.RegularFile}) + inode := fs.NewInode(&inode{ + InodeStaticFileGetter: fsutil.InodeStaticFileGetter{ + Contents: []byte("foobar"), + }, + }, msrc, fs.StableAttr{Type: fs.RegularFile}) + return inode } func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []string) *fs.Inode { - msrc := fs.NewCachingMountSource(nil, fs.MountSourceFlags{}) + msrc := fs.NewPseudoMountSource() contents := make(map[string]*fs.Inode) for _, c := range contains { if c.dir { @@ -415,7 +459,7 @@ func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []stri contents[c.name] = newTestRamfsInode(ctx, msrc) } } - dops := ramfstest.NewDir(ctx, contents, fs.FilePermissions{ + dops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermissions{ User: fs.PermMask{Read: true, Execute: true}, }) return fs.NewInode(&dir{ diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 6bfcda6bb..abfdc6a25 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -16,7 +16,6 @@ package fs import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -151,16 +150,6 @@ func (n *MockInodeOperations) Truncate(ctx context.Context, inode *Inode, size i return nil } -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (n *MockInodeOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, nil -} - -// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. -func (n *MockInodeOperations) DeprecatedReaddir(context.Context, *DirCtx, int) (int, error) { - return 0, nil -} - // Remove implements fs.InodeOperations.Remove. 
func (n *MockInodeOperations) Remove(context.Context, *Inode, string) error { return nil diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 24e28ddb2..dd6e64b4c 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -43,8 +43,6 @@ type DirentOperations interface { // MountSourceOperations contains filesystem specific operations. type MountSourceOperations interface { // TODO: Add: - // - // StatFS() (Info, error) // BlockSize() int64 // FS() Filesystem @@ -249,7 +247,7 @@ func (msrc *MountSource) FlushDirentRefs() { } // NewCachingMountSource returns a generic mount that will cache dirents -// aggressively. Filesystem may be nil if there is no backing filesystem. +// aggressively. func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(&SimpleMountSourceOperations{ keep: true, @@ -258,7 +256,6 @@ func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *Mount } // NewNonCachingMountSource returns a generic mount that will never cache dirents. -// Filesystem may be nil if there is no backing filesystem. func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(&SimpleMountSourceOperations{ keep: false, @@ -275,6 +272,15 @@ func NewRevalidatingMountSource(filesystem Filesystem, flags MountSourceFlags) * }, filesystem, flags) } +// NewPseudoMountSource returns a "pseudo" mount source that is not backed by +// an actual filesystem. It is always non-caching. +func NewPseudoMountSource() *MountSource { + return NewMountSource(&SimpleMountSourceOperations{ + keep: false, + revalidate: false, + }, nil, MountSourceFlags{}) +} + // SimpleMountSourceOperations implements MountSourceOperations. // // +stateify savable diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index 7d682d99b..54000614f 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -19,7 +19,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" ) @@ -29,15 +30,15 @@ import ( // |-bar (file) func createMountNamespace(ctx context.Context) (*fs.MountNamespace, error) { perms := fs.FilePermsFromMode(0777) - m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + m := fs.NewPseudoMountSource() - barFile := ramfstest.NewFile(ctx, perms) - fooDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{ + barFile := fsutil.NewSimpleFileInode(ctx, fs.RootOwner, perms, 0) + fooDir := ramfs.NewDir(ctx, map[string]*fs.Inode{ "bar": fs.NewInode(barFile, m, fs.StableAttr{Type: fs.RegularFile}), - }, perms) - rootDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{ + }, fs.RootOwner, perms) + rootDir := ramfs.NewDir(ctx, map[string]*fs.Inode{ "foo": fs.NewInode(fooDir, m, fs.StableAttr{Type: fs.Directory}), - }, perms) + }, fs.RootOwner, perms) return fs.NewMountNamespace(ctx, fs.NewInode(rootDir, m, fs.StableAttr{Type: fs.Directory})) } diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index aff3c3c01..74954f213 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -8,9 +8,9 @@ go_library( "cpuinfo.go", "exec_args.go", "fds.go", - "file.go", "filesystems.go", "fs.go", + "inode.go", "loadavg.go", "meminfo.go", "mounts.go", 
@@ -32,6 +32,7 @@ go_library( "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/fs/ramfs", @@ -45,6 +46,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index f8be06dc3..f756c45bf 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -15,52 +15,21 @@ package proc import ( - "io" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// cpuinfo is a file describing the CPU capabilities. -// -// Presently cpuinfo never changes, so it doesn't need to be a SeqFile. -// -// +stateify savable -type cpuinfo struct { - ramfs.Entry - - // k is the system kernel. - k *kernel.Kernel -} - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (c *cpuinfo) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - features := c.k.FeatureSet() +func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + k := kernel.KernelFromContext(ctx) + features := k.FeatureSet() if features == nil { // Kernel is always initialized with a FeatureSet. panic("cpuinfo read with nil FeatureSet") } - contents := make([]byte, 0, 1024) - for i, max := uint(0), c.k.ApplicationCores(); i < max; i++ { + for i, max := uint(0), k.ApplicationCores(); i < max; i++ { contents = append(contents, []byte(features.CPUInfo(i))...) } - if offset >= int64(len(contents)) { - return 0, io.EOF - } - - n, err := dst.CopyOut(ctx, contents[offset:]) - return int64(n), err -} - -func (p *proc) newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - f := &cpuinfo{ - k: p.k, - } - f.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - - return newFile(f, msrc, fs.SpecialFile, nil) + return newStaticProcInode(ctx, msrc, contents) } diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index b4896053f..ddda67f54 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -18,12 +18,14 @@ import ( "fmt" "io" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // execArgType enumerates the types of exec arguments that are exposed through @@ -35,12 +37,12 @@ const ( environExecArg ) -// execArgFile is a file containing the exec args (either cmdline or environ) +// execArgInode is a inode containing the exec args (either cmdline or environ) // for a given task. // // +stateify savable -type execArgFile struct { - ramfs.Entry +type execArgInode struct { + fsutil.SimpleFileInode // arg is the type of exec argument this file contains. arg execArgType @@ -49,21 +51,52 @@ type execArgFile struct { t *kernel.Task } +var _ fs.InodeOperations = (*execArgInode)(nil) + // newExecArgFile creates a file containing the exec args of the given type. 
-func newExecArgFile(t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { +func newExecArgInode(t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { if arg != cmdlineExecArg && arg != environExecArg { panic(fmt.Sprintf("unknown exec arg type %v", arg)) } - f := &execArgFile{ - arg: arg, - t: t, + f := &execArgInode{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + arg: arg, + t: t, } - f.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(f, msrc, fs.SpecialFile, t) + return newProcInode(f, msrc, fs.SpecialFile, t) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *execArgInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &execArgFile{ + arg: i.arg, + t: i.t, + }), nil } -// DeprecatedPreadv reads the exec arg from the process's address space.. -func (f *execArgFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// +stateify savable +type execArgFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + + // arg is the type of exec argument this file contains. + arg execArgType + + // t is the Task to read the exec arg line from. + t *kernel.Task +} + +var _ fs.FileOperations = (*execArgFile)(nil) + +// Read reads the exec arg from the process's address space.. +func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 5acbce75e..b8a0a5eff 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -21,11 +21,11 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -55,7 +55,7 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // readDescriptors reads fds in the task starting at offset, and calls the // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. -func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(int) fs.DentAttr) (int, error) { +func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { var fds kernel.FDs t.WithMuLocked(func(t *kernel.Task) { if fdm := t.FDMap(); fdm != nil { @@ -69,7 +69,7 @@ func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(i } // Find the fd to start at. 
- idx := sort.SearchInts(fdInts, offset) + idx := sort.SearchInts(fdInts, int(offset)) if idx == len(fdInts) { return offset, nil } @@ -80,28 +80,32 @@ func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(i name := strconv.FormatUint(uint64(fd), 10) if err := c.DirEmit(name, toDentAttr(fd)); err != nil { // Returned offset is the next fd to serialize. - return fd, err + return int64(fd), err } } // We serialized them all. Next offset should be higher than last // serialized fd. - return fd + 1, nil + return int64(fd + 1), nil } -// fd is a single file in /proc/TID/fd/. +// fd implements fs.InodeOperations for a file in /proc/TID/fd/. type fd struct { ramfs.Symlink *fs.File } +var _ fs.InodeOperations = (*fd)(nil) + // newFd returns a new fd based on an existing file. // // This inherits one reference to the file. func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { - fd := &fd{File: f} - // RootOwner by default, is overridden in UnstableAttr() - fd.InitSymlink(t, fs.RootOwner, "") - return newFile(fd, msrc, fs.Symlink, t) + fd := &fd{ + // RootOwner overridden by taskOwnedInodeOps.UnstableAttrs(). + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + File: f, + } + return newProcInode(fd, msrc, fs.Symlink, t) } // GetFile returns the fs.File backing this fd. The dirent and flags @@ -142,7 +146,7 @@ func (f *fd) Close() error { return nil } -// fdDir implements /proc/TID/fd. +// fdDir is an InodeOperations for /proc/TID/fd. // // +stateify savable type fdDir struct { @@ -154,11 +158,15 @@ type fdDir struct { t *kernel.Task } +var _ fs.InodeOperations = (*fdDir)(nil) + // newFdDir creates a new fdDir. func newFdDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - f := &fdDir{t: t} - f.InitDir(t, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}) - return newFile(f, msrc, fs.SpecialDirectory, t) + f := &fdDir{ + Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}), + t: t, + } + return newProcInode(f, msrc, fs.SpecialDirectory, t) } // Check implements InodeOperations.Check. @@ -191,49 +199,55 @@ func (f *fdDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent return fs.NewDirent(n, p), nil } -// DeprecatedReaddir lists fds in /proc/TID/fd. -func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - return readDescriptors(f.t, dirCtx, offset, func(fd int) fs.DentAttr { - return fs.GenericDentAttr(fs.Symlink, device.ProcDevice) +// GetFile implements fs.FileOperations.GetFile. +func (f *fdDir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + fops := &fdDirFile{ + isInfoFile: false, + t: f.t, + } + return fs.NewFile(ctx, dirent, flags, fops), nil +} + +// +stateify savable +type fdDirFile struct { + fsutil.DirFileOperations `state:"nosave"` + + isInfoFile bool + + t *kernel.Task +} + +var _ fs.FileOperations = (*fdDirFile)(nil) + +// Readdir implements fs.FileOperations.Readdir. +func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + dirCtx := &fs.DirCtx{ + Serializer: ser, + } + typ := fs.RegularFile + if f.isInfoFile { + typ = fs.Symlink + } + return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { + return fs.GenericDentAttr(typ, device.ProcDevice) }) } -// fdInfo is a single file in /proc/TID/fdinfo/. +// fdInfoInode is a single file in /proc/TID/fdinfo/. 
// // +stateify savable -type fdInfo struct { - ramfs.File +type fdInfoInode struct { + staticFileInodeOps file *fs.File flags fs.FileFlags fdFlags kernel.FDFlags } -// newFdInfo returns a new fdInfo based on an existing file. -func newFdInfo(t *kernel.Task, file *fs.File, fdFlags kernel.FDFlags, msrc *fs.MountSource) *fs.Inode { - fdi := &fdInfo{file: file, flags: file.Flags(), fdFlags: fdFlags} - fdi.InitFile(t, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true}}) - // TODO: Get pos, locks, and other data. For now we only - // have flags. - // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt - - flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() - fdi.Append([]byte(fmt.Sprintf("flags:\t0%o\n", flags))) - return newFile(fdi, msrc, fs.SpecialFile, t) -} - -// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. -func (*fdInfo) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - return 0, ramfs.ErrInvalidOp -} - -// Truncate implements fs.InodeOperations.Truncate. -func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return ramfs.ErrInvalidOp -} +var _ fs.InodeOperations = (*fdInfoInode)(nil) -func (f *fdInfo) Release(ctx context.Context) { - f.File.Release(ctx) +// Release implements fs.InodeOperations.Release. +func (f *fdInfoInode) Release(ctx context.Context) { f.file.DecRef() } @@ -249,25 +263,37 @@ type fdInfoDir struct { // newFdInfoDir creates a new fdInfoDir. func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - fdid := &fdInfoDir{t: t} - fdid.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0500)) - return newFile(fdid, msrc, fs.SpecialDirectory, t) + fdid := &fdInfoDir{ + Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0500)), + t: t, + } + return newProcInode(fdid, msrc, fs.SpecialDirectory, t) } // Lookup loads an fd in /proc/TID/fdinfo into a Dirent. func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { - n, err := walkDescriptors(fdid.t, p, func(file *fs.File, fdFlags kernel.FDFlags) *fs.Inode { - return newFdInfo(fdid.t, file, fdFlags, dir.MountSource) + inode, err := walkDescriptors(fdid.t, p, func(file *fs.File, fdFlags kernel.FDFlags) *fs.Inode { + // TODO: Using a static inode here means that the + // data can be out-of-date if, for instance, the flags on the + // FD change before we read this file. We should switch to + // generating the data on Read(). Also, we should include pos, + // locks, and other data. For now we only have flags. + // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() + contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags)) + return newStaticProcInode(ctx, dir.MountSource, contents) }) if err != nil { return nil, err } - return fs.NewDirent(n, p), nil + return fs.NewDirent(inode, p), nil } -// DeprecatedReaddir lists fds in /proc/TID/fdinfo. -func (fdid *fdInfoDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - return readDescriptors(fdid.t, dirCtx, offset, func(fd int) fs.DentAttr { - return fs.GenericDentAttr(fs.RegularFile, device.ProcDevice) - }) +// GetFile implements fs.FileOperations.GetFile. 
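The TODO above notes that the static inode snapshots the FD flags at Lookup time. A sketch of the direction it suggests, assuming a dedicated fs.FileOperations type were introduced (the fdInfoFile name and its fields are hypothetical, not part of the patch): the "flags:" line is regenerated inside Read so every read reflects the descriptor's current state.

package proc

import (
	"fmt"
	"io"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// fdInfoFile is a hypothetical read-only file that formats its contents on
// every read instead of capturing them when the inode is created.
type fdInfoFile struct {
	waiter.AlwaysReady       `state:"nosave"`
	fsutil.FileGenericSeek   `state:"nosave"`
	fsutil.FileNoIoctl       `state:"nosave"`
	fsutil.FileNoMMap        `state:"nosave"`
	fsutil.FileNoopFlush     `state:"nosave"`
	fsutil.FileNoopFsync     `state:"nosave"`
	fsutil.FileNoopRelease   `state:"nosave"`
	fsutil.FileNoopWrite     `state:"nosave"`
	fsutil.FileNotDirReaddir `state:"nosave"`

	// file is the open file whose flags are reported.
	file *fs.File

	// fdFlags are the descriptor-level flags (e.g. close-on-exec).
	fdFlags kernel.FDFlags
}

// Read regenerates the "flags:" line on each call, so flag changes made
// after the fdinfo entry was looked up are still visible.
func (f *fdInfoFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	flags := f.file.Flags().ToLinux() | f.fdFlags.ToLinuxFileFlags()
	contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags))
	if offset >= int64(len(contents)) {
		return 0, io.EOF
	}
	n, err := dst.CopyOut(ctx, contents[offset:])
	return int64(n), err
}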
+func (fdid *fdInfoDir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + fops := &fdDirFile{ + isInfoFile: true, + t: fdid.t, + } + return fs.NewFile(ctx, dirent, flags, fops), nil } diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go deleted file mode 100644 index f659e590a..000000000 --- a/pkg/sentry/fs/proc/file.go +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -// +stateify savable -type file struct { - fs.InodeOperations - - // nodeType is the file type of this file. - nodeType fs.InodeType - - // t is the associated kernel task that owns this file. - t *kernel.Task -} - -func newFile(node fs.InodeOperations, msrc *fs.MountSource, nodeType fs.InodeType, t *kernel.Task) *fs.Inode { - iops := &file{node, nodeType, t} - sattr := fs.StableAttr{ - DeviceID: device.ProcDevice.DeviceID(), - InodeID: device.ProcDevice.NextIno(), - BlockSize: usermem.PageSize, - Type: nodeType, - } - return fs.NewInode(iops, msrc, sattr) -} - -// UnstableAttr returns all attributes of this file. -func (f *file) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - uattr, err := f.InodeOperations.UnstableAttr(ctx, inode) - if err != nil { - return fs.UnstableAttr{}, err - } - if f.t != nil { - creds := f.t.Credentials() - uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} - } - return uattr, nil -} diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go new file mode 100644 index 000000000..3c36af5ea --- /dev/null +++ b/pkg/sentry/fs/proc/inode.go @@ -0,0 +1,96 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// taskOwnedInodeOps wraps an fs.InodeOperations and overrides the UnstableAttr +// method to return the task as the owner. +// +// +stateify savable +type taskOwnedInodeOps struct { + fs.InodeOperations + + // t is the task that owns this file. + t *kernel.Task +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, err := i.InodeOperations.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } + // Set the task owner as the file owner. + creds := i.t.Credentials() + uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} + return uattr, nil +} + +// staticFileInodeOps is an InodeOperations implementation that can be used to +// return file contents which are constant. This file is not writable and will +// always have mode 0444. +// +// +stateify savable +type staticFileInodeOps struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeStaticFileGetter +} + +var _ fs.InodeOperations = (*staticFileInodeOps)(nil) + +// newStaticProcInode returns a procfs InodeOperations with static contents. +func newStaticProcInode(ctx context.Context, msrc *fs.MountSource, contents []byte) *fs.Inode { + iops := &staticFileInodeOps{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + InodeStaticFileGetter: fsutil.InodeStaticFileGetter{ + Contents: contents, + }, + } + return newProcInode(iops, msrc, fs.SpecialFile, nil) +} + +// newProcInode creates a new inode from the given inode operations. +func newProcInode(iops fs.InodeOperations, msrc *fs.MountSource, typ fs.InodeType, t *kernel.Task) *fs.Inode { + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: typ, + } + if t != nil { + iops = &taskOwnedInodeOps{iops, t} + } + return fs.NewInode(iops, msrc, sattr) +} diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 2806d6035..3ed85a538 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -28,35 +28,36 @@ import ( // newNet creates a new proc net entry.
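Taken together, newStaticProcInode and newProcInode make new procfs entries cheap to add. As one hypothetical use (not in this change), a per-task static file could pass the task through so that taskOwnedInodeOps reports the task's effective credentials as the owner; newTaskStaticProcInode below is assumed to sit in this package next to the helpers above.

// newTaskStaticProcInode is a hypothetical helper: it wraps static contents
// in staticFileInodeOps and hands the task to newProcInode, so the resulting
// inode appears owned by the task rather than by root.
func newTaskStaticProcInode(t *kernel.Task, msrc *fs.MountSource, contents []byte) *fs.Inode {
	iops := &staticFileInodeOps{
		// A Task can serve as the context.Context seeding the attributes,
		// just as in the exec_args and fd inodes in this change.
		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
		InodeStaticFileGetter: fsutil.InodeStaticFileGetter{
			Contents: contents,
		},
	}
	return newProcInode(iops, msrc, fs.SpecialFile, t)
}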
func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + var contents map[string]*fs.Inode if s := p.k.NetworkStack(); s != nil && s.SupportsIPv6() { - d.AddChild(ctx, "dev", seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc)) - d.AddChild(ctx, "if_inet6", seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)) - - // The following files are simple stubs until they are implemented in - // netstack, if the file contains a header the stub is just the header - // otherwise it is an empty file. - d.AddChild(ctx, "arp", p.newStubProcFSFile(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device"))) - d.AddChild(ctx, "ipv6_route", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "netlink", p.newStubProcFSFile(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode"))) - d.AddChild(ctx, "netstat", p.newStubProcFSFile(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess"))) - d.AddChild(ctx, "packet", p.newStubProcFSFile(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode"))) - d.AddChild(ctx, "protocols", p.newStubProcFSFile(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em"))) - - // Linux sets these values to: nsec per usec, psched tick in ns, 1000000, - // high res timer ticks per sec (ClockGetres returns 1ns resolution). 
- psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) - d.AddChild(ctx, "psched", p.newStubProcFSFile(ctx, msrc, []byte(psched))) - - d.AddChild(ctx, "ptype", p.newStubProcFSFile(ctx, msrc, []byte("Type Device Function"))) - d.AddChild(ctx, "route", p.newStubProcFSFile(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT"))) - d.AddChild(ctx, "tcp", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) - d.AddChild(ctx, "tcp6", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) - d.AddChild(ctx, "udp", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops"))) - d.AddChild(ctx, "udp6", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) + contents = map[string]*fs.Inode{ + "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), + "if_inet6": seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc), + + // The following files are simple stubs until they are + // implemented in netstack, if the file contains a + // header the stub is just the header otherwise it is + // an empty file. + "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device")), + "ipv6_route": newStaticProcInode(ctx, msrc, []byte("")), + "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode")), + "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS 
TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")), + "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode")), + "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em")), + // Linux sets psched values to: nsec per usec, psched + // tick in ns, 1000000, high res timer ticks per sec + // (ClockGetres returns 1ns resolution). + "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), + "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function")), + "route": newStaticProcInode(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT")), + "tcp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), + "tcp6": newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), + "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), + "udp6": newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), + } } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 70e549c31..d1c699418 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -17,18 +17,17 @@ package proc import ( "fmt" - "io" "sort" "strconv" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -46,32 +45,6 @@ type proc struct { pidns *kernel.PIDNamespace } -// stubProcFSFile is a file type that can be used to return file contents -// which are constant. This file is not writable and will always have mode -// 0444. -// -// +stateify savable -type stubProcFSFile struct { - ramfs.Entry - - // contents are the immutable file contents that will always be returned. - contents []byte -} - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (s *stubProcFSFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - - if offset >= int64(len(s.contents)) { - return 0, io.EOF - } - - n, err := dst.CopyOut(ctx, s.contents[offset:]) - return int64(n), err -} - // New returns the root node of a partial simple procfs. 
func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { k := kernel.KernelFromContext(ctx) @@ -83,29 +56,39 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { return nil, fmt.Errorf("procfs requires a PID namespace") } - p := &proc{k: k, pidns: pidns} - p.InitDir(ctx, map[string]*fs.Inode{ + // Note that these are just the static members. There are dynamic + // members populated in Readdir and Lookup below. + contents := map[string]*fs.Inode{ + "cpuinfo": newCPUInfo(ctx, msrc), "filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), - "mounts": newMountsSymlink(ctx, msrc), + "mounts": newProcInode(ramfs.NewSymlink(ctx, fs.RootOwner, "self/mounts"), msrc, fs.Symlink, nil), + "self": newSelf(ctx, pidns, msrc), "stat": seqfile.NewSeqFileInode(ctx, &statData{k}, msrc), + "thread-self": newThreadSelf(ctx, pidns, msrc), + "uptime": newUptime(ctx, msrc), "version": seqfile.NewSeqFileInode(ctx, &versionData{k}, msrc), - }, fs.RootOwner, fs.FilePermsFromMode(0555)) + } + + // Construct the proc InodeOperations. + p := &proc{ + Dir: *ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), + k: k, + pidns: pidns, + } + + // Add more contents that need proc to be initialized. + p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) - p.AddChild(ctx, "cpuinfo", p.newCPUInfo(ctx, msrc)) // If we're using rpcinet we will let it manage /proc/net. if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc)) + contents["net"] = newRPCInetProcNet(ctx, msrc) } else { - p.AddChild(ctx, "net", p.newNetDir(ctx, msrc)) + contents["net"] = p.newNetDir(ctx, msrc) } - p.AddChild(ctx, "self", p.newSelf(ctx, msrc)) - p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) - p.AddChild(ctx, "thread-self", p.newThreadSelf(ctx, msrc)) - p.AddChild(ctx, "uptime", p.newUptime(ctx, msrc)) - return newFile(p, msrc, fs.SpecialDirectory, nil), nil + return newProcInode(p, msrc, fs.SpecialDirectory, nil), nil } // self is a magical link. @@ -118,26 +101,21 @@ type self struct { } // newSelf returns a new "self" node. -func (p *proc) newSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - s := &self{pidns: p.pidns} - s.InitSymlink(ctx, fs.RootOwner, "") - return newFile(s, msrc, fs.Symlink, nil) +func newSelf(ctx context.Context, pidns *kernel.PIDNamespace, msrc *fs.MountSource) *fs.Inode { + s := &self{ + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""), + pidns: pidns, + } + return newProcInode(s, msrc, fs.Symlink, nil) } // newThreadSelf returns a new "threadSelf" node. -func (p *proc) newThreadSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - s := &threadSelf{pidns: p.pidns} - s.InitSymlink(ctx, fs.RootOwner, "") - return newFile(s, msrc, fs.Symlink, nil) -} - -// newStubProcFsFile returns a procfs file with constant contents. -func (p *proc) newStubProcFSFile(ctx context.Context, msrc *fs.MountSource, c []byte) *fs.Inode { - u := &stubProcFSFile{ - contents: c, +func newThreadSelf(ctx context.Context, pidns *kernel.PIDNamespace, msrc *fs.MountSource) *fs.Inode { + s := &threadSelf{ + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""), + pidns: pidns, } - u.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(u, msrc, fs.SpecialFile, nil) + return newProcInode(s, msrc, fs.Symlink, nil) } // Readlink implements fs.InodeOperations.Readlink. 
@@ -145,13 +123,13 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if t := kernel.TaskFromContext(ctx); t != nil { tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { - return "", ramfs.ErrNotFound + return "", syserror.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } // Who is reading this link? - return "", ramfs.ErrInvalidOp + return "", syserror.EINVAL } // threadSelf is more magical than "self" link. @@ -169,13 +147,13 @@ func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, err tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { - return "", ramfs.ErrNotFound + return "", syserror.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } // Who is reading this link? - return "", ramfs.ErrInvalidOp + return "", syserror.EINVAL } // Lookup loads an Inode at name into a Dirent. @@ -204,25 +182,44 @@ func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dire return fs.NewDirent(td, name), nil } -// Readdir synthesizes proc contents. -func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - // Serialize normal contents. - _, err := p.Dir.DeprecatedReaddir(ctx, dirCtx, offset) - if err != nil { - return offset, err +// GetFile implements fs.InodeOperations. +func (p *proc) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &rootProcFile{iops: p}), nil +} + +// rootProcFile implements fs.FileOperations for the proc directory. +// +// +stateify savable +type rootProcFile struct { + fsutil.DirFileOperations `state:"nosave"` + + iops *proc +} + +var _ fs.FileOperations = (*rootProcFile)(nil) + +// Readdir implements fs.FileOperations.Readdir. +func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + offset := file.Offset() + dirCtx := &fs.DirCtx{ + Serializer: ser, } - m := make(map[string]fs.DentAttr) - var names []string + // Get normal directory contents from ramfs dir. + names, m := rpf.iops.Dir.Children() - // Add special files. - m["sys"] = fs.GenericDentAttr(fs.SpecialFile, device.ProcDevice) - names = append(names, "sys") + // Add dot and dotdot. + root := fs.RootFromContext(ctx) + defer root.DecRef() + dot, dotdot := file.Dirent.GetDotAttrs(root) + names = append(names, ".", "..") + m["."] = dot + m[".."] = dotdot // Collect tasks. // Per linux we only include it in directory listings if it's the leader. // But for whatever crazy reason, you can still walk to the given node. - for _, tg := range p.pidns.ThreadGroups() { + for _, tg := range rpf.iops.pidns.ThreadGroups() { if leader := tg.Leader(); leader != nil { name := strconv.FormatUint(uint64(tg.ID()), 10) m[name] = fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) @@ -230,7 +227,7 @@ func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset } } - if offset >= len(m) { + if offset >= int64(len(m)) { return offset, nil } sort.Strings(names) @@ -241,12 +238,5 @@ func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset } offset++ } - return offset, err -} - -// newMountsSymlink returns a symlink to "self/mounts". 
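// Illustrative aside: the offset bookkeeping in the rootProcFile.Readdir
// rewrite above can be restated as a small standalone sketch (hypothetical
// names, a plain callback in place of fs.DentrySerializer; not part of this
// change):

package main

import (
	"fmt"
	"sort"
)

// emitFrom mirrors the pattern above: bail out if the offset is already past
// the end, otherwise sort the names, skip the first `offset` entries, emit the
// rest, and return the offset to resume from next time.
func emitFrom(names []string, offset int64, emit func(string) error) (int64, error) {
	if offset >= int64(len(names)) {
		return offset, nil
	}
	sort.Strings(names)
	for _, name := range names[offset:] {
		if err := emit(name); err != nil {
			return offset, err
		}
		offset++
	}
	return offset, nil
}

func main() {
	names := []string{".", "..", "1", "42", "net", "self", "sys"}
	// Resume from offset 3, as if an earlier call had already emitted 3 entries.
	next, _ := emitFrom(names, 3, func(name string) error {
		fmt.Println(name)
		return nil
	})
	fmt.Println("next offset:", next) // next offset: 7
}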
-func newMountsSymlink(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - s := &ramfs.Symlink{} - s.InitSymlink(ctx, fs.RootOwner, "self/mounts") - return newFile(s, msrc, fs.Symlink, nil) + return offset, nil } diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index d025069df..65faa21f2 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -20,32 +20,72 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// rpcinetFile implments fs.InodeOperations as RPCs. -type rpcinetFile struct { - ramfs.Entry +// rpcInetInode implments fs.InodeOperations. +type rpcInetInode struct { + fsutil.SimpleFileInode - // filepath is the full path of this rpcinetFile. + // filepath is the full path of this rpcInetInode. filepath string k *kernel.Kernel } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -// This method can panic if an rpcinetFile was created without an rpcinet +func newRPCInetInode(ctx context.Context, msrc *fs.MountSource, filepath string, mode linux.FileMode) *fs.Inode { + f := &rpcInetInode{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(mode), linux.PROC_SUPER_MAGIC), + filepath: filepath, + k: kernel.KernelFromContext(ctx), + } + return newProcInode(f, msrc, fs.SpecialFile, nil) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *rpcInetInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + fops := &rpcInetFile{ + inode: i, + } + return fs.NewFile(ctx, dirent, flags, fops), nil +} + +// rpcInetFile implements fs.FileOperations as RPCs. +type rpcInetFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + inode *rpcInetInode +} + +// Read implements fs.FileOperations.Read. +// +// This method can panic if an rpcInetInode was created without an rpcinet // stack. -func (r rpcinetFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - s, ok := r.k.NetworkStack().(*rpcinet.Stack) +func (f *rpcInetFile) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack) if !ok { panic("Network stack is not a rpcinet.") } - contents, se := s.RPCReadFile(r.filepath) + contents, se := s.RPCReadFile(f.inode.filepath) if se != nil || offset >= int64(len(contents)) { return 0, io.EOF } @@ -54,16 +94,12 @@ func (r rpcinetFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequenc return int64(n), err } -// Truncate implements fs.InodeOperations.Truncate. 
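// Illustrative aside: the rpcInetFile.Read above follows the same shape as the
// other proc file reads in this change -- fetch the full contents, then serve
// the requested offset. A simplified standalone sketch of that shape
// (hypothetical helper, plain byte slices standing in for usermem.IOSequence;
// not part of this change):

package main

import (
	"errors"
	"fmt"
	"io"
)

// readAt rejects negative offsets, reports EOF once the offset is past the
// fetched contents, and otherwise copies out the remaining bytes.
func readAt(contents, dst []byte, offset int64) (int, error) {
	if offset < 0 {
		return 0, errors.New("EINVAL")
	}
	if offset >= int64(len(contents)) {
		return 0, io.EOF
	}
	return copy(dst, contents[offset:]), nil
}

func main() {
	contents := []byte("protocol size sockets\n")
	buf := make([]byte, 8)
	n, err := readAt(contents, buf, 9)
	fmt.Printf("%q %v\n", buf[:n], err) // "size soc" <nil>

	_, err = readAt(contents, buf, int64(len(contents)))
	fmt.Println(err) // EOF
}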
-func (r rpcinetFile) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -// This method can panic if an rpcinetFile was created without an rpcinet +// Write implements fs.FileOperations.Write. +// +// This method can panic if an rpcInetInode was created without an rpcInet // stack. -func (r rpcinetFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - s, ok := r.k.NetworkStack().(*rpcinet.Stack) +func (f *rpcInetFile) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack) if !ok { panic("Network stack is not a rpcinet.") } @@ -78,116 +114,102 @@ func (r rpcinetFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequen return int64(n), err } - written, se := s.RPCWriteFile(r.filepath, b) + written, se := s.RPCWriteFile(f.inode.filepath, b) return int64(written), se.ToError() } -func newRPCProcFSFile(ctx context.Context, msrc *fs.MountSource, filepath string, mode linux.FileMode) *fs.Inode { - f := &rpcinetFile{ - filepath: filepath, - k: kernel.KernelFromContext(ctx), - } - f.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(mode)) - - fi := newFile(f, msrc, fs.SpecialFile, nil) - return fi -} - // newRPCInetProcNet will build an inode for /proc/net. func newRPCInetProcNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - - // Add all the files we want to forward for /proc/net. - d.AddChild(ctx, "arp", newRPCProcFSFile(ctx, msrc, "/proc/net/arp", 0444)) - d.AddChild(ctx, "dev", newRPCProcFSFile(ctx, msrc, "/proc/net/dev", 0444)) - d.AddChild(ctx, "if_inet6", newRPCProcFSFile(ctx, msrc, "/proc/net/if_inet6", 0444)) - d.AddChild(ctx, "ipv6_route", newRPCProcFSFile(ctx, msrc, "/proc/net/ipv6_route", 0444)) - d.AddChild(ctx, "netlink", newRPCProcFSFile(ctx, msrc, "/proc/net/netlink", 0444)) - d.AddChild(ctx, "netstat", newRPCProcFSFile(ctx, msrc, "/proc/net/netstat", 0444)) - d.AddChild(ctx, "packet", newRPCProcFSFile(ctx, msrc, "/proc/net/packet", 0444)) - d.AddChild(ctx, "protocols", newRPCProcFSFile(ctx, msrc, "/proc/net/protocols", 0444)) - d.AddChild(ctx, "psched", newRPCProcFSFile(ctx, msrc, "/proc/net/psched", 0444)) - d.AddChild(ctx, "ptype", newRPCProcFSFile(ctx, msrc, "/proc/net/ptype", 0444)) - d.AddChild(ctx, "route", newRPCProcFSFile(ctx, msrc, "/proc/net/route", 0444)) - d.AddChild(ctx, "tcp", newRPCProcFSFile(ctx, msrc, "/proc/net/tcp", 0444)) - d.AddChild(ctx, "tcp6", newRPCProcFSFile(ctx, msrc, "/proc/net/tcp6", 0444)) - d.AddChild(ctx, "udp", newRPCProcFSFile(ctx, msrc, "/proc/net/udp", 0444)) - d.AddChild(ctx, "udp6", newRPCProcFSFile(ctx, msrc, "/proc/net/udp6", 0444)) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "arp": newRPCInetInode(ctx, msrc, "/proc/net/arp", 0444), + "dev": newRPCInetInode(ctx, msrc, "/proc/net/dev", 0444), + "if_inet6": newRPCInetInode(ctx, msrc, "/proc/net/if_inet6", 0444), + "ipv6_route": newRPCInetInode(ctx, msrc, "/proc/net/ipv6_route", 0444), + "netlink": newRPCInetInode(ctx, msrc, "/proc/net/netlink", 0444), + "netstat": newRPCInetInode(ctx, msrc, "/proc/net/netstat", 0444), + "packet": newRPCInetInode(ctx, msrc, "/proc/net/packet", 0444), + "protocols": newRPCInetInode(ctx, msrc, "/proc/net/protocols", 0444), + "psched": newRPCInetInode(ctx, msrc, "/proc/net/psched", 0444), 
+ "ptype": newRPCInetInode(ctx, msrc, "/proc/net/ptype", 0444), + "route": newRPCInetInode(ctx, msrc, "/proc/net/route", 0444), + "tcp": newRPCInetInode(ctx, msrc, "/proc/net/tcp", 0444), + "tcp6": newRPCInetInode(ctx, msrc, "/proc/net/tcp6", 0444), + "udp": newRPCInetInode(ctx, msrc, "/proc/net/udp", 0444), + "udp6": newRPCInetInode(ctx, msrc, "/proc/net/udp6", 0444), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // newRPCInetProcSysNet will build an inode for /proc/sys/net. func newRPCInetProcSysNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "ipv4", newRPCInetSysNetIPv4Dir(ctx, msrc)) - d.AddChild(ctx, "core", newRPCInetSysNetCore(ctx, msrc)) + contents := map[string]*fs.Inode{ + "ipv4": newRPCInetSysNetIPv4Dir(ctx, msrc), + "core": newRPCInetSysNetCore(ctx, msrc), + } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // newRPCInetSysNetCore builds the /proc/sys/net/core directory. func newRPCInetSysNetCore(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - - // Add all the files we want to forward over RPC for /proc/sys/net/core - d.AddChild(ctx, "default_qdisc", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/default_qdisc", 0444)) - d.AddChild(ctx, "message_burst", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/message_burst", 0444)) - d.AddChild(ctx, "message_cost", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/message_cost", 0444)) - d.AddChild(ctx, "optmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/optmem_max", 0444)) - d.AddChild(ctx, "rmem_default", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/rmem_default", 0444)) - d.AddChild(ctx, "rmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/rmem_max", 0444)) - d.AddChild(ctx, "somaxconn", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/somaxconn", 0444)) - d.AddChild(ctx, "wmem_default", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/wmem_default", 0444)) - d.AddChild(ctx, "wmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/wmem_max", 0444)) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "default_qdisc": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/default_qdisc", 0444), + "message_burst": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_burst", 0444), + "message_cost": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_cost", 0444), + "optmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/optmem_max", 0444), + "rmem_default": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_default", 0444), + "rmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_max", 0444), + "somaxconn": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/somaxconn", 0444), + "wmem_default": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_default", 0444), + "wmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_max", 0444), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // newRPCInetSysNetIPv4Dir builds the /proc/sys/net/ipv4 directory. 
func newRPCInetSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - - // Add all the files we want to forward over RPC for /proc/sys/net/ipv4. - d.AddChild(ctx, "ip_local_port_range", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_local_port_range", 0444)) - d.AddChild(ctx, "ip_local_reserved_ports", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_local_reserved_ports", 0444)) - d.AddChild(ctx, "ipfrag_time", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ipfrag_time", 0444)) - d.AddChild(ctx, "ip_nonlocal_bind", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_nonlocal_bind", 0444)) - d.AddChild(ctx, "ip_no_pmtu_disc", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_no_pmtu_disc", 0444)) - - d.AddChild(ctx, "tcp_allowed_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_allowed_congestion_control", 0444)) - d.AddChild(ctx, "tcp_available_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_available_congestion_control", 0444)) - d.AddChild(ctx, "tcp_base_mss", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_base_mss", 0444)) - d.AddChild(ctx, "tcp_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_congestion_control", 0644)) - d.AddChild(ctx, "tcp_dsack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_dsack", 0644)) - d.AddChild(ctx, "tcp_early_retrans", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_early_retrans", 0644)) - d.AddChild(ctx, "tcp_fack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fack", 0644)) - d.AddChild(ctx, "tcp_fastopen", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen", 0644)) - d.AddChild(ctx, "tcp_fastopen_key", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen_key", 0444)) - d.AddChild(ctx, "tcp_fin_timeout", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fin_timeout", 0644)) - d.AddChild(ctx, "tcp_invalid_ratelimit", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_invalid_ratelimit", 0444)) - d.AddChild(ctx, "tcp_keepalive_intvl", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_intvl", 0644)) - d.AddChild(ctx, "tcp_keepalive_probes", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_probes", 0644)) - d.AddChild(ctx, "tcp_keepalive_time", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_time", 0644)) - d.AddChild(ctx, "tcp_mem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_mem", 0444)) - d.AddChild(ctx, "tcp_mtu_probing", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_mtu_probing", 0644)) - d.AddChild(ctx, "tcp_no_metrics_save", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_no_metrics_save", 0444)) - d.AddChild(ctx, "tcp_probe_interval", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_interval", 0444)) - d.AddChild(ctx, "tcp_probe_threshold", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_threshold", 0444)) - d.AddChild(ctx, "tcp_retries1", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries1", 0644)) - d.AddChild(ctx, "tcp_retries2", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries2", 0644)) - d.AddChild(ctx, "tcp_rfc1337", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_rfc1337", 0444)) - d.AddChild(ctx, "tcp_rmem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_rmem", 0444)) - d.AddChild(ctx, "tcp_sack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_sack", 0644)) - d.AddChild(ctx, "tcp_slow_start_after_idle", 
newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_slow_start_after_idle", 0644)) - d.AddChild(ctx, "tcp_synack_retries", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_synack_retries", 0644)) - d.AddChild(ctx, "tcp_syn_retries", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_syn_retries", 0644)) - d.AddChild(ctx, "tcp_timestamps", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_timestamps", 0644)) - d.AddChild(ctx, "tcp_wmem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_wmem", 0444)) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "ip_local_port_range": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_port_range", 0444), + "ip_local_reserved_ports": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_reserved_ports", 0444), + "ipfrag_time": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ipfrag_time", 0444), + "ip_nonlocal_bind": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_nonlocal_bind", 0444), + "ip_no_pmtu_disc": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_no_pmtu_disc", 0444), + "tcp_allowed_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_allowed_congestion_control", 0444), + "tcp_available_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_available_congestion_control", 0444), + "tcp_base_mss": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_base_mss", 0444), + "tcp_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_congestion_control", 0644), + "tcp_dsack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_dsack", 0644), + "tcp_early_retrans": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_early_retrans", 0644), + "tcp_fack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fack", 0644), + "tcp_fastopen": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen", 0644), + "tcp_fastopen_key": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen_key", 0444), + "tcp_fin_timeout": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fin_timeout", 0644), + "tcp_invalid_ratelimit": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_invalid_ratelimit", 0444), + "tcp_keepalive_intvl": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_intvl", 0644), + "tcp_keepalive_probes": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_probes", 0644), + "tcp_keepalive_time": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_time", 0644), + "tcp_mem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mem", 0444), + "tcp_mtu_probing": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mtu_probing", 0644), + "tcp_no_metrics_save": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_no_metrics_save", 0444), + "tcp_probe_interval": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_interval", 0444), + "tcp_probe_threshold": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_threshold", 0444), + "tcp_retries1": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries1", 0644), + "tcp_retries2": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries2", 0644), + "tcp_rfc1337": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rfc1337", 0444), + "tcp_rmem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rmem", 0444), + "tcp_sack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_sack", 0644), + "tcp_slow_start_after_idle": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_slow_start_after_idle", 0644), + "tcp_synack_retries": newRPCInetInode(ctx, msrc, 
"/proc/sys/net/ipv4/tcp_synack_retries", 0644), + "tcp_syn_retries": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_syn_retries", 0644), + "tcp_timestamps": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_timestamps", 0644), + "tcp_wmem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_wmem", 0444), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 53c475652..b4ba64e10 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -8,12 +8,15 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", - "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", ], ) @@ -26,7 +29,7 @@ go_test( "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/fs/ramfs", "//pkg/sentry/usermem", ], ) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 0499ba65b..16fc6789e 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -18,12 +18,15 @@ import ( "io" "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // SeqHandle is a helper handle to seek in the file. @@ -87,7 +90,18 @@ func (s *SeqGenerationCounter) IsCurrent(generation int64) bool { // // +stateify savable type SeqFile struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleExtendedAttributes + fsutil.InodeSimpleAttributes // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -99,11 +113,14 @@ type SeqFile struct { lastRead int64 } +var _ fs.InodeOperations = (*SeqFile)(nil) + // NewSeqFile returns a seqfile suitable for use by external consumers. func NewSeqFile(ctx context.Context, source SeqSource) *SeqFile { - s := &SeqFile{SeqSource: source} - s.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return s + return &SeqFile{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SeqSource: source, + } } // NewSeqFileInode returns an Inode with SeqFile InodeOperations. @@ -120,11 +137,19 @@ func NewSeqFileInode(ctx context.Context, source SeqSource, msrc *fs.MountSource // UnstableAttr returns unstable attributes of the SeqFile. 
func (s *SeqFile) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr, err := s.InodeSimpleAttributes.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } uattr.ModificationTime = ktime.NowFromContext(ctx) return uattr, nil } +// GetFile implements fs.InodeOperations.GetFile. +func (s *SeqFile) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &seqFileOperations{seqFile: s}), nil +} + // findIndexAndOffset finds the unit that corresponds to a certain offset. // Returns the unit and the offset within the unit. If there are not enough // units len(data) and leftover offset is returned. @@ -139,36 +164,74 @@ func findIndexAndOffset(data []SeqData, offset int64) (int, int64) { return len(data), offset } -// DeprecatedPreadv reads from the file at the given offset. -func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - s.mu.Lock() - defer s.mu.Unlock() +// updateSourceLocked requires that s.mu is held. +func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { + var h SeqHandle + if record == 0 { + h = nil + } else { + h = s.source[record-1].Handle + } + // Save what we have previously read. + s.source = s.source[:record] + var newSource []SeqData + newSource, s.generation = s.SeqSource.ReadSeqFileData(ctx, h) + s.source = append(s.source, newSource...) +} + +// seqFileOperations implements fs.FileOperations. +// +// +stateify savable +type seqFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + seqFile *SeqFile +} + +var _ fs.FileOperations = (*seqFileOperations)(nil) + +// Write implements fs.FileOperations.Write. +func (*seqFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EACCES +} + +// Read implements fs.FileOperations.Read. +func (sfo *seqFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + sfo.seqFile.mu.Lock() + defer sfo.seqFile.mu.Unlock() - s.Entry.NotifyAccess(ctx) - defer func() { s.lastRead = offset }() + sfo.seqFile.NotifyAccess(ctx) + defer func() { sfo.seqFile.lastRead = offset }() updated := false // Try to find where we should start reading this file. - i, recordOffset := findIndexAndOffset(s.source, offset) - if i == len(s.source) { + i, recordOffset := findIndexAndOffset(sfo.seqFile.source, offset) + if i == len(sfo.seqFile.source) { // Ok, we're at EOF. Let's first check to see if there might be // more data available to us. If there is more data, add it to // the end and try reading again. - if !s.SeqSource.NeedsUpdate(s.generation) { + if !sfo.seqFile.SeqSource.NeedsUpdate(sfo.seqFile.generation) { return 0, io.EOF } - oldLen := len(s.source) - s.updateSourceLocked(ctx, len(s.source)) + oldLen := len(sfo.seqFile.source) + sfo.seqFile.updateSourceLocked(ctx, len(sfo.seqFile.source)) updated = true // We know that we had consumed everything up until this point // so we search in the new slice instead of starting over. 
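// Illustrative aside: the contract documented for findIndexAndOffset above is
// easy to restate as a standalone sketch (plain byte slices in place of
// SeqData, hypothetical main for a quick check; not part of this change):

package main

import "fmt"

// findIndexAndOffset walks the per-record buffers and returns the index of the
// record an absolute offset lands in plus the leftover offset inside that
// record; past the end it returns (len(records), leftover).
func findIndexAndOffset(records [][]byte, offset int64) (int, int64) {
	for i, r := range records {
		if offset < int64(len(r)) {
			return i, offset
		}
		offset -= int64(len(r))
	}
	return len(records), offset
}

func main() {
	records := [][]byte{[]byte("0123456789"), []byte("abcdefghij")}
	fmt.Println(findIndexAndOffset(records, 13)) // 1 3
	fmt.Println(findIndexAndOffset(records, 25)) // 2 5
}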
- i, recordOffset = findIndexAndOffset(s.source[oldLen:], recordOffset) + i, recordOffset = findIndexAndOffset(sfo.seqFile.source[oldLen:], recordOffset) i += oldLen // i is at most the length of the slice which is - // len(s.source) - oldLen. So at most i will be equal to - // len(s.source). - if i == len(s.source) { + // len(sfo.seqFile.source) - oldLen. So at most i will be equal to + // len(sfo.seqFile.source). + if i == len(sfo.seqFile.source) { return 0, io.EOF } } @@ -178,7 +241,7 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, // before continuing on to the next. We don't refresh our data source // before this record is completed. if recordOffset != 0 { - n, err := dst.CopyOut(ctx, s.source[i].Buf[recordOffset:]) + n, err := dst.CopyOut(ctx, sfo.seqFile.source[i].Buf[recordOffset:]) done += int64(n) dst = dst.DropFirst(n) if dst.NumBytes() == 0 || err != nil { @@ -190,15 +253,15 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, // Next/New unit, update the source file if necessary. Make an extra // check to see if we've seeked backwards and if so always update our // data source. - if !updated && (s.SeqSource.NeedsUpdate(s.generation) || s.lastRead > offset) { - s.updateSourceLocked(ctx, i) + if !updated && (sfo.seqFile.SeqSource.NeedsUpdate(sfo.seqFile.generation) || sfo.seqFile.lastRead > offset) { + sfo.seqFile.updateSourceLocked(ctx, i) // recordOffset is 0 here and we won't update records behind the // current one so recordOffset is still 0 even though source // just got updated. Just read the next record. } // Finish by reading all the available data. - for _, buf := range s.source[i:] { + for _, buf := range sfo.seqFile.source[i:] { n, err := dst.CopyOut(ctx, buf.Buf) done += int64(n) dst = dst.DropFirst(n) @@ -214,23 +277,3 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, } return done, nil } - -// updateSourceLocked requires that s.mu is held. -func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { - var h SeqHandle - if record == 0 { - h = nil - } else { - h = s.source[record-1].Handle - } - // Save what we have previously read. - s.source = s.source[:record] - var newSource []SeqData - newSource, s.generation = s.SeqSource.ReadSeqFileData(ctx, h) - s.source = append(s.source, newSource...) -} - -// DeprecatedPwritev is always denied. 
-func (*SeqFile) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ramfs.ErrDenied -} diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index f9a2ca38e..35403ab7f 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -23,7 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -91,10 +91,15 @@ type testTable struct { expectedError error } -func runTableTests(ctx context.Context, table []testTable, n fs.InodeOperations) error { +func runTableTests(ctx context.Context, table []testTable, dirent *fs.Dirent) error { for _, tt := range table { + file, err := dirent.Inode.InodeOperations.GetFile(ctx, dirent, fs.FileFlags{Read: true}) + if err != nil { + return fmt.Errorf("GetFile returned error: %v", err) + } + data := make([]byte, tt.readBufferSize) - resultLen, err := n.DeprecatedPreadv(ctx, usermem.BytesIOSequence(data), tt.offset) + resultLen, err := file.Preadv(ctx, usermem.BytesIOSequence(data), tt.offset) if err != tt.expectedError { return fmt.Errorf("t.Preadv(len: %v, offset: %v) (error) => %v expected %v", tt.readBufferSize, tt.offset, err, tt.expectedError) } @@ -115,12 +120,12 @@ func TestSeqFile(t *testing.T) { testSource.Init() // Create a file that can be R/W. - m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + m := fs.NewPseudoMountSource() ctx := contexttest.Context(t) contents := map[string]*fs.Inode{ "foo": NewSeqFileInode(ctx, testSource, m), } - root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + root := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0777)) // How about opening it? inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) @@ -129,9 +134,13 @@ func TestSeqFile(t *testing.T) { t.Fatalf("failed to walk to foo for n2: %v", err) } n2 := dirent2.Inode.InodeOperations + file2, err := n2.GetFile(ctx, dirent2, fs.FileFlags{Read: true, Write: true}) + if err != nil { + t.Fatalf("GetFile returned error: %v", err) + } // Writing? - if _, err := n2.DeprecatedPwritev(nil, usermem.BytesIOSequence([]byte("test")), 0); err == nil { + if _, err := file2.Writev(ctx, usermem.BytesIOSequence([]byte("test"))); err == nil { t.Fatalf("managed to write to n2: %v", err) } @@ -141,7 +150,6 @@ func TestSeqFile(t *testing.T) { t.Fatalf("failed to walk to foo: %v", err) } n3 := dirent3.Inode.InodeOperations - if n2 != n3 { t.Error("got n2 != n3, want same") } @@ -170,13 +178,13 @@ func TestSeqFile(t *testing.T) { // Read the last 3 bytes. {97, 10, testSource.actual[9].Buf[7:], nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed with testSource.update = %v : %v", testSource.update, err) } // Disable updates and do it again. testSource.update = false - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed with testSource.update = %v: %v", testSource.update, err) } } @@ -188,25 +196,24 @@ func TestSeqFileFileUpdated(t *testing.T) { testSource.update = true // Create a file that can be R/W. 
- m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + m := fs.NewPseudoMountSource() ctx := contexttest.Context(t) contents := map[string]*fs.Inode{ "foo": NewSeqFileInode(ctx, testSource, m), } - root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + root := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0777)) // How about opening it? inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) dirent2, err := root.Lookup(ctx, inode, "foo") if err != nil { - t.Fatalf("failed to walk to foo for n2: %v", err) + t.Fatalf("failed to walk to foo for dirent2: %v", err) } - n2 := dirent2.Inode.InodeOperations table := []testTable{ {0, 16, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf[:6]), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed: %v", err) } // Delete the first entry. @@ -224,7 +231,7 @@ func TestSeqFileFileUpdated(t *testing.T) { // Read the following two lines. {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after removing first entry: %v", err) } @@ -238,7 +245,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {50, 20, flatten(testSource.actual[4].Buf, testSource.actual[5].Buf), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after adding middle entry: %v", err) } // This will be used in a later test. @@ -249,7 +256,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {20, 20, []byte{}, io.EOF}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after removing all entries: %v", err) } // Restore some of the data. @@ -257,7 +264,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {6, 20, testSource.actual[0].Buf[6:], nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after adding first entry back: %v", err) } @@ -266,7 +273,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after extending testSource: %v", err) } } diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 54562508d..ee6b9f262 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -22,39 +22,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// hostname is a file containing the system hostname. 
-// -// +stateify savable -type hostname struct { - ramfs.Entry -} - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (hostname) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - utsns := kernel.UTSNamespaceFromContext(ctx) - contents := []byte(utsns.HostName() + "\n") - - if offset >= int64(len(contents)) { - return 0, io.EOF - } - - n, err := dst.CopyOut(ctx, contents[offset:]) - return int64(n), err -} - -func (p *proc) newHostname(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - h := &hostname{} - h.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(h, msrc, fs.SpecialFile, nil) -} - // mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. // // +stateify savable @@ -101,36 +77,84 @@ func (*overcommitMemory) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandl } func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "hostname", p.newHostname(ctx, msrc)) - - d.AddChild(ctx, "shmmax", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10)))) - d.AddChild(ctx, "shmall", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10)))) - d.AddChild(ctx, "shmmni", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10)))) - return newFile(d, msrc, fs.SpecialDirectory, nil) + h := hostname{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + } + + children := map[string]*fs.Inode{ + "hostname": newProcInode(&h, msrc, fs.SpecialFile, nil), + "shmall": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10))), + "shmmax": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10))), + "shmmni": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10))), + } + + d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) newVMDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "mmap_min_addr", seqfile.NewSeqFileInode(ctx, &mmapMinAddrData{p.k}, msrc)) - d.AddChild(ctx, "overcommit_memory", seqfile.NewSeqFileInode(ctx, &overcommitMemory{}, msrc)) - return newFile(d, msrc, fs.SpecialDirectory, nil) + children := map[string]*fs.Inode{ + "mmap_min_addr": seqfile.NewSeqFileInode(ctx, &mmapMinAddrData{p.k}, msrc), + "overcommit_memory": seqfile.NewSeqFileInode(ctx, &overcommitMemory{}, msrc), + } + d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) newSysDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "kernel", p.newKernelDir(ctx, msrc)) - d.AddChild(ctx, "vm", p.newVMDir(ctx, msrc)) + children := map[string]*fs.Inode{ + "kernel": p.newKernelDir(ctx, msrc), + "vm": p.newVMDir(ctx, msrc), + } // If we're using rpcinet we will let it manage /proc/sys/net. 
if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - d.AddChild(ctx, "net", newRPCInetProcSysNet(ctx, msrc)) + children["net"] = newRPCInetProcSysNet(ctx, msrc) } else { - d.AddChild(ctx, "net", p.newSysNetDir(ctx, msrc)) + children["net"] = p.newSysNetDir(ctx, msrc) } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) +} + +// hostname is the inode for a file containing the system hostname. +// +// +stateify savable +type hostname struct { + fsutil.SimpleFileInode +} + +// GetFile implements fs.InodeOperations.GetFile. +func (h *hostname) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &hostnameFile{}), nil +} + +var _ fs.InodeOperations = (*hostname)(nil) + +// +stateify savable +type hostnameFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` } + +// Read implements fs.FileOperations.Read. +func (hf *hostnameFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + utsns := kernel.UTSNamespaceFromContext(ctx) + contents := []byte(utsns.HostName() + "\n") + if offset >= int64(len(contents)) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, contents[offset:]) + return int64(n), err + +} + +var _ fs.FileOperations = (*hostnameFile)(nil) diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index b50d43d70..42e9bc47f 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -17,13 +17,17 @@ package proc import ( "fmt" "io" + "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) type tcpMemDir int @@ -33,21 +37,37 @@ const ( tcpWMem ) +// tcpMemInode is used to read/write the size of netstack tcp buffers. +// +// TODO: If we have multiple proc mounts, concurrent writes can +// leave netstack and the proc files in an inconsistent state. Since we set the +// buffer size from these proc files on restore, we may also race and end up in +// an inconsistent state on restore. +// // +stateify savable -type tcpMem struct { - ramfs.Entry - s inet.Stack `state:"wait"` +type tcpMemInode struct { + fsutil.SimpleFileInode + dir tcpMemDir + s inet.Stack `state:"wait"` + + // size stores the tcp buffer size during save, and sets the buffer + // size in netstack in restore. We must save/restore this here, since + // netstack itself is stateless. size inet.TCPBufferSize - dir tcpMemDir -} -func newTCPMem(s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *tcpMem { - return &tcpMem{s: s, size: size, dir: dir} + // mu protects against concurrent reads/writes to files based on this + // inode. 
+ mu sync.Mutex `state:"nosave"` } -func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *fs.Inode { - tm := newTCPMem(s, size, dir) - tm.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644)) +var _ fs.InodeOperations = (*tcpMemInode)(nil) + +func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir tcpMemDir) *fs.Inode { + tm := &tcpMemInode{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + s: s, + dir: dir, + } sattr := fs.StableAttr{ DeviceID: device.ProcDevice.DeviceID(), InodeID: device.ProcDevice.NextIno(), @@ -57,62 +77,105 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, siz return fs.NewInode(tm, msrc, sattr) } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (m *tcpMem) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// GetFile implements fs.InodeOperations.GetFile. +func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + return fs.NewFile(ctx, dirent, flags, &tcpMemFile{tcpMemInode: m}), nil +} + +// +stateify savable +type tcpMemFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + tcpMemInode *tcpMemInode +} + +var _ fs.FileOperations = (*tcpMemFile)(nil) + +// Read implements fs.FileOperations.Read. +func (f *tcpMemFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { return 0, io.EOF } - s := fmt.Sprintf("%d\t%d\t%d\n", m.size.Min, m.size.Default, m.size.Max) + f.tcpMemInode.mu.Lock() + defer f.tcpMemInode.mu.Unlock() + + size, err := readSize(f.tcpMemInode.dir, f.tcpMemInode.s) + if err != nil { + return 0, err + } + s := fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max) n, err := dst.CopyOut(ctx, []byte(s)) return int64(n), err } -// Truncate implements fs.InodeOperations.Truncate. -func (*tcpMem) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { +// Write implements fs.FileOperations.Write. 
+func (f *tcpMemFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { if src.NumBytes() == 0 { return 0, nil } - src = src.TakeFirst(usermem.PageSize - 1) + f.tcpMemInode.mu.Lock() + defer f.tcpMemInode.mu.Unlock() - buf := []int32{int32(m.size.Min), int32(m.size.Default), int32(m.size.Max)} + src = src.TakeFirst(usermem.PageSize - 1) + size, err := readSize(f.tcpMemInode.dir, f.tcpMemInode.s) + if err != nil { + return 0, err + } + buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} n, cperr := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) - m.size = inet.TCPBufferSize{ + newSize := inet.TCPBufferSize{ Min: int(buf[0]), Default: int(buf[1]), Max: int(buf[2]), } - if err := m.writeSize(); err != nil { + if err := writeSize(f.tcpMemInode.dir, f.tcpMemInode.s, newSize); err != nil { return n, err } return n, cperr } -func (m *tcpMem) writeSize() error { - switch m.dir { +func readSize(dirType tcpMemDir, s inet.Stack) (inet.TCPBufferSize, error) { + switch dirType { + case tcpRMem: + return s.TCPReceiveBufferSize() + case tcpWMem: + return s.TCPSendBufferSize() + default: + panic(fmt.Sprintf("unknown tcpMemFile type: %v", dirType)) + } +} + +func writeSize(dirType tcpMemDir, s inet.Stack, size inet.TCPBufferSize) error { + switch dirType { case tcpRMem: - return m.s.SetTCPReceiveBufferSize(m.size) + return s.SetTCPReceiveBufferSize(size) case tcpWMem: - return m.s.SetTCPSendBufferSize(m.size) + return s.SetTCPSendBufferSize(size) default: - panic(fmt.Sprintf("unknown tcpMem.dir: %v", m.dir)) + panic(fmt.Sprintf("unknown tcpMemFile type: %v", dirType)) } } // +stateify savable type tcpSack struct { - ramfs.Entry - s inet.Stack `state:"wait"` + stack inet.Stack `state:"wait"` enabled *bool + fsutil.SimpleFileInode } func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { - ts := &tcpSack{s: s} - ts.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644)) + ts := &tcpSack{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + stack: s, + } sattr := fs.StableAttr{ DeviceID: device.ProcDevice.DeviceID(), InodeID: device.ProcDevice.NextIno(), @@ -122,21 +185,48 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f return fs.NewInode(ts, msrc, sattr) } -func (s *tcpSack) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// GetFile implements fs.InodeOperations.GetFile. +func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &tcpSackFile{ + tcpSack: s, + stack: s.stack, + }), nil +} + +// +stateify savable +type tcpSackFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + tcpSack *tcpSack + + stack inet.Stack `state:"wait"` +} + +// Read implements fs.FileOperations.Read. 
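// Illustrative aside: the tcp_rmem/tcp_wmem handlers above read and write a
// min/default/max triple in the same tab-separated text format Linux uses. A
// simplified standalone sketch of that round trip (strconv standing in for
// usermem.CopyInt32StringsInVec, hypothetical type names; not part of this
// change):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// bufferSize mirrors the min/default/max triple managed above.
type bufferSize struct {
	Min, Default, Max int
}

// format produces the "min\tdefault\tmax\n" line the Read side reports.
func (s bufferSize) format() string {
	return fmt.Sprintf("%d\t%d\t%d\n", s.Min, s.Default, s.Max)
}

// parse accepts up to three whitespace-separated integers and overwrites the
// corresponding fields, leaving any remaining fields unchanged.
func (s bufferSize) parse(in string) (bufferSize, error) {
	fields := strings.Fields(in)
	if len(fields) == 0 || len(fields) > 3 {
		return s, fmt.Errorf("expected 1 to 3 fields, got %d", len(fields))
	}
	dst := []*int{&s.Min, &s.Default, &s.Max}
	for i, f := range fields {
		v, err := strconv.Atoi(f)
		if err != nil {
			return s, err
		}
		*dst[i] = v
	}
	return s, nil
}

func main() {
	s := bufferSize{4096, 87380, 6291456}
	fmt.Print(s.format()) // 4096	87380	6291456
	s, _ = s.parse("100\t200\t300")
	fmt.Print(s.format()) // 100	200	300
}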
+func (f *tcpSackFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { return 0, io.EOF } - if s.enabled == nil { - sack, err := s.s.TCPSACKEnabled() + if f.tcpSack.enabled == nil { + sack, err := f.stack.TCPSACKEnabled() if err != nil { return 0, err } - s.enabled = &sack + f.tcpSack.enabled = &sack } val := "0\n" - if *s.enabled { + if *f.tcpSack.enabled { // Technically, this is not quite compatible with Linux. Linux // stores these as an integer, so if you write "2" into // tcp_sack, you should get 2 back. Tough luck. @@ -146,13 +236,8 @@ func (s *tcpSack) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, return int64(n), err } -// Truncate implements fs.InodeOperations.Truncate. -func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { +// Write implements fs.FileOperations.Write. +func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { if src.NumBytes() == 0 { return 0, nil } @@ -163,100 +248,104 @@ func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, if err != nil { return n, err } - if s.enabled == nil { - s.enabled = new(bool) + if f.tcpSack.enabled == nil { + f.tcpSack.enabled = new(bool) } - *s.enabled = v != 0 - return n, s.s.SetTCPSACKEnabled(*s.enabled) + *f.tcpSack.enabled = v != 0 + return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled) } func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, // all of these files will have mode 0444 (read-only for all users). 
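// Illustrative aside: the tcp_sack read/write pair above boils down to a small
// integer-to-boolean protocol (deliberately ignoring the Linux quirk noted in
// the comment above, where Linux echoes back the exact integer written). A
// standalone sketch with hypothetical helpers, not part of this change:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// readSACK reports the current setting as "0\n" or "1\n".
func readSACK(enabled bool) string {
	if enabled {
		return "1\n"
	}
	return "0\n"
}

// writeSACK parses a single integer and treats any non-zero value as enabled.
func writeSACK(input string) (bool, error) {
	v, err := strconv.ParseInt(strings.TrimSpace(input), 10, 32)
	if err != nil {
		return false, err
	}
	return v != 0, nil
}

func main() {
	enabled, _ := writeSACK("2\n")
	fmt.Print(readSACK(enabled)) // 1
}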
- d.AddChild(ctx, "default_qdisc", p.newStubProcFSFile(ctx, msrc, []byte("pfifo_fast"))) - d.AddChild(ctx, "message_burst", p.newStubProcFSFile(ctx, msrc, []byte("10"))) - d.AddChild(ctx, "message_cost", p.newStubProcFSFile(ctx, msrc, []byte("5"))) - d.AddChild(ctx, "optmem_max", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "rmem_default", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - d.AddChild(ctx, "rmem_max", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - d.AddChild(ctx, "somaxconn", p.newStubProcFSFile(ctx, msrc, []byte("128"))) - d.AddChild(ctx, "wmem_default", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - d.AddChild(ctx, "wmem_max", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "default_qdisc": newStaticProcInode(ctx, msrc, []byte("pfifo_fast")), + "message_burst": newStaticProcInode(ctx, msrc, []byte("10")), + "message_cost": newStaticProcInode(ctx, msrc, []byte("5")), + "optmem_max": newStaticProcInode(ctx, msrc, []byte("0")), + "rmem_default": newStaticProcInode(ctx, msrc, []byte("212992")), + "rmem_max": newStaticProcInode(ctx, msrc, []byte("212992")), + "somaxconn": newStaticProcInode(ctx, msrc, []byte("128")), + "wmem_default": newStaticProcInode(ctx, msrc, []byte("212992")), + "wmem_max": newStaticProcInode(ctx, msrc, []byte("212992")), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + contents := map[string]*fs.Inode{ + // Add tcp_sack. + "tcp_sack": newTCPSackInode(ctx, msrc, s), + + // The following files are simple stubs until they are + // implemented in netstack, most of these files are + // configuration related. We use the value closest to the + // actual netstack behavior or any empty file, all of these + // files will have mode 0444 (read-only for all users). + "ip_local_port_range": newStaticProcInode(ctx, msrc, []byte("16000 65535")), + "ip_local_reserved_ports": newStaticProcInode(ctx, msrc, []byte("")), + "ipfrag_time": newStaticProcInode(ctx, msrc, []byte("30")), + "ip_nonlocal_bind": newStaticProcInode(ctx, msrc, []byte("0")), + "ip_no_pmtu_disc": newStaticProcInode(ctx, msrc, []byte("1")), + + // tcp_allowed_congestion_control tell the user what they are + // able to do as an unprivledged process so we leave it empty. + "tcp_allowed_congestion_control": newStaticProcInode(ctx, msrc, []byte("")), + "tcp_available_congestion_control": newStaticProcInode(ctx, msrc, []byte("reno")), + "tcp_congestion_control": newStaticProcInode(ctx, msrc, []byte("reno")), + + // Many of the following stub files are features netstack + // doesn't support. The unsupported features return "0" to + // indicate they are disabled. 
+ "tcp_base_mss": newStaticProcInode(ctx, msrc, []byte("1280")), + "tcp_dsack": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_early_retrans": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_fack": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_fastopen": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_fastopen_key": newStaticProcInode(ctx, msrc, []byte("")), + "tcp_invalid_ratelimit": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_keepalive_intvl": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_keepalive_probes": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_keepalive_time": newStaticProcInode(ctx, msrc, []byte("7200")), + "tcp_mtu_probing": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_no_metrics_save": newStaticProcInode(ctx, msrc, []byte("1")), + "tcp_probe_interval": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_probe_threshold": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_retries1": newStaticProcInode(ctx, msrc, []byte("3")), + "tcp_retries2": newStaticProcInode(ctx, msrc, []byte("15")), + "tcp_rfc1337": newStaticProcInode(ctx, msrc, []byte("1")), + "tcp_slow_start_after_idle": newStaticProcInode(ctx, msrc, []byte("1")), + "tcp_synack_retries": newStaticProcInode(ctx, msrc, []byte("5")), + "tcp_syn_retries": newStaticProcInode(ctx, msrc, []byte("3")), + "tcp_timestamps": newStaticProcInode(ctx, msrc, []byte("1")), + } // Add tcp_rmem. - if rs, err := s.TCPReceiveBufferSize(); err == nil { - d.AddChild(ctx, "tcp_rmem", newTCPMemInode(ctx, msrc, s, rs, tcpRMem)) + if _, err := s.TCPReceiveBufferSize(); err == nil { + contents["tcp_rmem"] = newTCPMemInode(ctx, msrc, s, tcpRMem) } // Add tcp_wmem. - if ss, err := s.TCPSendBufferSize(); err == nil { - d.AddChild(ctx, "tcp_wmem", newTCPMemInode(ctx, msrc, s, ss, tcpWMem)) + if _, err := s.TCPSendBufferSize(); err == nil { + contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem) } - // Add tcp_sack. - d.AddChild(ctx, "tcp_sack", newTCPSackInode(ctx, msrc, s)) - - // The following files are simple stubs until they are implemented in - // netstack, most of these files are configuration related. We use the - // value closest to the actual netstack behavior or any empty file, - // all of these files will have mode 0444 (read-only for all users). - d.AddChild(ctx, "ip_local_port_range", p.newStubProcFSFile(ctx, msrc, []byte("16000 65535"))) - d.AddChild(ctx, "ip_local_reserved_ports", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "ipfrag_time", p.newStubProcFSFile(ctx, msrc, []byte("30"))) - d.AddChild(ctx, "ip_nonlocal_bind", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "ip_no_pmtu_disc", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - - // tcp_allowed_congestion_control tell the user what they are able to do as an - // unprivledged process so we leave it empty. - d.AddChild(ctx, "tcp_allowed_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "tcp_available_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte("reno"))) - d.AddChild(ctx, "tcp_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte("reno"))) - - // Many of the following stub files are features netstack doesn't support - // and are therefore "0" for disabled. 
- d.AddChild(ctx, "tcp_base_mss", p.newStubProcFSFile(ctx, msrc, []byte("1280"))) - d.AddChild(ctx, "tcp_dsack", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_early_retrans", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_fack", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_fastopen", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_fastopen_key", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "tcp_invalid_ratelimit", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_keepalive_intvl", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_keepalive_probes", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_keepalive_time", p.newStubProcFSFile(ctx, msrc, []byte("7200"))) - d.AddChild(ctx, "tcp_mtu_probing", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_no_metrics_save", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - d.AddChild(ctx, "tcp_probe_interval", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_probe_threshold", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_retries1", p.newStubProcFSFile(ctx, msrc, []byte("3"))) - d.AddChild(ctx, "tcp_retries2", p.newStubProcFSFile(ctx, msrc, []byte("15"))) - d.AddChild(ctx, "tcp_rfc1337", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - d.AddChild(ctx, "tcp_slow_start_after_idle", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - d.AddChild(ctx, "tcp_synack_retries", p.newStubProcFSFile(ctx, msrc, []byte("5"))) - d.AddChild(ctx, "tcp_syn_retries", p.newStubProcFSFile(ctx, msrc, []byte("3"))) - d.AddChild(ctx, "tcp_timestamps", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + var contents map[string]*fs.Inode if s := p.k.NetworkStack(); s != nil { - d.AddChild(ctx, "ipv4", p.newSysNetIPv4Dir(ctx, msrc, s)) - d.AddChild(ctx, "core", p.newSysNetCore(ctx, msrc, s)) + contents = map[string]*fs.Inode{ + "ipv4": p.newSysNetIPv4Dir(ctx, msrc, s), + "core": p.newSysNetCore(ctx, msrc, s), + } } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go index 7f46776c0..5f481a1cf 100644 --- a/pkg/sentry/fs/proc/sys_net_state.go +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -16,17 +16,26 @@ package proc import "fmt" +// beforeSave is invoked by stateify. +func (t *tcpMemInode) beforeSave() { + size, err := readSize(t.dir, t.s) + if err != nil { + panic(fmt.Sprintf("failed to read TCP send / receive buffer sizes: %v", err)) + } + t.size = size +} + // afterLoad is invoked by stateify. -func (m *tcpMem) afterLoad() { - if err := m.writeSize(); err != nil { - panic(fmt.Sprintf("failed to write previous TCP send / receive buffer sizes [%v]: %v", m.size, err)) +func (t *tcpMemInode) afterLoad() { + if err := writeSize(t.dir, t.s, t.size); err != nil { + panic(fmt.Sprintf("failed to write previous TCP send / receive buffer sizes [%v]: %v", t.size, err)) } } // afterLoad is invoked by stateify. 
func (s *tcpSack) afterLoad() { if s.enabled != nil { - if err := s.s.SetTCPSACKEnabled(*s.enabled); err != nil { + if err := s.stack.SetTCPSACKEnabled(*s.enabled); err != nil { panic(fmt.Sprintf("failed to set previous TCP sack configuration [%v]: %v", *s.enabled, err)) } } diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 0ce9d30f1..ea0d94fce 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -26,13 +26,14 @@ func TestQuerySendBufferSize(t *testing.T) { ctx := context.Background() s := inet.NewTestStack() s.TCPSendBufSize = inet.TCPBufferSize{100, 200, 300} - tm := newTCPMem(s, s.TCPSendBufSize, tcpWMem) + tmi := &tcpMemInode{s: s, dir: tcpWMem} + tmf := &tcpMemFile{tcpMemInode: tmi} buf := make([]byte, 100) dst := usermem.BytesIOSequence(buf) - n, err := tm.DeprecatedPreadv(ctx, dst, 0) + n, err := tmf.Read(ctx, nil, dst, 0) if err != nil { - t.Fatalf("DeprecatedPreadv failed: %v", err) + t.Fatalf("Read failed: %v", err) } if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { @@ -44,13 +45,14 @@ func TestQueryRecvBufferSize(t *testing.T) { ctx := context.Background() s := inet.NewTestStack() s.TCPRecvBufSize = inet.TCPBufferSize{100, 200, 300} - tm := newTCPMem(s, s.TCPRecvBufSize, tcpRMem) + tmi := &tcpMemInode{s: s, dir: tcpRMem} + tmf := &tcpMemFile{tcpMemInode: tmi} buf := make([]byte, 100) dst := usermem.BytesIOSequence(buf) - n, err := tm.DeprecatedPreadv(ctx, dst, 0) + n, err := tmf.Read(ctx, nil, dst, 0) if err != nil { - t.Fatalf("DeprecatedPreadv failed: %v", err) + t.Fatalf("Read failed: %v", err) } if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { @@ -85,12 +87,13 @@ func TestConfigureSendBufferSize(t *testing.T) { s := inet.NewTestStack() for _, c := range cases { s.TCPSendBufSize = c.initial - tm := newTCPMem(s, c.initial, tcpWMem) + tmi := &tcpMemInode{s: s, dir: tcpWMem} + tmf := &tcpMemFile{tcpMemInode: tmi} // Write the values. src := usermem.BytesIOSequence([]byte(c.str)) - if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { - t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + if n, err := tmf.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("Write, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) } // Read the values from the stack and check them. @@ -105,12 +108,13 @@ func TestConfigureRecvBufferSize(t *testing.T) { s := inet.NewTestStack() for _, c := range cases { s.TCPRecvBufSize = c.initial - tm := newTCPMem(s, c.initial, tcpRMem) + tmi := &tcpMemInode{s: s, dir: tcpRMem} + tmf := &tcpMemFile{tcpMemInode: tmi} // Write the values. src := usermem.BytesIOSequence([]byte(c.str)) - if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { - t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + if n, err := tmf.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("Write, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) } // Read the values from the stack and check them. 
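The beforeSave/afterLoad hooks above snapshot the live TCP buffer sizes into a serializable field and push them back into the stack on restore. A self-contained sketch of that snapshot-and-reapply shape (plain Go with invented names, not stateify itself, whose hooks take no arguments):

package main

import "fmt"

// bufSetting stands in for state whose live value lives outside the saved
// object (here behind a pointer) and must be captured and reapplied by hooks.
type bufSetting struct {
	stack *int // live location, not serialized
	size  int  // serialized copy
}

// beforeSave captures the live value into the field that will be saved.
func (b *bufSetting) beforeSave() { b.size = *b.stack }

// afterLoad reapplies the saved value to the live location on restore.
func (b *bufSetting) afterLoad(stack *int) {
	*stack = b.size
	b.stack = stack
}

func main() {
	live := 212992
	b := &bufSetting{stack: &live}
	b.beforeSave() // b.size now holds 212992 and would be serialized

	restoredLive := 0
	b.afterLoad(&restoredLive)
	fmt.Println(restoredLive) // 212992
}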
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 91bda8a95..41981a973 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -24,6 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" @@ -32,6 +33,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's @@ -57,19 +59,19 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { type taskDir struct { ramfs.Dir - // t is the associated kernel task that owns this file. - t *kernel.Task + t *kernel.Task + pidns *kernel.PIDNamespace } +var _ fs.InodeOperations = (*taskDir)(nil) + // newTaskDir creates a new proc task entry. func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace, showSubtasks bool) *fs.Inode { - d := &taskDir{t: t} - // TODO: Set EUID/EGID based on dumpability. - d.InitDir(t, map[string]*fs.Inode{ + contents := map[string]*fs.Inode{ "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgFile(t, msrc, cmdlineExecArg), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), "comm": newComm(t, msrc), - "environ": newExecArgFile(t, msrc, environExecArg), + "environ": newExecArgInode(t, msrc, environExecArg), "exe": newExe(t, msrc), "fd": newFdDir(t, msrc), "fdinfo": newFdInfoDir(t, msrc), @@ -87,11 +89,18 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "statm": newStatm(t, msrc), "status": newStatus(t, msrc, pidns), "uid_map": newUIDMap(t, msrc), - }, fs.RootOwner, fs.FilePermsFromMode(0555)) + } if showSubtasks { - d.AddChild(t, "task", newSubtasks(t, msrc, pidns)) + contents["task"] = newSubtasks(t, msrc, pidns) } - return newFile(d, msrc, fs.SpecialDirectory, t) + + // TODO: Set EUID/EGID based on dumpability. + d := &taskDir{ + Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), + t: t, + pidns: pidns, + } + return newProcInode(d, msrc, fs.SpecialDirectory, t) } // subtasks represents a /proc/TID/task directory. @@ -100,15 +109,19 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace type subtasks struct { ramfs.Dir - t *kernel.Task - + t *kernel.Task pidns *kernel.PIDNamespace } +var _ fs.InodeOperations = (*subtasks)(nil) + func newSubtasks(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { - s := &subtasks{t: t, pidns: pidns} - s.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newFile(s, msrc, fs.SpecialDirectory, t) + s := &subtasks{ + Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)), + t: t, + pidns: pidns, + } + return newProcInode(s, msrc, fs.SpecialDirectory, t) } // UnstableAttr returns unstable attributes of the subtasks. @@ -123,35 +136,52 @@ func (s *subtasks) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.Unstab return uattr, nil } -// Lookup loads an Inode in a task's subtask directory into a Dirent. 
-func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { - tid, err := strconv.ParseUint(p, 10, 32) - if err != nil { - return nil, syserror.ENOENT - } +// GetFile implements fs.InodeOperations.GetFile. +func (s *subtasks) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &subtasksFile{t: s.t, pidns: s.pidns}), nil +} - task := s.pidns.TaskWithID(kernel.ThreadID(tid)) - if task == nil { - return nil, syserror.ENOENT - } - if task.ThreadGroup() != s.t.ThreadGroup() { - return nil, syserror.ENOENT - } +// +stateify savable +type subtasksFile struct { + fsutil.DirFileOperations `state:"nosave"` - td := newTaskDir(task, dir.MountSource, s.pidns, false) - return fs.NewDirent(td, p), nil + t *kernel.Task + pidns *kernel.PIDNamespace } -// DeprecatedReaddir lists a task's subtask directory. -func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - tasks := s.t.ThreadGroup().MemberIDs(s.pidns) +// Readdir implements fs.FileOperations.Readdir. +func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + dirCtx := fs.DirCtx{ + Serializer: ser, + } + + // Note that unlike most Readdir implementations, the offset here is + // not an index into the subtasks, but rather the TID of the next + // subtask to emit. + offset := file.Offset() + + if offset == 0 { + // Serialize "." and "..". + root := fs.RootFromContext(ctx) + defer root.DecRef() + dot, dotdot := file.Dirent.GetDotAttrs(root) + if err := dirCtx.DirEmit(".", dot); err != nil { + return offset, err + } + if err := dirCtx.DirEmit("..", dotdot); err != nil { + return offset, err + } + } + + // Serialize tasks. + tasks := f.t.ThreadGroup().MemberIDs(f.pidns) taskInts := make([]int, 0, len(tasks)) for _, tid := range tasks { taskInts = append(taskInts, int(tid)) } // Find the task to start at. - idx := sort.SearchInts(taskInts, offset) + idx := sort.SearchInts(taskInts, int(offset)) if idx == len(taskInts) { return offset, nil } @@ -163,12 +193,33 @@ func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, off attr := fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) if err := dirCtx.DirEmit(name, attr); err != nil { // Returned offset is next tid to serialize. - return tid, err + return int64(tid), err } } // We serialized them all. Next offset should be higher than last // serialized tid. - return tid + 1, nil + return int64(tid) + 1, nil +} + +var _ fs.FileOperations = (*subtasksFile)(nil) + +// Lookup loads an Inode in a task's subtask directory into a Dirent. +func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + tid, err := strconv.ParseUint(p, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + + task := s.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + if task.ThreadGroup() != s.t.ThreadGroup() { + return nil, syserror.ENOENT + } + + td := newTaskDir(task, dir.MountSource, s.pidns, false) + return fs.NewDirent(td, p), nil } // exe is an fs.InodeOperations symlink for the /proc/PID/exe file. 
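subtasksFile.Readdir above deliberately uses the next thread ID, not an index, as the readdir offset, so iteration resumes correctly even if threads exit between calls. A runnable illustration of that resumption scheme with made-up data:

package main

import (
	"fmt"
	"sort"
)

// emitFrom prints every key >= offset and returns the offset to resume from:
// one past the last key emitted, not an index into the slice.
func emitFrom(keys []int, offset int) int {
	idx := sort.SearchInts(keys, offset)
	if idx == len(keys) {
		return offset // nothing left to emit
	}
	last := offset
	for _, k := range keys[idx:] {
		fmt.Println(k)
		last = k
	}
	return last + 1
}

func main() {
	tids := []int{3, 7, 12} // keys must be sorted for SearchInts
	next := emitFrom(tids, 0)      // prints 3, 7, 12
	fmt.Println(next)              // 13
	fmt.Println(emitFrom(tids, 8)) // prints 12, then 13
}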
@@ -181,9 +232,11 @@ type exe struct { } func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - exeSymlink := &exe{t: t} - exeSymlink.InitSymlink(t, fs.RootOwner, "") - return newFile(exeSymlink, msrc, fs.Symlink, t) + exeSymlink := &exe{ + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + t: t, + } + return newProcInode(exeSymlink, msrc, fs.Symlink, t) } func (e *exe) executable() (d *fs.Dirent, err error) { @@ -231,55 +284,48 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { return n, nil } -// namespaceFile represents a file in the namespacefs, such as the files in -// /proc//ns. +// namespaceSymlink represents a symlink in the namespacefs, such as the files +// in /proc//ns. // // +stateify savable -type namespaceFile struct { +type namespaceSymlink struct { ramfs.Symlink t *kernel.Task } -func newNamespaceFile(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { - n := &namespaceFile{t: t} - n.InitSymlink(t, fs.RootOwner, "") - +func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { // TODO: Namespace symlinks should contain the namespace name and the // inode number for the namespace instance, so for example user:[123456]. We // currently fake the inode number by sticking the symlink inode in its // place. - n.Target = fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno()) - - return newFile(n, msrc, fs.Symlink, t) + target := fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno()) + n := &namespaceSymlink{ + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, target), + t: t, + } + return newProcInode(n, msrc, fs.Symlink, t) } // Getlink implements fs.InodeOperations.Getlink. -func (n *namespaceFile) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { +func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { if !kernel.ContextCanTrace(ctx, n.t, false) { return nil, syserror.EACCES } // Create a new regular file to fake the namespace file. - node := &ramfs.Entry{} - node.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0777)) - sattr := fs.StableAttr{ - DeviceID: device.ProcDevice.DeviceID(), - InodeID: device.ProcDevice.NextIno(), - BlockSize: usermem.PageSize, - Type: fs.RegularFile, - } - return fs.NewDirent(fs.NewInode(node, inode.MountSource, sattr), n.Symlink.Target), nil + iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC) + return fs.NewDirent(newProcInode(iops, inode.MountSource, fs.RegularFile, nil), n.Symlink.Target), nil } func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(t, map[string]*fs.Inode{ - "net": newNamespaceFile(t, msrc, "net"), - "pid": newNamespaceFile(t, msrc, "pid"), - "user": newNamespaceFile(t, msrc, "user"), - }, fs.RootOwner, fs.FilePermsFromMode(0511)) - return newFile(d, msrc, fs.SpecialDirectory, t) + contents := map[string]*fs.Inode{ + "net": newNamespaceSymlink(t, msrc, "net"), + "pid": newNamespaceSymlink(t, msrc, "pid"), + "user": newNamespaceSymlink(t, msrc, "user"), + } + d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0511)) + return newProcInode(d, msrc, fs.SpecialDirectory, t) } // mapsData implements seqfile.SeqSource for /proc/[pid]/maps. 
@@ -290,7 +336,7 @@ type mapsData struct { } func newMaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &mapsData{t}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &mapsData{t}), msrc, fs.SpecialFile, t) } func (md *mapsData) mm() *mm.MemoryManager { @@ -330,7 +376,7 @@ type smapsData struct { } func newSmaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &smapsData{t}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &smapsData{t}), msrc, fs.SpecialFile, t) } func (sd *smapsData) mm() *mm.MemoryManager { @@ -376,7 +422,7 @@ type taskStatData struct { } func newTaskStat(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool, pidns *kernel.PIDNamespace) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -450,7 +496,7 @@ type statmData struct { } func newStatm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &statmData{t}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &statmData{t}), msrc, fs.SpecialFile, t) } // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. @@ -487,7 +533,7 @@ type statusData struct { } func newStatus(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &statusData{t, pidns}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &statusData{t, pidns}), msrc, fs.SpecialFile, t) } // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. @@ -552,7 +598,7 @@ type ioData struct { } func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -590,25 +636,49 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se // // +stateify savable type comm struct { - ramfs.Entry + fsutil.SimpleFileInode t *kernel.Task } // newComm returns a new comm file. func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - c := &comm{t: t} - c.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(c, msrc, fs.SpecialFile, t) + c := &comm{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(c, msrc, fs.SpecialFile, t) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &commFile{t: c.t}), nil +} + +// +stateify savable +type commFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + + t *kernel.Task } -// DeprecatedPreadv reads the current command name. 
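commFile above (and auxvecFile further down) picks up nearly all of fs.FileOperations by embedding fsutil's no-op mixins and then hand-implements only Read. The same composition trick in a self-contained toy form, with an invented interface standing in for fs.FileOperations:

package main

import "fmt"

// readWriter is a stand-in for a wide interface like fs.FileOperations.
type readWriter interface {
	Read() string
	Write(string) error
	Flush() error
}

// noWrite and noopFlush play the role of fsutil.FileNoWrite / FileNoopFlush:
// embeddable defaults so concrete types implement only what they care about.
type noWrite struct{}

func (noWrite) Write(string) error { return fmt.Errorf("read-only file") }

type noopFlush struct{}

func (noopFlush) Flush() error { return nil }

// commLike mirrors commFile: embed the defaults, implement Read by hand.
type commLike struct {
	noWrite
	noopFlush
	name string
}

func (c commLike) Read() string { return c.name + "\n" }

func main() {
	var f readWriter = commLike{name: "mytask"}
	fmt.Print(f.Read())       // "mytask\n"
	fmt.Println(f.Write("x")) // read-only file
}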
-func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +var _ fs.FileOperations = (*commFile)(nil) + +// Read implements fs.FileOperations.Read. +func (f *commFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } - buf := []byte(c.t.Name() + "\n") + buf := []byte(f.t.Name() + "\n") if offset >= int64(len(buf)) { return 0, io.EOF } @@ -621,25 +691,47 @@ func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, off // // +stateify savable type auxvec struct { - ramfs.Entry + fsutil.SimpleFileInode t *kernel.Task } // newAuxvec returns a new auxvec file. func newAuxvec(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - a := &auxvec{t: t} - a.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0400)) - return newFile(a, msrc, fs.SpecialFile, t) + a := &auxvec{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(a, msrc, fs.SpecialFile, t) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (a *auxvec) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &auxvecFile{t: a.t}), nil +} + +// +stateify savable +type auxvecFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + + t *kernel.Task } -// DeprecatedPreadv reads the current auxiliary vector. -func (a *auxvec) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// Read implements fs.FileOperations.Read. +func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } - m, err := getTaskMM(a.t) + m, err := getTaskMM(f.t) if err != nil { return 0, err } diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index f70399686..815c40b7f 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -90,12 +90,13 @@ func newGIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { } func newIDMap(t *kernel.Task, msrc *fs.MountSource, gids bool) *fs.Inode { - imsf := &idMapSeqFile{seqfile.SeqFile{SeqSource: &idMapSeqSource{ - t: t, - gids: gids, - }}} - imsf.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0644)) - return newFile(imsf, msrc, fs.SpecialFile, t) + imsf := &idMapSeqFile{ + *seqfile.NewSeqFile(t, &idMapSeqSource{ + t: t, + gids: gids, + }), + } + return newProcInode(imsf, msrc, fs.SpecialFile, t) } func (imsf *idMapSeqFile) source() *idMapSeqSource { @@ -106,8 +107,8 @@ func (imsf *idMapSeqFile) source() *idMapSeqSource { // Linux 3.18, the limit is five lines." - user_namespaces(7) const maxIDMapLines = 5 -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (imsf *idMapSeqFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { +// Write implements fs.FileOperations.Write. 
+func (imsf *idMapSeqFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { // "In addition, the number of bytes written to the file must be less than // the system page size, and the write must be performed at the start of // the file ..." - user_namespaces(7) diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index 80c7ce0b4..40d0fd1fd 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -18,42 +18,64 @@ import ( "fmt" "io" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // uptime is a file containing the system uptime. // // +stateify savable type uptime struct { - ramfs.Entry + fsutil.SimpleFileInode // The "start time" of the sandbox. startTime ktime.Time } // newUptime returns a new uptime file. -func (p *proc) newUptime(ctx context.Context, msrc *fs.MountSource) *fs.Inode { +func newUptime(ctx context.Context, msrc *fs.MountSource) *fs.Inode { u := &uptime{ - startTime: ktime.NowFromContext(ctx), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + startTime: ktime.NowFromContext(ctx), } - u.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(u, msrc, fs.SpecialFile, nil) + return newProcInode(u, msrc, fs.SpecialFile, nil) } -// DeprecatedPreadv reads the current uptime. -func (u *uptime) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// GetFile implements fs.InodeOperations.GetFile. +func (u *uptime) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &uptimeFile{startTime: u.startTime}), nil +} + +// +stateify savable +type uptimeFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + + startTime ktime.Time +} + +// Read implements fs.FileOperations.Read. +func (f *uptimeFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } now := ktime.NowFromContext(ctx) // Pretend that we've spent zero time sleeping (second number). 
- s := []byte(fmt.Sprintf("%.2f 0.00\n", now.Sub(u.startTime).Seconds())) + s := []byte(fmt.Sprintf("%.2f 0.00\n", now.Sub(f.startTime).Seconds())) if offset >= int64(len(s)) { return 0, io.EOF } diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index a93ad6240..a476c9cce 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -6,8 +6,6 @@ go_library( name = "ramfs", srcs = [ "dir.go", - "file.go", - "ramfs.go", "socket.go", "symlink.go", "tree.go", @@ -15,14 +13,12 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/secio", + "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", - "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 0a911b155..729f37694 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -18,10 +18,12 @@ import ( "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -47,7 +49,17 @@ type CreateOps struct { // // +stateify savable type Dir struct { - Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirTruncate `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeSimpleExtendedAttributes // CreateOps may be provided. // @@ -64,17 +76,23 @@ type Dir struct { children map[string]*fs.Inode // dentryMap is a sortedDentryMap containing entries for all children. - // Its entries ar kept up-to-date with d.children. + // Its entries are kept up-to-date with d.children. dentryMap *fs.SortedDentryMap } -// InitDir initializes a directory. -func (d *Dir) InitDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) { - d.InitEntry(ctx, owner, perms) +var _ fs.InodeOperations = (*Dir)(nil) + +// NewDir returns a new Dir with the given contents and attributes. +func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) *Dir { + d := &Dir{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.RAMFS_MAGIC), + } + if contents == nil { contents = make(map[string]*fs.Inode) } d.children = contents + // Build the entries map ourselves, rather than calling addChildLocked, // because it will be faster. entries := make(map[string]fs.DentAttr, len(contents)) @@ -88,6 +106,8 @@ func (d *Dir) InitDir(ctx context.Context, contents map[string]*fs.Inode, owner // Directories have an extra link, corresponding to '.'. d.AddLink() + + return d } // addChildLocked add the child inode, inheriting its reference. 
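A few hunks up, uptimeFile.Read renders the usual /proc/uptime layout but pins the idle-time column to zero. A tiny runnable example of just that output format, with a made-up elapsed value:

package main

import "fmt"

func main() {
	elapsed := 1234.5 // hypothetical seconds since sandbox start
	// Same "%.2f 0.00\n" layout as uptimeFile.Read: uptime seconds first,
	// then idle seconds, which the sentry always reports as zero.
	fmt.Printf("%.2f 0.00\n", elapsed) // prints "1234.50 0.00"
}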
@@ -124,17 +144,24 @@ func (d *Dir) FindChild(name string) (*fs.Inode, bool) { return child, ok } +// Children returns the names and DentAttrs of all children. It can be used to +// implement Readdir for types that embed ramfs.Dir. +func (d *Dir) Children() ([]string, map[string]fs.DentAttr) { + d.mu.Lock() + defer d.mu.Unlock() + return d.dentryMap.GetAll() +} + // removeChildLocked attempts to remove an entry from this directory. -// This Entry's mutex must be held. It returns the removed Inode. func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) { inode, ok := d.children[name] if !ok { - return nil, ErrNotFound + return nil, syserror.EACCES } delete(d.children, name) d.dentryMap.Remove(name) - d.Entry.NotifyModification(ctx) + d.NotifyModification(ctx) // If the child was a subdirectory, then we must decrement this dir's // link count which was the child's ".." directory entry. @@ -143,7 +170,7 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er } // Update ctime. - inode.NotifyStatusChange(ctx) + inode.InodeOperations.NotifyStatusChange(ctx) // Given we're now removing this inode to the directory we must also // decrease its link count. Similarly it is increased in addChildLocked. @@ -152,8 +179,8 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er return inode, nil } -// RemoveEntry attempts to remove an entry from this directory. -func (d *Dir) RemoveEntry(ctx context.Context, name string) error { +// Remove removes the named non-directory. +func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { d.mu.Lock() defer d.mu.Unlock() inode, err := d.removeChildLocked(ctx, name) @@ -166,27 +193,23 @@ func (d *Dir) RemoveEntry(ctx context.Context, name string) error { return nil } -// Remove removes the named non-directory. -func (d *Dir) Remove(ctx context.Context, dir *fs.Inode, name string) error { - return d.RemoveEntry(ctx, name) -} - // RemoveDirectory removes the named directory. -func (d *Dir) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { +func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) error { d.mu.Lock() defer d.mu.Unlock() - n, err := d.walkLocked(ctx, name) + // Get the child and make sure it is not empty. + childInode, err := d.walkLocked(ctx, name) if err != nil { return err } - dirCtx := &fs.DirCtx{} - if _, err := n.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0); err != nil { + if ok, err := hasChildren(ctx, childInode); err != nil { return err + } else if ok { + return syserror.ENOTEMPTY } - if len(dirCtx.DentAttrs()) > 0 { - return ErrNotEmpty - } + + // Child was empty. Proceed with removal. inode, err := d.removeChildLocked(ctx, name) if err != nil { return err @@ -195,11 +218,11 @@ func (d *Dir) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) e // Remove our reference on the inode. inode.DecRef() - return err + return nil } // Lookup loads an inode at p into a Dirent. -func (d *Dir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { +func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, error) { d.mu.Lock() defer d.mu.Unlock() @@ -214,9 +237,9 @@ func (d *Dir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, return fs.NewDirent(inode, p), nil } -// walkLocked must be called with this Entry's mutex held. +// walkLocked must be called with d.mu held. 
func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { - d.Entry.NotifyAccess(ctx) + d.NotifyAccess(ctx) // Lookup a child node. if inode, ok := d.children[p]; ok { @@ -244,7 +267,7 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make } d.addChildLocked(name, inode) - d.Entry.NotifyModification(ctx) + d.NotifyModification(ctx) return inode, nil } @@ -252,7 +275,7 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make // Create creates a new Inode with the given name and returns its File. func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { if d.CreateOps == nil || d.CreateOps.NewFile == nil { - return nil, ErrDenied + return nil, syserror.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { @@ -274,7 +297,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // CreateLink returns a new link. func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { if d.CreateOps == nil || d.CreateOps.NewSymlink == nil { - return ErrDenied + return syserror.EACCES } _, err := d.createInodeOperationsCommon(ctx, newname, func() (*fs.Inode, error) { return d.NewSymlink(ctx, dir, oldname) @@ -292,10 +315,10 @@ func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inod // The link count will be incremented in addChildLocked. d.addChildLocked(name, target) - d.Entry.NotifyModification(ctx) + d.NotifyModification(ctx) // Update ctime. - target.NotifyStatusChange(ctx) + target.InodeOperations.NotifyStatusChange(ctx) return nil } @@ -303,7 +326,7 @@ func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inod // CreateDirectory returns a new subdirectory. func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewDir == nil { - return ErrDenied + return syserror.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewDir(ctx, dir, perms) @@ -316,7 +339,7 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p // Bind implements fs.InodeOperations.Bind. func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { - return nil, ErrDenied + return nil, syserror.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewBoundEndpoint(ctx, dir, ep, perms) @@ -335,7 +358,7 @@ func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport // CreateFifo implements fs.InodeOperations.CreateFifo. func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewFifo == nil { - return ErrDenied + return syserror.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewFifo(ctx, dir, perms) @@ -343,29 +366,125 @@ func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms return err } -func (d *Dir) readdirLocked(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - // Serialize the entries in dentryMap. 
- n, err := fs.GenericReaddir(dirCtx, d.dentryMap) +// GetFile implements fs.InodeOperations.GetFile. +func (d *Dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + return fs.NewFile(ctx, dirent, flags, &dirFileOperations{dir: d}), nil +} - // Touch the access time. - d.Entry.NotifyAccess(ctx) +// Rename implements fs.InodeOperations.Rename. +func (*Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName) +} +// dirFileOperations implements fs.FileOperations for a ramfs directory. +// +// +stateify savable +type dirFileOperations struct { + fsutil.DirFileOperations `state:"nosave"` + + // dirCursor contains the name of the last directory entry that was + // serialized. + dirCursor string + + // dir is the ramfs dir that this file corresponds to. + dir *Dir +} + +var _ fs.FileOperations = (*dirFileOperations)(nil) + +// Seek implements fs.FileOperations.Seek. +func (dfo *dirFileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &dfo.dirCursor) +} + +// IterateDir implements DirIterator.IterateDir. +func (dfo *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + dfo.dir.mu.Lock() + defer dfo.dir.mu.Unlock() + + n, err := fs.GenericReaddir(dirCtx, dfo.dir.dentryMap) return offset + n, err } -// DeprecatedReaddir emits the entries contained in this directory. -func (d *Dir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - d.mu.Lock() - defer d.mu.Unlock() - return d.readdirLocked(ctx, dirCtx, offset) +// Readdir implements FileOperations.Readdir. +func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &dfo.dirCursor, + } + dfo.dir.mu.Lock() + dfo.dir.InodeSimpleAttributes.Unstable.AccessTime = ktime.NowFromContext(ctx) + dfo.dir.mu.Unlock() + return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset()) } -// DeprecatedPreadv always returns ErrIsDirectory -func (*Dir) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrIsDirectory +// hasChildren is a helper method that determines whether an arbitrary inode +// (not necessarily ramfs) has any children. +func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { + // Take an extra ref on inode which will be given to the dirent and + // dropped when that dirent is destroyed. + inode.IncRef() + d := fs.NewTransientDirent(inode) + defer d.DecRef() + + file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return false, err + } + defer file.DecRef() + + ser := &fs.CollectEntriesSerializer{} + if err := file.Readdir(ctx, ser); err != nil { + return false, err + } + // We will always write "." and "..", so ignore those two. + if ser.Written() > 2 { + return true, nil + } + return false, nil } -// DeprecatedPwritev always returns ErrIsDirectory -func (*Dir) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrIsDirectory +// Rename renames from a *ramfs.Dir to another *ramfs.Dir. 
+func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string) error { + op, ok := oldParent.(*Dir) + if !ok { + return syserror.EXDEV + } + np, ok := newParent.(*Dir) + if !ok { + return syserror.EXDEV + } + + np.mu.Lock() + defer np.mu.Unlock() + + // Check whether the ramfs entry to be replaced is a non-empty directory. + if replaced, ok := np.children[newName]; ok { + if fs.IsDir(replaced.StableAttr) { + if ok, err := hasChildren(ctx, replaced); err != nil { + return err + } else if ok { + return syserror.ENOTEMPTY + } + } + } + + // Be careful, we may have already grabbed this mutex above. + if op != np { + op.mu.Lock() + defer op.mu.Unlock() + } + + // Do the swap. + n := op.children[oldName] + op.removeChildLocked(ctx, oldName) + np.addChildLocked(newName, n) + + // Update ctime. + n.InodeOperations.NotifyStatusChange(ctx) + + return nil } diff --git a/pkg/sentry/fs/ramfs/file.go b/pkg/sentry/fs/ramfs/file.go deleted file mode 100644 index b7fc98ffc..000000000 --- a/pkg/sentry/fs/ramfs/file.go +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ramfs - -import ( - "io" - "sync" - - "gvisor.googlesource.com/gvisor/pkg/secio" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" -) - -// File represents a unique file. It uses a simple byte slice as storage, and -// thus should only be used for small files. -// -// A File is not mappable. -// -// +stateify savable -type File struct { - Entry - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // data tracks backing data for the file. - data []byte -} - -// InitFile initializes a file. -func (f *File) InitFile(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions) { - f.InitEntry(ctx, owner, perms) -} - -// UnstableAttr returns unstable attributes of this ramfs file. -func (f *File) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - f.mu.Lock() - defer f.mu.Unlock() - - uattr, _ := f.Entry.UnstableAttr(ctx, inode) - uattr.Size = int64(len(f.data)) - uattr.Usage = f.usageLocked() - - return uattr, nil -} - -// usageLocked returns the disk usage. Caller must hold f.mu. -func (f *File) usageLocked() int64 { - return int64(len(f.data)) -} - -// Append appends the given data. This is for internal use. -func (f *File) Append(data []byte) { - f.mu.Lock() - defer f.mu.Unlock() - f.data = append(f.data, data...) -} - -// Truncate truncates this node. -func (f *File) Truncate(ctx context.Context, inode *fs.Inode, l int64) error { - f.mu.Lock() - defer f.mu.Unlock() - if l < int64(len(f.data)) { - // Remove excess bytes. 
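The Rename helper above locks the destination directory first and takes the source directory's mutex only when it is a different Dir, so a rename within a single directory never tries to lock the same mutex twice. That locking shape in isolation (self-contained, invented types):

package main

import (
	"fmt"
	"sync"
)

type dir struct{ mu sync.Mutex }

// lockPair mirrors the ordering in ramfs Rename: always lock dst, and lock
// src only if it is a distinct directory. It returns the matching unlock.
func lockPair(src, dst *dir) (unlock func()) {
	dst.mu.Lock()
	if src != dst {
		src.mu.Lock()
		return func() { src.mu.Unlock(); dst.mu.Unlock() }
	}
	return dst.mu.Unlock
}

func main() {
	a, b := &dir{}, &dir{}
	unlock := lockPair(a, a) // rename within one directory: single lock
	unlock()
	unlock = lockPair(a, b) // cross-directory rename: both locks
	unlock()
	fmt.Println("ok")
}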
- f.data = f.data[:l] - return nil - } else if l > int64(len(f.data)) { - // Create a new slice with size l, and copy f.data into it. - d := make([]byte, l) - copy(d, f.data) - f.data = d - } - f.Entry.NotifyModification(ctx) - return nil -} - -// ReadAt implements io.ReaderAt. -func (f *File) ReadAt(data []byte, offset int64) (int, error) { - if offset < 0 { - return 0, ErrInvalidOp - } - if offset >= int64(len(f.data)) { - return 0, io.EOF - } - n := copy(data, f.data[offset:]) - // Did we read past the end? - if offset+int64(len(data)) >= int64(len(f.data)) { - return n, io.EOF - } - return n, nil -} - -// DeprecatedPreadv reads into a collection of slices from a given offset. -func (f *File) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - f.mu.Lock() - defer f.mu.Unlock() - if offset >= int64(len(f.data)) { - return 0, io.EOF - } - n, err := dst.CopyOut(ctx, f.data[offset:]) - if n > 0 { - f.Entry.NotifyAccess(ctx) - } - return int64(n), err -} - -// WriteAt implements io.WriterAt. -func (f *File) WriteAt(data []byte, offset int64) (int, error) { - if offset < 0 { - return 0, ErrInvalidOp - } - newLen := offset + int64(len(data)) - if newLen < 0 { - // Overflow. - return 0, syserror.EINVAL - } - if newLen > int64(len(f.data)) { - // Copy f.data into new slice with expanded length. - d := make([]byte, newLen) - copy(d, f.data) - f.data = d - } - return copy(f.data[offset:], data), nil -} - -// DeprecatedPwritev writes from a collection of slices at a given offset. -func (f *File) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - f.mu.Lock() - defer f.mu.Unlock() - n, err := src.CopyInTo(ctx, safemem.FromIOWriter{secio.NewOffsetWriter(f, offset)}) - if n > 0 { - f.Entry.NotifyModification(ctx) - } - return n, err -} diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go deleted file mode 100644 index d77688a34..000000000 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ /dev/null @@ -1,441 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ramfs implements an in-memory file system that can be associated with -// any device. -package ramfs - -import ( - "errors" - "sync" - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -var ( - // ErrInvalidOp indicates the operation is not valid. - ErrInvalidOp = errors.New("invalid operation") - - // ErrDenied indicates the operation was denied. 
- ErrDenied = errors.New("operation denied") - - // ErrNotFound indicates that a node was not found on a walk. - ErrNotFound = errors.New("node not found") - - // ErrCrossDevice indicates a cross-device link or rename. - ErrCrossDevice = errors.New("can't link across filesystems") - - // ErrIsDirectory indicates that the operation failed because - // the node is a directory. - ErrIsDirectory = errors.New("is a directory") - - // ErrNotDirectory indicates that the operation failed because - // the node is a not directory. - ErrNotDirectory = errors.New("not a directory") - - // ErrNotEmpty indicates that the operation failed because the - // directory is not empty. - ErrNotEmpty = errors.New("directory not empty") -) - -// Entry represents common internal state for file and directory nodes. -// This may be used by other packages to easily create ramfs files. -// -// +stateify savable -type Entry struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // unstable is unstable attributes. - unstable fs.UnstableAttr - - // xattrs are the extended attributes of the Entry. - xattrs map[string][]byte -} - -// InitEntry initializes an entry. -func (e *Entry) InitEntry(ctx context.Context, owner fs.FileOwner, p fs.FilePermissions) { - e.InitEntryWithAttr(ctx, fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: p, - // Always start unlinked. - Links: 0, - })) -} - -// InitEntryWithAttr initializes an entry with a complete set of attributes. -func (e *Entry) InitEntryWithAttr(ctx context.Context, uattr fs.UnstableAttr) { - e.unstable = uattr - e.xattrs = make(map[string][]byte) -} - -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (e *Entry) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - e.mu.Lock() - attr := e.unstable - e.mu.Unlock() - return attr, nil -} - -// Check implements fs.InodeOperations.Check. -func (*Entry) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// Getxattr implements fs.InodeOperations.Getxattr. -func (e *Entry) Getxattr(inode *fs.Inode, name string) ([]byte, error) { - // Hot path. Avoid defers. - e.mu.Lock() - value, ok := e.xattrs[name] - e.mu.Unlock() - if ok { - return value, nil - } - return nil, syserror.ENOATTR -} - -// Setxattr implements fs.InodeOperations.Setxattr. -func (e *Entry) Setxattr(inode *fs.Inode, name string, value []byte) error { - e.mu.Lock() - e.xattrs[name] = value - e.mu.Unlock() - return nil -} - -// Listxattr implements fs.InodeOperations.Listxattr. -func (e *Entry) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { - e.mu.Lock() - names := make(map[string]struct{}, len(e.xattrs)) - for name := range e.xattrs { - names[name] = struct{}{} - } - e.mu.Unlock() - return names, nil -} - -// GetFile returns a fs.File backed by the dirent argument and flags. -func (*Entry) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), nil -} - -// SetPermissions always sets the permissions. -func (e *Entry) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { - e.mu.Lock() - e.unstable.Perms = p - e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) - e.mu.Unlock() - return true -} - -// SetOwner always sets ownership. 
-func (e *Entry) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - e.mu.Lock() - if owner.UID.Ok() { - e.unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - e.unstable.Owner.GID = owner.GID - } - e.mu.Unlock() - return nil -} - -// SetTimestamps sets the timestamps. -func (e *Entry) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - e.mu.Lock() - now := ktime.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - e.unstable.AccessTime = now - } else { - e.unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - e.unstable.ModificationTime = now - } else { - e.unstable.ModificationTime = ts.MTime - } - } - e.unstable.StatusChangeTime = now - e.mu.Unlock() - return nil -} - -// NotifyStatusChange updates the status change time (ctime). -func (e *Entry) NotifyStatusChange(ctx context.Context) { - e.mu.Lock() - e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) - e.mu.Unlock() -} - -// StatusChangeTime returns the last status change time for this node. -func (e *Entry) StatusChangeTime() ktime.Time { - e.mu.Lock() - t := e.unstable.StatusChangeTime - e.mu.Unlock() - return t -} - -// NotifyModification updates the modification time and the status change time. -func (e *Entry) NotifyModification(ctx context.Context) { - e.mu.Lock() - now := ktime.NowFromContext(ctx) - e.unstable.ModificationTime = now - e.unstable.StatusChangeTime = now - e.mu.Unlock() -} - -// ModificationTime returns the last modification time for this node. -func (e *Entry) ModificationTime() ktime.Time { - e.mu.Lock() - t := e.unstable.ModificationTime - e.mu.Unlock() - return t -} - -// NotifyAccess updates the access time. -func (e *Entry) NotifyAccess(ctx context.Context) { - e.mu.Lock() - now := ktime.NowFromContext(ctx) - e.unstable.AccessTime = now - e.mu.Unlock() -} - -// AccessTime returns the last access time for this node. -func (e *Entry) AccessTime() ktime.Time { - e.mu.Lock() - t := e.unstable.AccessTime - e.mu.Unlock() - return t -} - -// Permissions returns permissions on this entry. -func (e *Entry) Permissions() fs.FilePermissions { - e.mu.Lock() - p := e.unstable.Perms - e.mu.Unlock() - return p -} - -// Lookup is not supported by default. -func (*Entry) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { - return nil, ErrInvalidOp -} - -// Create is not supported by default. -func (*Entry) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { - return nil, ErrInvalidOp -} - -// CreateLink is not supported by default. -func (*Entry) CreateLink(context.Context, *fs.Inode, string, string) error { - return ErrInvalidOp -} - -// CreateHardLink is not supported by default. -func (*Entry) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { - return ErrInvalidOp -} - -// IsVirtual returns true. -func (*Entry) IsVirtual() bool { - return true -} - -// CreateDirectory is not supported by default. -func (*Entry) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return ErrInvalidOp -} - -// Bind is not supported by default. -func (*Entry) Bind(context.Context, *fs.Inode, string, transport.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { - return nil, ErrInvalidOp -} - -// CreateFifo implements fs.InodeOperations.CreateFifo. CreateFifo is not supported by -// default. 
-func (*Entry) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return ErrInvalidOp -} - -// Remove is not supported by default. -func (*Entry) Remove(context.Context, *fs.Inode, string) error { - return ErrInvalidOp -} - -// RemoveDirectory is not supported by default. -func (*Entry) RemoveDirectory(context.Context, *fs.Inode, string) error { - return ErrInvalidOp -} - -// StatFS always returns ENOSYS. -func (*Entry) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syscall.ENOSYS -} - -// Rename implements fs.InodeOperations.Rename. -func (e *Entry) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName) -} - -// Rename renames from a *ramfs.Dir to another *ramfs.Dir. -func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string) error { - op, ok := oldParent.(*Dir) - if !ok { - return ErrCrossDevice - } - np, ok := newParent.(*Dir) - if !ok { - return ErrCrossDevice - } - - np.mu.Lock() - defer np.mu.Unlock() - - // Check whether the ramfs entry to be replaced is a non-empty directory. - if replaced, ok := np.children[newName]; ok { - if fs.IsDir(replaced.StableAttr) { - // FIXME: simplify by pinning children of ramfs-backed directories - // in the Dirent tree: this allows us to generalize ramfs operations without - // relying on an implementation of Readdir (which may do anything, like require - // that the file be open ... which would be reasonable). - dirCtx := &fs.DirCtx{} - _, err := replaced.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0) - if err != nil { - return err - } - attrs := dirCtx.DentAttrs() - - // ramfs-backed directories should not contain "." and "..", but we do this - // just in case. - delete(attrs, ".") - delete(attrs, "..") - - // If the directory to be replaced is not empty, reject the rename. - if len(attrs) != 0 { - return ErrNotEmpty - } - } - } - - // Be careful, we may have already grabbed this mutex above. - if op != np { - op.mu.Lock() - defer op.mu.Unlock() - } - - // Do the swap. - n := op.children[oldName] - op.removeChildLocked(ctx, oldName) - np.addChildLocked(newName, n) - - // Update ctime. - n.NotifyStatusChange(ctx) - - return nil -} - -// Truncate is not supported by default. -func (*Entry) Truncate(context.Context, *fs.Inode, int64) error { - return ErrInvalidOp -} - -// Readlink always returns ENOLINK. -func (*Entry) Readlink(context.Context, *fs.Inode) (string, error) { - return "", syscall.ENOLINK -} - -// Getlink always returns ENOLINK. -func (*Entry) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { - return nil, syscall.ENOLINK -} - -// Release is a no-op. -func (e *Entry) Release(context.Context) {} - -// AddLink implements InodeOperationss.AddLink. -func (e *Entry) AddLink() { - e.mu.Lock() - e.unstable.Links++ - e.mu.Unlock() -} - -// DropLink implements InodeOperationss.DropLink. -func (e *Entry) DropLink() { - e.mu.Lock() - e.unstable.Links-- - e.mu.Unlock() -} - -// DeprecatedReaddir is not supported by default. -func (*Entry) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { - return 0, ErrNotDirectory -} - -// DeprecatedPreadv always returns ErrInvalidOp. -func (*Entry) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrInvalidOp -} - -// DeprecatedPwritev always returns ErrInvalidOp. 
-func (*Entry) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrInvalidOp -} - -// DeprecatedFsync is a noop. -func (*Entry) DeprecatedFsync() error { - // Ignore, this is in memory. - return nil -} - -// DeprecatedFlush always returns nil. -func (*Entry) DeprecatedFlush() error { - return nil -} - -// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. -func (*Entry) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { - return nil, false -} - -func init() { - // Register ramfs errors. - syserror.AddErrorTranslation(ErrInvalidOp, syscall.EINVAL) - syserror.AddErrorTranslation(ErrDenied, syscall.EACCES) - syserror.AddErrorTranslation(ErrNotFound, syscall.ENOENT) - syserror.AddErrorTranslation(ErrCrossDevice, syscall.EXDEV) - syserror.AddErrorTranslation(ErrIsDirectory, syscall.EISDIR) - syserror.AddErrorTranslation(ErrNotDirectory, syscall.ENOTDIR) - syserror.AddErrorTranslation(ErrNotEmpty, syscall.ENOTEMPTY) -} diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 8c81478c8..2c1295897 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -15,25 +15,42 @@ package ramfs import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // Socket represents a socket. // // +stateify savable type Socket struct { - Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeSimpleExtendedAttributes // ep is the bound endpoint. ep transport.BoundEndpoint } -// InitSocket initializes a socket. -func (s *Socket) InitSocket(ctx context.Context, ep transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) { - s.InitEntry(ctx, owner, perms) - s.ep = ep +var _ fs.InodeOperations = (*Socket)(nil) + +// NewSocket returns a new Socket. +func NewSocket(ctx context.Context, ep transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) *Socket { + return &Socket{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.SOCKFS_MAGIC), + ep: ep, + } } // BoundEndpoint returns the socket data. @@ -42,3 +59,24 @@ func (s *Socket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { // care about the path argument. return s.ep } + +// GetFile implements fs.FileOperations.GetFile. 
+func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &socketFileOperations{}), nil +} + +// +stateify savable +type socketFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` +} + +var _ fs.FileOperations = (*socketFileOperations)(nil) diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index a21fac2c7..47dae380b 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -15,44 +15,55 @@ package ramfs import ( - "sync" - + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // Symlink represents a symlink. // // +stateify savable type Symlink struct { - Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` - mu sync.Mutex `state:"nosave"` + fsutil.InodeSimpleAttributes + fsutil.InodeSimpleExtendedAttributes // Target is the symlink target. Target string } -// InitSymlink initializes a symlink, pointing to the given target. -// A symlink is assumed to always have permissions 0777. -func (s *Symlink) InitSymlink(ctx context.Context, owner fs.FileOwner, target string) { - s.InitEntry(ctx, owner, fs.FilePermsFromMode(0777)) - s.Target = target +var _ fs.InodeOperations = (*Symlink)(nil) + +// NewSymlink returns a new Symlink. +func NewSymlink(ctx context.Context, owner fs.FileOwner, target string) *Symlink { + // A symlink is assumed to always have permissions 0777. + return &Symlink{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(0777), linux.RAMFS_MAGIC), + Target: target, + } } // UnstableAttr returns all attributes of this ramfs symlink. func (s *Symlink) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr, err := s.InodeSimpleAttributes.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } uattr.Size = int64(len(s.Target)) uattr.Usage = uattr.Size return uattr, nil } -// Check implements InodeOperations.Check. -func (s *Symlink) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - // SetPermissions on a symlink is always rejected. func (s *Symlink) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { return false @@ -60,10 +71,7 @@ func (s *Symlink) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) // Readlink reads the symlink value. 
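The Socket and Symlink conversions above follow one recipe: embed the fsutil "not supported" and no-op mixins for every fs.InodeOperations method the node does not care about, embed fsutil.InodeSimpleAttributes for metadata, and build the value with fsutil.NewInodeSimpleAttributes. The sketch below is illustrative only and is not part of this change; noteNode and its choice of magic number are made-up names, and a real node must still provide GetFile and any other type-specific methods, exactly as Socket and Symlink do.

    // Package note sketches the fsutil mixin recipe used by ramfs.Socket and
    // ramfs.Symlink. Illustrative only.
    package note

    import (
        "gvisor.googlesource.com/gvisor/pkg/abi/linux"
        "gvisor.googlesource.com/gvisor/pkg/sentry/context"
        "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
        "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
    )

    // noteNode is a minimal virtual, non-directory node. Everything it does
    // not implement is satisfied by an embedded fsutil stub; attributes and
    // extended attributes come from the two embedded Simple types.
    //
    // +stateify savable
    type noteNode struct {
        fsutil.InodeGenericChecker `state:"nosave"`
        fsutil.InodeNoopRelease    `state:"nosave"`
        fsutil.InodeNoopWriteOut   `state:"nosave"`
        fsutil.InodeNotDirectory   `state:"nosave"`
        fsutil.InodeNotMappable    `state:"nosave"`
        fsutil.InodeNotSocket      `state:"nosave"`
        fsutil.InodeNotSymlink     `state:"nosave"`
        fsutil.InodeNotTruncatable `state:"nosave"`
        fsutil.InodeVirtual        `state:"nosave"`

        fsutil.InodeSimpleAttributes
        fsutil.InodeSimpleExtendedAttributes
    }

    // newNoteNode mirrors NewSocket and NewSymlink: the only per-node state
    // worth constructing is the simple attributes (owner, perms, fs magic).
    func newNoteNode(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions) *noteNode {
        return &noteNode{
            InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.RAMFS_MAGIC),
        }
    }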
func (s *Symlink) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { - s.mu.Lock() - defer s.mu.Unlock() - - s.Entry.NotifyAccess(ctx) + s.NotifyAccess(ctx) return s.Target, nil } @@ -72,3 +80,24 @@ func (s *Symlink) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { func (*Symlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { return nil, fs.ErrResolveViaReadlink } + +// GetFile implements fs.FileOperations.GetFile. +func (s *Symlink) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &symlinkFileOperations{}), nil +} + +// +stateify savable +type symlinkFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` +} + +var _ fs.FileOperations = (*symlinkFileOperations)(nil) diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD deleted file mode 100644 index 187eac49d..000000000 --- a/pkg/sentry/fs/ramfs/test/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -package(licenses = ["notice"]) # Apache 2.0 - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "test", - testonly = 1, - srcs = ["test.go"], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/context", - "//pkg/sentry/fs", - "//pkg/sentry/fs/ramfs", - ], -) diff --git a/pkg/sentry/fs/ramfs/test/test.go b/pkg/sentry/fs/ramfs/test/test.go deleted file mode 100644 index 11bff7729..000000000 --- a/pkg/sentry/fs/ramfs/test/test.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package test provides a simple ramfs-based filesystem for use in testing. -package test - -import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" -) - -// Dir is a simple ramfs.Dir that supports save/restore as-is. -type Dir struct { - ramfs.Dir -} - -// NewDir returns a simple ramfs directory with the passed contents. -func NewDir(ctx context.Context, contents map[string]*fs.Inode, perms fs.FilePermissions) *Dir { - d := &Dir{} - d.InitDir(ctx, contents, fs.RootOwner, perms) - return d -} - -// File is a simple ramfs.File that supports save/restore as-is. -type File struct { - ramfs.File -} - -// NewFile returns a simple ramfs File. 
-func NewFile(ctx context.Context, perms fs.FilePermissions) *File { - f := &File{} - f.InitFile(ctx, fs.RootOwner, perms) - return f -} diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index 29a70f698..f6d5ffdec 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -60,8 +60,7 @@ func makeSubdir(ctx context.Context, msrc *fs.MountSource, root *Dir, subdir str // emptyDir returns an empty *ramfs.Dir that is traversable but not writable. func emptyDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - dir := &Dir{} - dir.InitDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0555)) + dir := NewDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(dir, msrc, fs.StableAttr{ DeviceID: anon.PseudoDevice.DeviceID(), InodeID: anon.PseudoDevice.NextIno(), diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 54df2143c..8bee9cfc1 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -22,7 +22,7 @@ import ( ) func TestMakeDirectoryTree(t *testing.T) { - mount := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + mount := fs.NewPseudoMountSource() for _, test := range []struct { name string diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 5ba23d5da..7de928e16 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -13,12 +13,13 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel", "//pkg/sentry/usermem", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index e64aa0edc..8b728a4e4 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -16,43 +16,50 @@ package sys import ( "fmt" - "io" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" ) // +stateify savable type cpunum struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeStaticFileGetter + + // k is the system kernel. 
+ k *kernel.Kernel } -func (c *cpunum) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } +var _ fs.InodeOperations = (*cpunum)(nil) +func newPossible(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + var maxCore uint k := kernel.KernelFromContext(ctx) - if k == nil { - return 0, io.EOF + if k != nil { + maxCore = k.ApplicationCores() - 1 } + contents := []byte(fmt.Sprintf("0-%d\n", maxCore)) - str := []byte(fmt.Sprintf("0-%d\n", k.ApplicationCores()-1)) - if offset >= int64(len(str)) { - return 0, io.EOF + c := &cpunum{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.SYSFS_MAGIC), + InodeStaticFileGetter: fsutil.InodeStaticFileGetter{ + Contents: contents, + }, } - - n, err := dst.CopyOut(ctx, str[offset:]) - return int64(n), err -} - -func newPossible(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - c := &cpunum{} - c.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) return newFile(c, msrc) } diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 5ce33f87f..301fef038 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -24,6 +24,8 @@ import ( // +stateify savable type filesystem struct{} +var _ fs.Filesystem = (*filesystem)(nil) + func init() { fs.RegisterFilesystem(&filesystem{}) } diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index 7cc1942c7..c5b56fe69 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -22,13 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// sys is a root sys node. -// -// +stateify savable -type sys struct { - ramfs.Dir -} - func newFile(node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { sattr := fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), @@ -40,8 +33,7 @@ func newFile(node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { } func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { - d := &sys{} - d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), InodeID: sysfsDevice.NextIno(), diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 7423e816c..b26466b9d 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -33,12 +33,12 @@ import ( // // +stateify savable type TimerOperations struct { - fsutil.ZeroSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + fsutil.FileZeroSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` events waiter.Queue `state:"zerovalue"` timer *ktime.Timer diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 9065cdd5d..14c7a9e62 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -23,11 +23,13 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", + 
"//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 1f9d69909..2c1eb0fd2 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -28,13 +28,13 @@ import ( // // +stateify savable type regularFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.NoopRelease `state:"nosave"` - fsutil.GenericSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoopFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` // iops is the InodeOperations of a regular tmpfs file. It is // guaranteed to be the same as file.Dirent.Inode.InodeOperations, diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index 02da9af82..e7bbdc404 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -52,19 +52,19 @@ func TestGrow(t *testing.T) { abuf := bytes.Repeat([]byte{'a'}, 68) n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) if n != int64(len(abuf)) || err != nil { - t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(abuf)) + t.Fatalf("Pwritev got (%d, %v) want (%d, nil)", n, err, len(abuf)) } bbuf := bytes.Repeat([]byte{'b'}, 856) n, err = f.Pwritev(ctx, usermem.BytesIOSequence(bbuf), 68) if n != int64(len(bbuf)) || err != nil { - t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(bbuf)) + t.Fatalf("Pwritev got (%d, %v) want (%d, nil)", n, err, len(bbuf)) } rbuf := make([]byte, len(abuf)+len(bbuf)) n, err = f.Preadv(ctx, usermem.BytesIOSequence(rbuf), 0) if n != int64(len(rbuf)) || err != nil { - t.Fatalf("DeprecatedPreadv got (%d, %v) want (%d, nil)", n, err, len(rbuf)) + t.Fatalf("Preadv got (%d, %v) want (%d, nil)", n, err, len(rbuf)) } if want := append(abuf, bbuf...); !bytes.Equal(rbuf, want) { diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 88f85b85a..caa3220ee 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -50,6 +50,8 @@ const ( // +stateify savable type Filesystem struct{} +var _ fs.Filesystem = (*Filesystem)(nil) + func init() { fs.RegisterFilesystem(&Filesystem{}) } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index ca2b4aabb..42d4bc76f 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -46,11 +47,13 @@ import ( // // +stateify savable type fileInodeOperations struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + 
fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + + fsutil.InodeSimpleExtendedAttributes // kernel is used to allocate platform memory that stores the file's contents. kernel *kernel.Kernel @@ -62,10 +65,10 @@ type fileInodeOperations struct { // attr contains the unstable metadata for the file. // - // attr is protected by attrMu. attr.Unstable.Size is protected by both - // attrMu and dataMu; reading it requires locking either mutex, while - // mutating it requires locking both. - attr fsutil.InMemoryAttributes + // attr is protected by attrMu. attr.Size is protected by both attrMu + // and dataMu; reading it requires locking either mutex, while mutating + // it requires locking both. + attr fs.UnstableAttr mapsMu sync.Mutex `state:"nosave"` @@ -83,12 +86,12 @@ type fileInodeOperations struct { data fsutil.FileRangeSet } +var _ fs.InodeOperations = (*fileInodeOperations)(nil) + // NewInMemoryFile returns a new file backed by p.Memory(). func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr, k *kernel.Kernel) fs.InodeOperations { return &fileInodeOperations{ - attr: fsutil.InMemoryAttributes{ - Unstable: uattr, - }, + attr: uattr, kernel: k, memUsage: usage, } @@ -121,71 +124,56 @@ func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags f // UnstableAttr returns unstable attributes of this tmpfs file. func (f *fileInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { f.attrMu.Lock() - defer f.attrMu.Unlock() f.dataMu.RLock() - defer f.dataMu.RUnlock() - attr := f.attr.Unstable + attr := f.attr attr.Usage = int64(f.data.Span()) + f.dataMu.RUnlock() + f.attrMu.Unlock() return attr, nil } -// Getxattr implements fs.InodeOperations.Getxattr. -func (f *fileInodeOperations) Getxattr(inode *fs.Inode, name string) ([]byte, error) { - f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.Getxattr(name) -} - -// Setxattr implements fs.InodeOperations.Setxattr. -func (f *fileInodeOperations) Setxattr(inode *fs.Inode, name string, value []byte) error { - f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.Setxattr(name, value) -} - -// Listxattr implements fs.InodeOperations.Listxattr. -func (f *fileInodeOperations) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { - f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.Listxattr() -} - // Check implements fs.InodeOperations.Check. func (f *fileInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { return fs.ContextCanAccessFile(ctx, inode, p) } // SetPermissions implements fs.InodeOperations.SetPermissions. -func (f *fileInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { +func (f *fileInodeOperations) SetPermissions(ctx context.Context, _ *fs.Inode, p fs.FilePermissions) bool { f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.SetPermissions(ctx, p) + f.attr.SetPermissions(ctx, p) + f.attrMu.Unlock() + return true } // SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (f *fileInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { +func (f *fileInodeOperations) SetTimestamps(ctx context.Context, _ *fs.Inode, ts fs.TimeSpec) error { f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.SetTimestamps(ctx, ts) + f.attr.SetTimestamps(ctx, ts) + f.attrMu.Unlock() + return nil } // SetOwner implements fs.InodeOperations.SetOwner. 
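With fsutil.InMemoryAttributes gone, the tmpfs file keeps a plain fs.UnstableAttr and updates timestamps itself via ktime.NowFromContext, as the Truncate and write methods just below do. The helper sketched here is not part of this change; it only factors out that repeated update, assumes it sits in inode_file.go next to those methods, and relies on the caller holding attrMu as they do.

    // touchModificationTime mirrors the pattern in Truncate and write: a
    // content change bumps both mtime and ctime. Caller must hold f.attrMu.
    func (f *fileInodeOperations) touchModificationTime(ctx context.Context) {
        now := ktime.NowFromContext(ctx)
        f.attr.ModificationTime = now
        f.attr.StatusChangeTime = now
    }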
-func (f *fileInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { +func (f *fileInodeOperations) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error { f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.SetOwner(ctx, owner) + f.attr.SetOwner(ctx, owner) + f.attrMu.Unlock() + return nil } // Truncate implements fs.InodeOperations.Truncate. -func (f *fileInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { +func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size int64) error { f.attrMu.Lock() defer f.attrMu.Unlock() f.dataMu.Lock() - oldSize := f.attr.Unstable.Size + oldSize := f.attr.Size if oldSize != size { - f.attr.Unstable.Size = size - f.attr.TouchModificationTime(ctx) + f.attr.Size = size + // Update mtime and ctime. + now := ktime.NowFromContext(ctx) + f.attr.ModificationTime = now + f.attr.StatusChangeTime = now } f.dataMu.Unlock() @@ -220,21 +208,21 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, siz // AddLink implements fs.InodeOperations.AddLink. func (f *fileInodeOperations) AddLink() { f.attrMu.Lock() - f.attr.Unstable.Links++ + f.attr.Links++ f.attrMu.Unlock() } // DropLink implements fs.InodeOperations.DropLink. func (f *fileInodeOperations) DropLink() { f.attrMu.Lock() - f.attr.Unstable.Links-- + f.attr.Links-- f.attrMu.Unlock() } // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. func (f *fileInodeOperations) NotifyStatusChange(ctx context.Context) { f.attrMu.Lock() - f.attr.TouchStatusChangeTime(ctx) + f.attr.StatusChangeTime = ktime.NowFromContext(ctx) f.attrMu.Unlock() } @@ -264,7 +252,7 @@ func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, // TODO: Separate out f.attr.Size and use atomics instead of // f.dataMu. f.dataMu.RLock() - size := f.attr.Unstable.Size + size := f.attr.Size f.dataMu.RUnlock() if offset >= size { return 0, io.EOF @@ -273,7 +261,7 @@ func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, n, err := dst.CopyOutFrom(ctx, &fileReadWriter{f, offset}) // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). f.attrMu.Lock() - f.attr.TouchAccessTime(ctx) + f.attr.AccessTime = ktime.NowFromContext(ctx) f.attrMu.Unlock() return n, err } @@ -287,7 +275,9 @@ func (f *fileInodeOperations) write(ctx context.Context, src usermem.IOSequence, f.attrMu.Lock() defer f.attrMu.Unlock() // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). - f.attr.TouchModificationTime(ctx) + now := ktime.NowFromContext(ctx) + f.attr.ModificationTime = now + f.attr.StatusChangeTime = now return src.CopyInTo(ctx, &fileReadWriter{f, offset}) } @@ -302,10 +292,10 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { defer rw.f.dataMu.RUnlock() // Compute the range to read. - if rw.offset >= rw.f.attr.Unstable.Size { + if rw.offset >= rw.f.attr.Size { return 0, io.EOF } - end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.f.attr.Unstable.Size) + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.f.attr.Size) if end == rw.offset { // dsts.NumBytes() == 0? return 0, nil } @@ -371,8 +361,8 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) defer func() { // If the write ends beyond the file's previous size, it causes the // file to grow. 
- if rw.offset > rw.f.attr.Unstable.Size { - rw.f.attr.Unstable.Size = rw.offset + if rw.offset > rw.f.attr.Size { + rw.f.attr.Size = rw.offset } }() @@ -450,9 +440,9 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional f.dataMu.Lock() defer f.dataMu.Unlock() - // Constrain translations to f.attr.Unstable.Size (rounded up) to prevent + // Constrain translations to f.attr.Size (rounded up) to prevent // translation to pages that may be concurrently truncated. - pgend := fs.OffsetPageEnd(f.attr.Unstable.Size) + pgend := fs.OffsetPageEnd(f.attr.Size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 40a8c4b1e..a0277a132 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -19,12 +19,14 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) var fsInfo = fs.Info{ @@ -39,32 +41,54 @@ var fsInfo = fs.Info{ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { op, ok := oldParent.InodeOperations.(*Dir) if !ok { - return ramfs.ErrCrossDevice + return syserror.EXDEV } np, ok := newParent.InodeOperations.(*Dir) if !ok { - return ramfs.ErrCrossDevice + return syserror.EXDEV } - return ramfs.Rename(ctx, &op.Dir, oldName, &np.Dir, newName) + return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName) } // Dir is a directory. // // +stateify savable type Dir struct { - ramfs.Dir + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirTruncate `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + // Ideally this would be embedded, so that we "inherit" all of the + // InodeOperations implemented by ramfs.Dir for free. + // + // However, ramfs.dirFileOperations stores a pointer to a ramfs.Dir, + // and our save/restore package does not allow saving a pointer to an + // embedded field elsewhere. + // + // Thus, we must make the ramfs.Dir is a field, and we delegate all the + // InodeOperation methods to it. + ramfsDir *ramfs.Dir // kernel is used to allocate platform memory as storage for tmpfs Files. kernel *kernel.Kernel } +var _ fs.InodeOperations = (*Dir)(nil) + // NewDir returns a new directory. func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, kernel *kernel.Kernel) *fs.Inode { - d := &Dir{kernel: kernel} - d.InitDir(ctx, contents, owner, perms) + d := &Dir{ + ramfsDir: ramfs.NewDir(ctx, contents, owner, perms), + kernel: kernel, + } // Manually set the CreateOps. 
- d.CreateOps = d.newCreateOps() + d.ramfsDir.CreateOps = d.newCreateOps() return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), @@ -77,7 +101,107 @@ func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwn // afterLoad is invoked by stateify. func (d *Dir) afterLoad() { // Per NewDir, manually set the CreateOps. - d.Dir.CreateOps = d.newCreateOps() + d.ramfsDir.CreateOps = d.newCreateOps() +} + +// GetFile implements fs.InodeOperations.GetFile. +func (d *Dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return d.ramfsDir.GetFile(ctx, dirent, flags) +} + +// AddLink implements fs.InodeOperations.AddLink. +func (d *Dir) AddLink() { + d.ramfsDir.AddLink() +} + +// DropLink implements fs.InodeOperations.DropLink. +func (d *Dir) DropLink() { + d.ramfsDir.DropLink() +} + +// Bind implements fs.InodeOperations.Bind. +func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { + return d.ramfsDir.Bind(ctx, dir, name, ep, perms) +} + +// Create implements fs.InodeOperations.Create. +func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { + return d.ramfsDir.Create(ctx, dir, name, flags, perms) +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { + return d.ramfsDir.CreateLink(ctx, dir, oldname, newname) +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { + return d.ramfsDir.CreateHardLink(ctx, dir, target, name) +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + return d.ramfsDir.CreateDirectory(ctx, dir, name, perms) +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + return d.ramfsDir.CreateFifo(ctx, dir, name, perms) +} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (d *Dir) Getxattr(i *fs.Inode, name string) ([]byte, error) { + return d.ramfsDir.Getxattr(i, name) +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (d *Dir) Setxattr(i *fs.Inode, name string, value []byte) error { + return d.ramfsDir.Setxattr(i, name, value) +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (d *Dir) Listxattr(i *fs.Inode) (map[string]struct{}, error) { + return d.ramfsDir.Listxattr(i) +} + +// Lookup implements fs.InodeOperations.Lookup. +func (d *Dir) Lookup(ctx context.Context, i *fs.Inode, p string) (*fs.Dirent, error) { + return d.ramfsDir.Lookup(ctx, i, p) +} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (d *Dir) NotifyStatusChange(ctx context.Context) { + d.ramfsDir.NotifyStatusChange(ctx) +} + +// Remove implements fs.InodeOperations.Remove. +func (d *Dir) Remove(ctx context.Context, i *fs.Inode, name string) error { + return d.ramfsDir.Remove(ctx, i, name) +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. 
+func (d *Dir) RemoveDirectory(ctx context.Context, i *fs.Inode, name string) error { + return d.ramfsDir.RemoveDirectory(ctx, i, name) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (d *Dir) UnstableAttr(ctx context.Context, i *fs.Inode) (fs.UnstableAttr, error) { + return d.ramfsDir.UnstableAttr(ctx, i) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (d *Dir) SetPermissions(ctx context.Context, i *fs.Inode, p fs.FilePermissions) bool { + return d.ramfsDir.SetPermissions(ctx, i, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (d *Dir) SetOwner(ctx context.Context, i *fs.Inode, owner fs.FileOwner) error { + return d.ramfsDir.SetOwner(ctx, i, owner) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (d *Dir) SetTimestamps(ctx context.Context, i *fs.Inode, ts fs.TimeSpec) error { + return d.ramfsDir.SetTimestamps(ctx, i, ts) } // newCreateOps builds the custom CreateOps for this Dir. @@ -132,8 +256,7 @@ type Symlink struct { // NewSymlink returns a new symlink with the provided permissions. func NewSymlink(ctx context.Context, target string, owner fs.FileOwner, msrc *fs.MountSource) *fs.Inode { - s := &Symlink{} - s.InitSymlink(ctx, owner, target) + s := &Symlink{Symlink: *ramfs.NewSymlink(ctx, owner, target)} return fs.NewInode(s, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), @@ -157,12 +280,12 @@ func (s *Symlink) StatFS(context.Context) (fs.Info, error) { // +stateify savable type Socket struct { ramfs.Socket + fsutil.InodeNotTruncatable `state:"nosave"` } // NewSocket returns a new socket with the provided permissions. func NewSocket(ctx context.Context, socket transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { - s := &Socket{} - s.InitSocket(ctx, socket, owner, perms) + s := &Socket{Socket: *ramfs.NewSocket(ctx, socket, owner, perms)} return fs.NewInode(s, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), @@ -185,15 +308,22 @@ func (s *Socket) StatFS(context.Context) (fs.Info, error) { // // +stateify savable type Fifo struct { - ramfs.Entry + fs.InodeOperations } // NewFifo creates a new named pipe. func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { - f := &Fifo{} - f.InitEntry(ctx, owner, perms) - iops := pipe.NewInodeOperations(f, pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)) - return fs.NewInode(iops, msrc, fs.StableAttr{ + // First create a pipe. + p := pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) + + // Build pipe InodeOperations. + iops := pipe.NewInodeOperations(ctx, perms, p) + + // Wrap the iops with our Fifo. + fifoIops := &Fifo{iops} + + // Build a new Inode. 
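As the comment on the ramfsDir field explains, tmpfs.Dir keeps the ramfs.Dir behind a named field instead of embedding it, because ramfs.dirFileOperations holds a *ramfs.Dir and the save/restore package cannot encode a pointer into an embedded field; the cost is the block of forwarding methods above. A compressed sketch of that shape, with an illustrative wrappedDir type that is not part of this change and assumes the ramfs, context and fs imports used above:

    // wrappedDir shows the delegation-over-embedding workaround: keep the
    // ramfs.Dir behind a named field and forward each fs.InodeOperations
    // method to it explicitly.
    type wrappedDir struct {
        inner *ramfs.Dir // a named field on purpose, never embedded
    }

    // Lookup forwards to the wrapped ramfs.Dir, exactly as tmpfs.Dir.Lookup does.
    func (d *wrappedDir) Lookup(ctx context.Context, i *fs.Inode, name string) (*fs.Dirent, error) {
        return d.inner.Lookup(ctx, i, name)
    }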
+ return fs.NewInode(fifoIops, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), BlockSize: usermem.PageSize, diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 2b45069a6..011cb6955 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -7,7 +7,6 @@ go_library( srcs = [ "dir.go", "fs.go", - "inode.go", "line_discipline.go", "master.go", "queue.go", @@ -25,7 +24,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index e32b05c1d..485cdb456 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -52,13 +52,17 @@ import ( // // +stateify savable type dirInodeOperations struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotRenameable `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes // msrc is the super block this directory is on. // @@ -68,9 +72,6 @@ type dirInodeOperations struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` - // attr contains the UnstableAttrs. - attr fsutil.InMemoryAttributes - // master is the master PTY inode. master *fs.Inode @@ -97,15 +98,10 @@ var _ fs.InodeOperations = (*dirInodeOperations)(nil) // newDir creates a new dir with a ptmx file and no terminals. func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { d := &dirInodeOperations{ - attr: fsutil.InMemoryAttributes{ - Unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.RootOwner, - Perms: fs.FilePermsFromMode(0555), - }), - }, - msrc: m, - slaves: make(map[uint32]*fs.Inode), - dentryMap: fs.NewSortedDentryMap(nil), + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC), + msrc: m, + slaves: make(map[uint32]*fs.Inode), + dentryMap: fs.NewSortedDentryMap(nil), } // Linux devpts uses a default mode of 0000 for ptmx which can be // changed with the ptmxmode mount option. However, that default is not @@ -224,70 +220,6 @@ func (d *dirInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent, fla return fs.NewFile(ctx, dirent, flags, &dirFileOperations{di: d}), nil } -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (d *dirInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.Unstable, nil -} - -// Check implements fs.InodeOperations.Check. -func (d *dirInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions. 
-func (d *dirInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.SetPermissions(ctx, p) -} - -// SetOwner implements fs.InodeOperations.SetOwner. -func (d *dirInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.SetOwner(ctx, owner) -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (d *dirInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.SetTimestamps(ctx, ts) -} - -// Truncate implements fs.InodeOperations.Truncate. -func (d *dirInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return syserror.EINVAL -} - -// AddLink implements fs.InodeOperations.AddLink. -func (d *dirInodeOperations) AddLink() {} - -// DropLink implements fs.InodeOperations.DropLink. -func (d *dirInodeOperations) DropLink() {} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (d *dirInodeOperations) NotifyStatusChange(ctx context.Context) { - d.mu.Lock() - defer d.mu.Unlock() - - d.attr.TouchStatusChangeTime(ctx) -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -func (d *dirInodeOperations) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. -func (d *dirInodeOperations) StatFS(ctx context.Context) (fs.Info, error) { - return fs.Info{ - Type: linux.DEVPTS_SUPER_MAGIC, - }, nil -} - // allocateTerminal creates a new Terminal and installs a pts node for it. // // The caller must call DecRef when done with the returned Terminal. @@ -353,13 +285,13 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { // // +stateify savable type dirFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.NoopRelease `state:"nosave"` - fsutil.GenericSeek `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` // di is the inode operations. di *dirInodeOperations diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go deleted file mode 100644 index d5d1caafc..000000000 --- a/pkg/sentry/fs/tty/inode.go +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package tty - -import ( - "sync" - - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/syserror" -) - -// inodeOperations are the base fs.InodeOperations for master and slave Inodes. -// -// inodeOperations does not implement: -// -// * fs.InodeOperations.Release -// * fs.InodeOperations.GetFile -// -// +stateify savable -type inodeOperations struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // uattr is the inode's UnstableAttr. - uattr fs.UnstableAttr -} - -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - i.mu.Lock() - defer i.mu.Unlock() - return i.uattr, nil -} - -// Check implements fs.InodeOperations.Check. -func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions -func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { - i.mu.Lock() - defer i.mu.Unlock() - i.uattr.Perms = p - i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) - return true -} - -// SetOwner implements fs.InodeOperations.SetOwner. -func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - i.mu.Lock() - defer i.mu.Unlock() - if owner.UID.Ok() { - i.uattr.Owner.UID = owner.UID - } - if owner.GID.Ok() { - i.uattr.Owner.GID = owner.GID - } - return nil -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - i.mu.Lock() - defer i.mu.Unlock() - - now := ktime.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATime.IsZero() { - i.uattr.AccessTime = now - } else { - i.uattr.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTime.IsZero() { - i.uattr.ModificationTime = now - } else { - i.uattr.ModificationTime = ts.MTime - } - } - i.uattr.StatusChangeTime = now - return nil -} - -// Truncate implements fs.InodeOperations.Truncate. -func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return syserror.EINVAL -} - -// AddLink implements fs.InodeOperations.AddLink. -func (i *inodeOperations) AddLink() { -} - -// DropLink implements fs.InodeOperations.DropLink. -func (i *inodeOperations) DropLink() { -} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (i *inodeOperations) NotifyStatusChange(ctx context.Context) { - i.mu.Lock() - defer i.mu.Unlock() - i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -func (i *inodeOperations) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. 
-func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { - return fs.Info{ - Type: linux.DEVPTS_SUPER_MAGIC, - }, nil -} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 00bec4c2c..b5e13ab36 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -31,7 +31,7 @@ import ( // // +stateify savable type masterInodeOperations struct { - inodeOperations + fsutil.SimpleFileInode // d is the containing dir. d *dirInodeOperations @@ -42,15 +42,8 @@ var _ fs.InodeOperations = (*masterInodeOperations)(nil) // newMasterInode creates an Inode for the master end of a terminal. func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { iops := &masterInodeOperations{ - inodeOperations: inodeOperations{ - uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: p, - Links: 1, - // Size and Blocks are always 0. - }), - }, - d: d, + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC), + d: d, } return fs.NewInode(iops, d.msrc, fs.StableAttr{ @@ -102,11 +95,11 @@ func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flag // // +stateify savable type masterFileOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` // d is the containing dir. d *dirInodeOperations diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index a696fbb51..6dbce90b4 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -30,7 +30,7 @@ import ( // // +stateify savable type slaveInodeOperations struct { - inodeOperations + fsutil.SimpleFileInode // d is the containing dir. d *dirInodeOperations @@ -46,16 +46,9 @@ var _ fs.InodeOperations = (*slaveInodeOperations)(nil) // newSlaveInode takes ownership of t. func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { iops := &slaveInodeOperations{ - inodeOperations: inodeOperations{ - uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: p, - Links: 1, - // Size and Blocks are always 0. - }), - }, - d: d, - t: t, + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC), + d: d, + t: t, } return fs.NewInode(iops, d.msrc, fs.StableAttr{ @@ -91,11 +84,11 @@ func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags // // +stateify savable type slaveFileOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` // si is the inode operations. 
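The pty master and slave file operations above, and the epoll, eventfd and pipe types below, compose the renamed File* mixins in the same way. The type sketched here is illustrative only (it is not part of this change and assumes the fsutil and waiter imports used above); note that it does not yet satisfy fs.FileOperations on its own, since Read and Write are still missing.

    // streamFileOperations sketches a stream-like file built from mixins:
    // seek, readdir, fsync, flush, mmap, ioctl and release are stubbed out,
    // readiness is always signalled, and only Read and Write remain to be
    // written by hand.
    type streamFileOperations struct {
        waiter.AlwaysReady       `state:"nosave"`
        fsutil.FilePipeSeek      `state:"nosave"`
        fsutil.FileNotDirReaddir `state:"nosave"`
        fsutil.FileNoFsync       `state:"nosave"`
        fsutil.FileNoopFlush     `state:"nosave"`
        fsutil.FileNoMMap        `state:"nosave"`
        fsutil.FileNoIoctl       `state:"nosave"`
        fsutil.FileNoopRelease   `state:"nosave"`
    }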
si *slaveInodeOperations diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 9c13ecfcc..502395f18 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -99,12 +99,12 @@ func (p *pollEntry) WeakRefGone() { // // +stateify savable type EventPoll struct { - fsutil.PipeSeek `state:"zerovalue"` - fsutil.NotDirReaddir `state:"zerovalue"` - fsutil.NoFsync `state:"zerovalue"` - fsutil.NoopFlush `state:"zerovalue"` - fsutil.NoMMap `state:"zerovalue"` - fsutil.NoIoctl `state:"zerovalue"` + fsutil.FilePipeSeek `state:"zerovalue"` + fsutil.FileNotDirReaddir `state:"zerovalue"` + fsutil.FileNoFsync `state:"zerovalue"` + fsutil.FileNoopFlush `state:"zerovalue"` + fsutil.FileNoMMap `state:"zerovalue"` + fsutil.FileNoIoctl `state:"zerovalue"` // Wait queue is used to notify interested parties when the event poll // object itself becomes readable or writable. diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 063a1d5f5..2d43c986d 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -38,13 +38,13 @@ import ( // // +stateify savable type EventOperations struct { - fsutil.NoopRelease `state:"nosave"` - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` // Mutex that protects accesses to the fields of this event. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4b0e00b85..1336b6293 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -17,17 +17,30 @@ package pipe import ( "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/amutex" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/syserror" ) -// inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. +// inodeOperations implements fs.InodeOperations for pipes. // // +stateify savable type inodeOperations struct { - fs.InodeOperations + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -46,12 +59,15 @@ type inodeOperations struct { wWakeup chan struct{} `state:"nosave"` } -// NewInodeOperations creates a new pipe fs.InodeOperations. -func NewInodeOperations(base fs.InodeOperations, p *Pipe) fs.InodeOperations { +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// NewInodeOperations returns a new fs.InodeOperations for a given pipe. 
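The replacement constructor defined just below takes a context and permissions instead of a base fs.InodeOperations, matching the callers updated in tmpfs.NewFifo above and in the pipe tests below. A usage sketch, with an illustrative helper name and the assumption that ctx is a task or kernel context:

    // newNamedPipeIops builds InodeOperations for a named pipe; the caller
    // now chooses the permissions explicitly.
    func newNamedPipeIops(ctx context.Context) fs.InodeOperations {
        p := pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
        perms := fs.FilePermissions{
            User: fs.PermMask{Read: true, Write: true},
        }
        return pipe.NewInodeOperations(ctx, perms, p)
    }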
+func NewInodeOperations(ctx context.Context, perms fs.FilePermissions, p *Pipe) *inodeOperations { return &inodeOperations{ - InodeOperations: base, - p: p, + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), perms, linux.PIPEFS_MAGIC), + p: p, } + } // GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking @@ -164,18 +180,6 @@ func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Slee } } -// Truncate implements fs.InodeOperations.Truncate -// -// This method is required to override the default i.InodeOperations.Truncate -// which may return ErrInvalidOperation, this allows open related -// syscalls to set the O_TRUNC flag without returning an error by -// calling Truncate directly during openat. The ftruncate and truncate -// system calls will check that the file is an actual file and return -// EINVAL because it's a PIPE, making this behavior consistent with linux. -func (i *inodeOperations) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - // newHandleLocked signals a new pipe reader or writer depending on where // 'wakeupChan' points. This unblocks any corresponding reader or writer // waiting for the other end of the channel to be opened, see Fifo.waitFor. diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index eda551594..ad103b195 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -53,6 +53,10 @@ type openResult struct { error } +var perms fs.FilePermissions = fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, +} + func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { file, err := n.GetFile(ctx, nil, flags) if err != nil { @@ -93,8 +97,8 @@ func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Durati } func TestReadOpenBlocksForWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) @@ -111,8 +115,8 @@ func TestReadOpenBlocksForWriteOpen(t *testing.T) { } func TestWriteOpenBlocksForReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) wDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) @@ -129,8 +133,8 @@ func TestWriteOpenBlocksForReadOpen(t *testing.T) { } func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone1 := make(chan struct{}) rDone2 := make(chan struct{}) @@ -151,8 +155,8 @@ func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { } func TestClosedReaderBlocksWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) rFile.DecRef() @@ -172,8 +176,8 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) { } func TestReadWriteOpenNeverBlocks(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rwDone := make(chan 
struct{}) // Open for read-write never wait for a reader or writer, even if the @@ -183,8 +187,8 @@ func TestReadWriteOpenNeverBlocks(t *testing.T) { } func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) @@ -197,8 +201,8 @@ func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { } func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) wDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) @@ -211,8 +215,8 @@ func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { } func TestBlockedOpenIsCancellable(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) done := make(chan openResult) go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done) @@ -233,18 +237,18 @@ func TestBlockedOpenIsCancellable(t *testing.T) { } } -func TestNonblockingReadOpenNoWriters(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) +func TestNonblockingReadOpenFileNoWriters(t *testing.T) { ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { t.Fatalf("Nonblocking open for read failed with error %v.", err) } } -func TestNonblockingWriteOpenNoReaders(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) +func TestNonblockingWriteOpenFileNoReaders(t *testing.T) { ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) @@ -252,8 +256,8 @@ func TestNonblockingWriteOpenNoReaders(t *testing.T) { } func TestNonBlockingReadOpenWithWriter(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) wDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) @@ -271,8 +275,8 @@ func TestNonBlockingReadOpenWithWriter(t *testing.T) { } func TestNonBlockingWriteOpenWithReader(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) @@ -290,8 +294,8 @@ func TestNonBlockingWriteOpenWithReader(t *testing.T) { } func TestAnonReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newAnonPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil { t.Fatalf("open anon pipe for read failed: %v", err) @@ -299,8 +303,8 @@ func TestAnonReadOpen(t *testing.T) { } func TestAnonWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newAnonPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true}, nil); err != nil { t.Fatalf("open anon pipe for write failed: %v", err) diff --git a/pkg/sentry/kernel/pipe/pipe.go 
b/pkg/sentry/kernel/pipe/pipe.go index 126054826..fad077d2d 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -25,11 +25,9 @@ import ( "sync/atomic" "syscall" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -50,7 +48,7 @@ type Pipe struct { isNamed bool // The dirent backing this pipe. Shared by all readers and writers. - dirent *fs.Dirent + Dirent *fs.Dirent // The buffered byte queue. data ilist.List @@ -97,28 +95,19 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *P // Build the fs.Dirent of this pipe, shared by all fs.Files associated // with this pipe. + perms := fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + } + iops := NewInodeOperations(ctx, perms, p) ino := pipeDevice.NextIno() - base := fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.PIPEFS_MAGIC, - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }) sattr := fs.StableAttr{ Type: fs.Pipe, DeviceID: pipeDevice.DeviceID(), InodeID: ino, BlockSize: int64(atomicIOBytes), } - // There is no real filesystem backing this pipe, so we pass in a nil - // Filesystem. - sb := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) - p.dirent = fs.NewDirent(fs.NewInode(NewInodeOperations(base, p), sb, sattr), fmt.Sprintf("pipe:[%d]", ino)) - + ms := fs.NewPseudoMountSource() + p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) return p } @@ -135,7 +124,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*f // ROpen opens the pipe for reading. func (p *Pipe) ROpen(ctx context.Context) *fs.File { p.rOpen() - return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true}, &Reader{ + return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Read: true}, &Reader{ ReaderWriter: ReaderWriter{Pipe: p}, }) } @@ -143,7 +132,7 @@ func (p *Pipe) ROpen(ctx context.Context) *fs.File { // WOpen opens the pipe for writing. 
func (p *Pipe) WOpen(ctx context.Context) *fs.File { p.wOpen() - return fs.NewFile(ctx, p.dirent, fs.FileFlags{Write: true}, &Writer{ + return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Write: true}, &Writer{ ReaderWriter: ReaderWriter{Pipe: p}, }) } @@ -152,7 +141,7 @@ func (p *Pipe) WOpen(ctx context.Context) *fs.File { func (p *Pipe) RWOpen(ctx context.Context) *fs.File { p.rOpen() p.wOpen() - return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ + return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ Pipe: p, }) } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 36be1efc3..028175530 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -35,11 +35,11 @@ import ( // // +stateify savable type ReaderWriter struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` *Pipe } diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 437cc5da1..c070c7316 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -20,7 +20,6 @@ import ( "io" "gvisor.googlesource.com/gvisor/pkg/abi" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -38,20 +37,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// byteReaderFileOperations implements fs.FileOperations for reading -// from a []byte source. -type byteReader struct { - fsutil.NoopRelease - fsutil.PipeSeek - fsutil.NotDirReaddir - fsutil.NoFsync - fsutil.NoopFlush - fsutil.NoMMap - fsutil.NoIoctl - waiter.AlwaysReady - data []byte -} - type fileContext struct { context.Context } @@ -65,17 +50,34 @@ func (f *fileContext) Value(key interface{}) interface{} { } } +// byteReader implements fs.FileOperations for reading from a []byte source. +type byteReader struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + + data []byte +} + +var _ fs.FileOperations = (*byteReader)(nil) + // newByteReaderFile creates a fake file to read data from. func newByteReaderFile(data []byte) *fs.File { // Create a fake inode. - inode := fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.ANON_INODE_FS_MAGIC, - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ - Type: fs.Anonymous, - DeviceID: anon.PseudoDevice.DeviceID(), - InodeID: anon.PseudoDevice.NextIno(), - BlockSize: usermem.PageSize, - }) + inode := fs.NewInode( + &fsutil.SimpleFileInode{}, + fs.NewPseudoMountSource(), + fs.StableAttr{ + Type: fs.Anonymous, + DeviceID: anon.PseudoDevice.DeviceID(), + InodeID: anon.PseudoDevice.NextIno(), + BlockSize: usermem.PageSize, + }) // Use the fake inode to create a fake dirent. 
dirent := fs.NewTransientDirent(inode) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index d65b5f49e..ca865b111 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -138,11 +138,11 @@ type commonEndpoint interface { // // +stateify savable type SocketOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout *waiter.Queue diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index f3ecb6dc3..2c54e8de2 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -46,11 +46,11 @@ const ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. type socketOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout fd int // must be O_NONBLOCK diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0a7d4772c..5b0c11c84 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -65,11 +65,11 @@ var netlinkSocketDevice = device.NewAnonDevice() // // +stateify savable type Socket struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout // ports provides netlink port allocation. diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 8c8ebadb7..13681100e 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -45,11 +45,11 @@ import ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. type socketOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout fd uint32 // must be O_NONBLOCK diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 9d4aaeb9d..e28d2c4fa 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -178,18 +178,12 @@ func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (* // NewDirent returns a sockfs fs.Dirent that resides on device d. 
func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { ino := d.NextIno() - // There is no real filesystem backing this pipe, so we pass in a nil - // Filesystem. - inode := fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.SOCKFS_MAGIC, - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + iops := &fsutil.SimpleFileInode{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, linux.SOCKFS_MAGIC), + } + inode := fs.NewInode(iops, fs.NewPseudoMountSource(), fs.StableAttr{ Type: fs.Socket, DeviceID: d.DeviceID(), InodeID: ino, diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index da225eabb..19258e692 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -45,11 +45,11 @@ import ( // // +stateify savable type SocketOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` refs.AtomicRefCount socket.SendReceiveTimeout diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 1e75b0efc..942315d6e 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -489,9 +489,7 @@ func mustFindFilesystem(name string) fs.Filesystem { // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { - // There is no real filesystem backing this ramfs tree, so we pass in - // "nil" here. 
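The pipefs and sockfs hunks above converge on a single pattern for inodes that have no backing filesystem: build the inode operations around fsutil.NewInodeSimpleAttributes, pair them with fs.NewPseudoMountSource() instead of the old fs.NewNonCachingMountSource(nil, ...), and wrap the result in a Dirent. Condensed from the NewPipe hunk above (same identifiers as the diff; a sketch for reading, not a drop-in replacement):

    // How NewPipe builds its Dirent after this change.
    perms := fs.FilePermissions{
        User: fs.PermMask{Read: true, Write: true},
    }
    // NewInodeOperations wraps fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.PIPEFS_MAGIC).
    iops := NewInodeOperations(ctx, perms, p)
    ino := pipeDevice.NextIno()
    sattr := fs.StableAttr{
        Type:      fs.Pipe,
        DeviceID:  pipeDevice.DeviceID(),
        InodeID:   ino,
        BlockSize: int64(atomicIOBytes),
    }
    // No real filesystem backs the pipe, so a pseudo mount source stands in.
    ms := fs.NewPseudoMountSource()
    p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))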
- msrc := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + msrc := fs.NewPseudoMountSource() mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts) if err != nil { return nil, fmt.Errorf("error creating mount tree: %v", err) diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index e64df97b0..6ffe9aed6 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -334,6 +334,11 @@ int ReadlinkWhileExited(std::string const& basename, char* buf, size_t count) { return ret; } +TEST(ProcTest, NotFoundInRoot) { + struct stat s; + EXPECT_THAT(stat("/proc/foobar", &s), SyscallFailsWithErrno(ENOENT)); +} + TEST(ProcSelfTest, IsThreadGroupLeader) { ScopedThread([] { const pid_t tgid = getpid(); -- cgit v1.2.3 From 92cf3764e032740f0c84a1b242c54b99f45a6bf0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 15 Jan 2019 14:12:19 -0800 Subject: Create working directory if it doesn't yet exist PiperOrigin-RevId: 229438125 Change-Id: I58eb0d10178d1adfc709d7b859189d1acbcb2f22 --- runsc/container/container_test.go | 21 +++++++++++++++++++++ runsc/container/fs.go | 12 +++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index affb51fab..bd8655f3e 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1699,6 +1699,27 @@ func TestDestroyStarting(t *testing.T) { } } +func TestCreateWorkingDir(t *testing.T) { + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + dir := path.Join(tmpDir, "new/working/dir") + + // touch will fail if the directory doesn't exist. + spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) + spec.Process.Cwd = dir + spec.Root.Readonly = true + + if err := run(spec, conf); err != nil { + t.Fatalf("Error running container: %v", err) + } + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/fs.go b/runsc/container/fs.go index 41022686b..97195550f 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -87,7 +87,7 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mou // container. dst, err := resolveSymlinks(spec.Root.Path, m.Destination) if err != nil { - return nil, fmt.Errorf("failed to resolve symlinks: %v", err) + return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) } flags := optionsToFlags(m.Options) @@ -113,6 +113,16 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mou rv = append(rv, cpy) } + if spec.Process.Cwd != "" { + dst, err := resolveSymlinks(spec.Root.Path, spec.Process.Cwd) + if err != nil { + return nil, fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) + } + if err := os.MkdirAll(dst, 0755); err != nil { + return nil, err + } + } + // If root is read only, check if it needs to be remounted as readonly. 
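The container/fs.go hunk above is the whole working-directory fix: resolve spec.Process.Cwd against the container root and create it with MkdirAll before the container starts. A minimal host-side analogue using only the standard library (filepath.Join stands in for runsc's resolveSymlinks helper, so this illustrates the idea rather than the real code path):

    package main

    import (
        "fmt"
        "os"
        "path/filepath"
    )

    // ensureCwd creates the working directory inside rootfs if it does not
    // exist yet, mirroring the MkdirAll call added to setupFS.
    func ensureCwd(rootfs, cwd string) error {
        if cwd == "" {
            return nil
        }
        // Simplification: runsc resolves symlinks relative to the rootfs;
        // here the paths are only joined.
        dst := filepath.Join(rootfs, cwd)
        if err := os.MkdirAll(dst, 0755); err != nil {
            return fmt.Errorf("creating working dir %q: %v", dst, err)
        }
        return nil
    }

    func main() {
        if err := ensureCwd("/tmp/rootfs", "/new/working/dir"); err != nil {
            fmt.Println(err)
        }
    }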
if spec.Root.Readonly { isMountPoint, readonly, err := mountInfo(spec.Root.Path) -- cgit v1.2.3 From e4d3ca7263291b43cdc49c7553c62608be062cd9 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 16 Jan 2019 12:47:21 -0800 Subject: Prevent internal tmpfs mount to override files in /tmp Runsc wants to mount /tmp using internal tmpfs implementation for performance. However, it risks hiding files that may exist under /tmp in case it's present in the container. Now, it only mounts over /tmp iff: - /tmp was not explicitly asked to be mounted - /tmp is empty If any of this is not true, then /tmp maps to the container's image /tmp. Note: checkpoint doesn't have sentry FS mounted to check if /tmp is empty. It simply looks for explicit mounts right now. PiperOrigin-RevId: 229607856 Change-Id: I10b6dae7ac157ef578efc4dfceb089f3b94cde06 --- runsc/boot/fs.go | 174 +++++++++++++++++++---------- runsc/boot/loader_test.go | 21 ++-- runsc/container/multi_container_test.go | 8 +- runsc/test/integration/integration_test.go | 13 +++ test/syscalls/BUILD | 1 + test/syscalls/syscall_test_runner.go | 8 +- 6 files changed, 157 insertions(+), 68 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 942315d6e..e0c8291ac 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -81,9 +81,11 @@ func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } -// createMountNamespace creates a mount namespace containing the root filesystem +// setupRootContainerFS creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. -func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int) (*fs.MountNamespace, error) { +// 'setMountNS' is called after namespace is created. It must set the mount NS +// to 'rootCtx'. +func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error { mounts := compileMounts(spec) // Create a tmpfs mount where we create and mount a root filesystem for @@ -96,32 +98,24 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec fds := &fdDispenser{fds: goferFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { - return nil, fmt.Errorf("failed to create root mount: %v", err) + return fmt.Errorf("failed to create root mount: %v", err) } mns, err := fs.NewMountNamespace(userCtx, rootInode) if err != nil { - return nil, fmt.Errorf("failed to create root mount namespace: %v", err) + return fmt.Errorf("failed to create root mount namespace: %v", err) } + setMountNS(mns) root := mns.Root() defer root.DecRef() - for _, m := range mounts { - if err := mountSubmount(rootCtx, conf, mns, root, fds, m, mounts); err != nil { - return nil, fmt.Errorf("mount submount: %v", err) - } - } - - if !fds.empty() { - return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) - } - return mns, nil + return mountSubmounts(rootCtx, conf, mns, root, mounts, fds) } // compileMounts returns the supported mounts from the mount spec, adding any // mandatory mounts that are required by the OCI specification. func compileMounts(spec *specs.Spec) []specs.Mount { - // Keep track of whether proc, sys, and tmp were mounted. - var procMounted, sysMounted, tmpMounted bool + // Keep track of whether proc and sys were mounted. 
+ var procMounted, sysMounted bool var mounts []specs.Mount // Always mount /dev. @@ -147,8 +141,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount { procMounted = true case "/sys": sysMounted = true - case "/tmp": - tmpMounted = true } } @@ -168,20 +160,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount { }) } - // Technically we don't have to mount tmpfs at /tmp, as we could just - // rely on the host /tmp, but this is a nice optimization, and fixes - // some apps that call mknod in /tmp. - if !tmpMounted { - // TODO: If the host /tmp (or a mount at /tmp) has - // files in it, we should overlay our tmpfs implementation over - // that. Until then, the /tmp mount will always appear empty at - // container creation. - mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: tmpfs, - Destination: "/tmp", - }) - } - // The mandatory mounts should be ordered right after the root, in case // there are submounts of these mandatory mounts already in the spec. mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...) @@ -288,6 +266,23 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, opts, useOverlay, err } +func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error { + for _, m := range mounts { + if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil { + return fmt.Errorf("mount submount %q: %v", m.Destination, err) + } + } + + if err := mountTmp(ctx, conf, mns, root, fds, mounts); err != nil { + return fmt.Errorf("mount submount %q: %v", "tmp", err) + } + + if !fds.empty() { + return fmt.Errorf("not all mount points were consumed, remaining: %v", fds) + } + return nil +} + // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. @@ -453,11 +448,27 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) // Add submounts. + var tmpMounted bool for _, m := range compileMounts(spec) { if err := addRestoreMount(conf, renv, m, fds); err != nil { return nil, err } + if filepath.Clean(m.Destination) == "/tmp" { + tmpMounted = true + } } + + // TODO: handle '/tmp' properly (see mountTmp()). + if !tmpMounted { + tmpMount := specs.Mount{ + Type: tmpfs, + Destination: "/tmp", + } + if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil { + return nil, err + } + } + return renv, nil } @@ -555,28 +566,13 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf mns := k.RootMountNamespace() if mns == nil { // Setup the root container. - - // Create the virtual filesystem. - mns, err := createMountNamespace(ctx, rootCtx, spec, conf, goferFDs) - if err != nil { - return fmt.Errorf("error creating mounts: %v", err) - } - k.SetRootMountNamespace(mns) - - // We're done with root container. - return nil + return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) { + k.SetRootMountNamespace(mns) + }) } // Setup a child container. - - // Create the container's root filesystem mount. 
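The rename from createMountNamespace to setupRootContainerFS also threads a setMountNS callback through: the submount pass (including mountTmp below) walks the brand-new root, so the namespace has to be registered on the kernel, and therefore visible through rootCtx, before the submounts are mounted. The new call site from the hunk above, shown on its own:

    // Root container: register the mount namespace as soon as it exists,
    // then let setupRootContainerFS mount the submounts (including /tmp).
    return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
        k.SetRootMountNamespace(mns)
    })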
log.Infof("Creating new process in child container.") - fds := &fdDispenser{fds: append([]int{}, goferFDs...)} - rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) - if err != nil { - return fmt.Errorf("error creating filesystem for container: %v", err) - } - globalRoot := mns.Root() defer globalRoot.DecRef() @@ -595,6 +591,13 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf } defer containerRoot.DecRef() + // Create the container's root filesystem mount. + fds := &fdDispenser{fds: goferFDs} + rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + if err != nil { + return fmt.Errorf("error creating filesystem for container: %v", err) + } + // Mount the container's root filesystem to the newly created mount point. if err := mns.Mount(ctx, containerRoot, rootInode); err != nil { return fmt.Errorf("mount container root: %v", err) @@ -606,20 +609,20 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf if err != nil { return fmt.Errorf("find container mount point %q: %v", cid, err) } + cu := specutils.MakeCleanup(func() { containerRoot.DecRef() }) + defer cu.Clean() log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid)) + // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it. + procArgs.Root = containerRoot + // Mount all submounts. mounts := compileMounts(spec) - for _, m := range mounts { - if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), containerRoot, fds, m, mounts); err != nil { - containerRoot.DecRef() - return fmt.Errorf("error mounting filesystem for container: %v", err) - } + if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil { + return err } - - // Set the procArgs root directory. - procArgs.Root = containerRoot + cu.Release() return nil } @@ -705,3 +708,58 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error return nil } + +// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. +// Technically we don't have to mount tmpfs at /tmp, as we could just rely on +// the host /tmp, but this is a nice optimization, and fixes some apps that call +// mknod in /tmp. It's unsafe to mount tmpfs if: +// 1. /tmp is mounted explictly: we should not override user's wish +// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp +// +// Note that when there are submounts inside of '/tmp', directories for the +// mount points must be present, making '/tmp' not empty anymore. +func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, mounts []specs.Mount) error { + for _, m := range mounts { + if filepath.Clean(m.Destination) == "/tmp" { + log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) + return nil + } + } + + maxTraversals := uint(0) + tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals) + switch err { + case nil: + // Found '/tmp' in filesystem, check if it's empty. + defer tmp.DecRef() + f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) + if err != nil { + return err + } + defer f.DecRef() + serializer := &fs.CollectEntriesSerializer{} + if err := f.Readdir(ctx, serializer); err != nil { + return err + } + // If more than "." and ".." is found, skip internal tmpfs to prevent hiding + // existing files. 
+ if len(serializer.Order) > 2 { + log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp") + return nil + } + log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp") + fallthrough + + case syserror.ENOENT: + // No '/tmp' found (or fallthrough from above). Safe to mount internal + // tmpfs. + tmpMount := specs.Mount{ + Type: tmpfs, + Destination: "/tmp", + } + return mountSubmount(ctx, conf, mns, root, fds, tmpMount, mounts) + + default: + return err + } +} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 0ed3002e0..4fcc0faea 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -398,16 +398,21 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, []int{sandEnd}) - if err != nil { + // setupRootContainerFS needs to find root from the context after the + // namespace is created. + var mns *fs.MountNamespace + setMountNS := func(m *fs.MountNamespace) { + mns = m + ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root()) + } + if err := setupRootContainerFS(ctx, ctx, &tc.spec, conf, []int{sandEnd}, setMountNS); err != nil { t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } - defer mm.DecRef() - root := mm.Root() + root := mns.Root() defer root.DecRef() for _, p := range tc.expectedPaths { maxTraversals := uint(0) - if d, err := mm.FindInode(ctx, root, root, p, &maxTraversals); err != nil { + if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { d.DecRef() @@ -569,14 +574,14 @@ func TestRestoreEnvironment(t *testing.T) { }, }, "tmpfs": { - { - Dev: "none", - }, { Dev: "none", Flags: fs.MountSourceFlags{NoAtime: true}, Data: "uid=1022", }, + { + Dev: "none", + }, }, "devtmpfs": { { diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 6b3c41a9b..8490999ea 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -739,6 +739,11 @@ func TestMultiContainerGoferStop(t *testing.T) { t.Fatal("error finding test_app:", err) } + dir, err := ioutil.TempDir(testutil.TmpDir(), "gofer-stop-test") + if err != nil { + t.Fatal("ioutil.TempDir failed:", err) + } + // Setup containers. Root container just reaps children, while the others // perform some IOs. Children are executed in 3 batches of 10. Within the // batch there is overlap between containers starting and being destroyed. In @@ -746,7 +751,8 @@ func TestMultiContainerGoferStop(t *testing.T) { cmds := [][]string{{app, "reaper"}} const batchSize = 10 for i := 0; i < 3*batchSize; i++ { - cmds = append(cmds, []string{"sh", "-c", "find /bin -type f | head | xargs -I SRC cp SRC /tmp/output"}) + cmd := "find /bin -type f | head | xargs -I SRC cp SRC " + dir + cmds = append(cmds, []string{"sh", "-c", cmd}) } allSpecs, allIDs := createSpecs(cmds...) diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 526b3a7a1..4a2770d48 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -279,6 +279,19 @@ func TestJobControl(t *testing.T) { } } +// TestTmpFile checks that files inside '/tmp' are not overridden. In addition, +// it checks that working dir is created if it doesn't exit. 
+func TestTmpFile(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("tmp-file-test") + if err := d.Run("-w=/tmp/foo/bar", "--read-only", "alpine", "touch", "/tmp/foo/bar/file"); err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 674e4b5b1..c46ac77f7 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -538,6 +538,7 @@ go_binary( "//runsc/specutils", "//runsc/test/testutil", "//test/syscalls/gtest", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go index 1f2ff9864..e5c2358a0 100644 --- a/test/syscalls/syscall_test_runner.go +++ b/test/syscalls/syscall_test_runner.go @@ -29,6 +29,7 @@ import ( "syscall" "testing" + specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -107,7 +108,12 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { // Mark the root as writeable, as some tests attempt to // write to the rootfs, and expect EACCES, not EROFS. spec.Root.Readonly = false - spec.Mounts = nil + + // Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on features + // available in gVisor's tmpfs and not gofers, may fail. + spec.Mounts = []specs.Mount{ + {Destination: "/tmp", Type: "tmpfs"}, + } // Set environment variable that indicates we are // running in gVisor and with the given platform. -- cgit v1.2.3 From c063a1350f4ac6249fb26e6125c9cc99db14263b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 18 Jan 2019 12:16:24 -0800 Subject: runsc: create a new proc mount if the sandbox process is running in a new pidns PiperOrigin-RevId: 229971902 Change-Id: Ief4fac731e839ef092175908de9375d725eaa3aa --- runsc/cmd/boot.go | 6 +++++- runsc/cmd/chroot.go | 13 ++++++++++--- runsc/sandbox/sandbox.go | 1 + 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 7ca2744bd..fb1fd3e70 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -75,6 +75,9 @@ type Boot struct { // startSyncFD is the file descriptor to synchronize runsc and sandbox. startSyncFD int + + // pidns is set if the sanadbox is in its own pid namespace. + pidns bool } // Name implements subcommands.Command.Name. @@ -103,6 +106,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") + f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 
0 means no logging.") @@ -121,7 +125,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) debug.SetTraceback("all") if b.setUpRoot { - if err := setUpChroot(); err != nil { + if err := setUpChroot(b.pidns); err != nil { Fatalf("error setting up chroot: %v", err) } diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index b53085934..ec539a11c 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -42,7 +42,7 @@ func mountInChroot(chroot, src, dst, typ string, flags uint32) error { // setUpChroot creates an empty directory with runsc mounted at /runsc and proc // mounted at /proc. -func setUpChroot() error { +func setUpChroot(pidns bool) error { // We are a new mount namespace, so we can use /tmp as a directory to // construct a new root. chroot := os.TempDir() @@ -59,8 +59,15 @@ func setUpChroot() error { return fmt.Errorf("error mounting tmpfs in choot: %v", err) } - if err := mountInChroot(chroot, "/proc", "/proc", "bind", syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_REC); err != nil { - return fmt.Errorf("error mounting proc in chroot: %v", err) + if pidns { + flags := uint32(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC | syscall.MS_RDONLY) + if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil { + return fmt.Errorf("error mounting proc in chroot: %v", err) + } + } else { + if err := mountInChroot(chroot, "/proc", "/proc", "bind", syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_REC); err != nil { + return fmt.Errorf("error mounting proc in chroot: %v", err) + } } if err := mountInChroot(chroot, specutils.ExePath, chrootBinPath, "bind", syscall.MS_BIND|syscall.MS_RDONLY); err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 411200793..d28d93b0a 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -458,6 +458,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } else { log.Infof("Sandbox will be started in a new PID namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) + cmd.Args = append(cmd.Args, "--pidns=true") } // Joins the network namespace if network is enabled. the sandbox talks -- cgit v1.2.3 From c0a981629cf44688687548490c5e665d851afe06 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 18 Jan 2019 16:07:28 -0800 Subject: Start a sandbox process in a new userns only if CAP_SETUID is set In addition, it fixes a race condition in TestMultiContainerGoferStop. There are two scripts copy the same set of files into the same directory and sometime one of this command fails with EXIST. PiperOrigin-RevId: 230011247 Change-Id: I9289f72e65dc407cdcd0e6cd632a509e01f43e9c --- runsc/container/multi_container_test.go | 11 ++++++----- runsc/sandbox/sandbox.go | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'runsc') diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 8490999ea..8922e6dbe 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -739,11 +739,6 @@ func TestMultiContainerGoferStop(t *testing.T) { t.Fatal("error finding test_app:", err) } - dir, err := ioutil.TempDir(testutil.TmpDir(), "gofer-stop-test") - if err != nil { - t.Fatal("ioutil.TempDir failed:", err) - } - // Setup containers. Root container just reaps children, while the others // perform some IOs. Children are executed in 3 batches of 10. Within the // batch there is overlap between containers starting and being destroyed. 
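The chroot.go hunk above separates two cases: with a private PID namespace the sandbox gets a fresh procfs mount (so /proc reflects its own PIDs), and without one it can only bind-mount the host's /proc read-only. A bare-bones sketch of those two mount calls with the raw syscall package, assuming the chroot already contains an empty /proc mount point (runsc's mountInChroot helper does more validation than this):

    package main

    import (
        "fmt"
        "syscall"
    )

    // mountProc mirrors the branch added to setUpChroot: a fresh procfs for a
    // new PID namespace, a read-only recursive bind of the host /proc otherwise.
    func mountProc(chroot string, pidns bool) error {
        dst := chroot + "/proc"
        if pidns {
            flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC | syscall.MS_RDONLY)
            return syscall.Mount("proc", dst, "proc", flags, "")
        }
        return syscall.Mount("/proc", dst, "bind",
            syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_REC, "")
    }

    func main() {
        if err := mountProc("/tmp/sandbox-chroot", true); err != nil {
            fmt.Println("mount failed (expected without privileges):", err)
        }
    }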
In @@ -751,6 +746,12 @@ func TestMultiContainerGoferStop(t *testing.T) { cmds := [][]string{{app, "reaper"}} const batchSize = 10 for i := 0; i < 3*batchSize; i++ { + dir, err := ioutil.TempDir(testutil.TmpDir(), "gofer-stop-test") + if err != nil { + t.Fatal("ioutil.TempDir failed:", err) + } + defer os.RemoveAll(dir) + cmd := "find /bin -type f | head | xargs -I SRC cp SRC " + dir cmds = append(cmds, []string{"sh", "-c", cmd}) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index d28d93b0a..df4c3c787 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -500,15 +500,15 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") } } else { - log.Infof("Sandbox will be started in new user namespace") - nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) - // If we have CAP_SETUID and CAP_SETGID, then we can also run // as user nobody. if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { + log.Infof("Sandbox will be started in new user namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + // Map nobody in the new namespace to nobody in the parent namespace. // // A sandbox process will construct an empty -- cgit v1.2.3 From c1be25b78d89a3a55a32a7aa10724134eda9813d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 18 Jan 2019 17:35:09 -0800 Subject: Scrub runsc error messages Removed "error" and "failed to" prefix that don't add value from messages. Adjusted a few other messages. 
In particular, when the container fail to start, the message returned is easier for humans to read: $ docker run --rm --runtime=runsc alpine foobar docker: Error response from daemon: OCI runtime start failed: did not terminate sucessfully: starting container: starting root container [foobar]: starting sandbox: searching for executable "foobar", cwd: "/", $PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin": no such file or directory Closes #77 PiperOrigin-RevId: 230022798 Change-Id: I83339017c70dae09e4f9f8e0ea2e554c4d5d5cd1 --- runsc/boot/controller.go | 12 +++---- runsc/boot/fs.go | 43 +++++++++++------------ runsc/boot/loader.go | 81 ++++++++++++++++++++++++-------------------- runsc/cgroup/cgroup.go | 2 +- runsc/cmd/boot.go | 8 ++--- runsc/cmd/checkpoint.go | 14 ++++---- runsc/cmd/create.go | 4 +-- runsc/cmd/debug.go | 8 ++--- runsc/cmd/delete.go | 4 +-- runsc/cmd/events.go | 6 ++-- runsc/cmd/exec.go | 24 ++++++------- runsc/cmd/gofer.go | 16 ++++----- runsc/cmd/kill.go | 2 +- runsc/cmd/list.go | 4 +-- runsc/cmd/path.go | 2 +- runsc/cmd/pause.go | 2 +- runsc/cmd/ps.go | 8 ++--- runsc/cmd/restore.go | 8 ++--- runsc/cmd/resume.go | 2 +- runsc/cmd/run.go | 4 +-- runsc/cmd/spec.go | 2 +- runsc/cmd/start.go | 4 +-- runsc/cmd/state.go | 4 +-- runsc/cmd/wait.go | 12 +++---- runsc/console/console.go | 8 ++--- runsc/container/container.go | 34 +++++++++---------- runsc/container/fs.go | 6 ++-- runsc/sandbox/network.go | 26 +++++++------- runsc/sandbox/sandbox.go | 62 ++++++++++++++++----------------- 29 files changed, 211 insertions(+), 201 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 36e9d2c6b..989f49388 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -163,7 +163,7 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} if err := <-cm.startResultChan; err != nil { - return fmt.Errorf("failed to start sandbox: %v", err) + return fmt.Errorf("starting sandbox: %v", err) } return nil } @@ -319,7 +319,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { p, err := createPlatform(cm.l.conf, int(deviceFile.Fd())) if err != nil { - return fmt.Errorf("error creating platform: %v", err) + return fmt.Errorf("creating platform: %v", err) } k := &kernel.Kernel{ Platform: p, @@ -330,14 +330,14 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { fds := &fdDispenser{fds: cm.l.goferFDs} renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) if err != nil { - return fmt.Errorf("error creating RestoreEnvironment: %v", err) + return fmt.Errorf("creating RestoreEnvironment: %v", err) } fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. networkStack, err := newEmptyNetworkStack(cm.l.conf, k) if err != nil { - return fmt.Errorf("failed to create network: %v", err) + return fmt.Errorf("creating network: %v", err) } if eps, ok := networkStack.(*epsocket.Stack); ok { stack.StackFromEnv = eps.Stack // FIXME @@ -347,7 +347,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return err } if info.Size() == 0 { - return fmt.Errorf("error file was empty") + return fmt.Errorf("file cannot be empty") } // Load the state. @@ -385,7 +385,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Tell the root container to start and wait for the result. 
cm.startChan <- struct{}{} if err := <-cm.startResultChan; err != nil { - return fmt.Errorf("failed to start sandbox: %v", err) + return fmt.Errorf("starting sandbox: %v", err) } return nil diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e0c8291ac..5c5e650ca 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -98,11 +98,11 @@ func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec fds := &fdDispenser{fds: goferFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { - return fmt.Errorf("failed to create root mount: %v", err) + return fmt.Errorf("creating root mount: %v", err) } mns, err := fs.NewMountNamespace(userCtx, rootInode) if err != nil { - return fmt.Errorf("failed to create root mount namespace: %v", err) + return fmt.Errorf("creating root mount namespace: %v", err) } setMountNS(mns) @@ -183,7 +183,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f opts := p9MountOptions(fd, conf.FileAccess) rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) if err != nil { - return nil, fmt.Errorf("failed to generate root mount point: %v", err) + return nil, fmt.Errorf("creating root mount point: %v", err) } // We need to overlay the root on top of a ramfs with stub directories @@ -192,7 +192,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) if err != nil { - return nil, fmt.Errorf("error adding submount overlay: %v", err) + return nil, fmt.Errorf("adding submount overlay: %v", err) } if conf.Overlay && !spec.Root.Readonly { @@ -204,7 +204,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f } } - log.Infof("Mounted %q to \"/\" type root", spec.Root.Path) + log.Infof("Mounted %q to %q type root", spec.Root.Path, "/") return rootInode, nil } @@ -222,7 +222,7 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, // Create overlay on top of mount dir. 
upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") if err != nil { - return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err) + return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) } @@ -311,7 +311,7 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, ro inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ",")) if err != nil { - return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) + return fmt.Errorf("creating mount with source %q: %v", m.Source, err) } // If there are submounts, we need to overlay the mount on top of a @@ -321,7 +321,7 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, ro log.Infof("Adding submount overlay over %q", m.Destination) inode, err = addSubmountOverlay(ctx, inode, submounts) if err != nil { - return fmt.Errorf("error adding submount overlay: %v", err) + return fmt.Errorf("adding submount overlay: %v", err) } } @@ -336,11 +336,11 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, ro maxTraversals := uint(0) dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) if err != nil { - return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + return fmt.Errorf("mount %q error: %v", m.Destination, err) } log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) @@ -503,11 +503,11 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string msrc := fs.NewPseudoMountSource() mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts) if err != nil { - return nil, fmt.Errorf("error creating mount tree: %v", err) + return nil, fmt.Errorf("creating mount tree: %v", err) } overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) if err != nil { - return nil, fmt.Errorf("failed to make mount overlay: %v", err) + return nil, fmt.Errorf("adding mount overlay: %v", err) } return overlayInode, err } @@ -544,7 +544,7 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf // fd. fdm, err := createFDMap(ctx, k, ls, console, stdioFDs) if err != nil { - return fmt.Errorf("error importing fds: %v", err) + return fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on FDMap if successful. We @@ -595,7 +595,7 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf fds := &fdDispenser{fds: goferFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) if err != nil { - return fmt.Errorf("error creating filesystem for container: %v", err) + return fmt.Errorf("creating filesystem for container: %v", err) } // Mount the container's root filesystem to the newly created mount point. @@ -630,9 +630,10 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf // executable matching the procArgs.Argv[0]. 
func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { paths := fs.GetPath(procArgs.Envv) - f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, procArgs.Argv[0], paths) + exe := procArgs.Argv[0] + f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) if err != nil { - return err + return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err) } procArgs.Filename = f return nil @@ -666,7 +667,7 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error return nil } if err != nil { - return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) + return fmt.Errorf("finding container root directory %q: %v", containerRoot, err) } defer containerRootDirent.DecRef() @@ -682,7 +683,7 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error log.Debugf("Unmounting container submount %q", root.BaseName()) m.FlushDirentRefs() if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL { - return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) + return fmt.Errorf("unmounting container submount %q: %v", root.BaseName(), err) } } @@ -690,7 +691,7 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error log.Debugf("Unmounting container root %q", containerRoot) containerRootDirent.Inode.MountSource.FlushDirentRefs() if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { - return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) + return fmt.Errorf("unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) } // Get a reference to the parent directory and remove the root @@ -698,12 +699,12 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error maxTraversals = 0 containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals) if err != nil { - return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) + return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err) } defer containersDirDirent.DecRef() log.Debugf("Deleting container root %q", containerRoot) if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil { - return fmt.Errorf("error removing directory %q: %v", containerRoot, err) + return fmt.Errorf("removing directory %q: %v", containerRoot, err) } return nil diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 71a2ab962..f3dc15f00 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -173,17 +173,17 @@ func New(args Args) (*Loader, error) { // We initialize the rand package now to make sure /dev/urandom is pre-opened // on kernels that do not support getrandom(2). if err := rand.Init(); err != nil { - return nil, fmt.Errorf("error setting up rand: %v", err) + return nil, fmt.Errorf("setting up rand: %v", err) } if err := usage.Init(); err != nil { - return nil, fmt.Errorf("error setting up memory usage: %v", err) + return nil, fmt.Errorf("setting up memory usage: %v", err) } // Create kernel and platform. 
p, err := createPlatform(args.Conf, args.DeviceFD) if err != nil { - return nil, fmt.Errorf("error creating platform: %v", err) + return nil, fmt.Errorf("creating platform: %v", err) } k := &kernel.Kernel{ Platform: p, @@ -194,18 +194,18 @@ func New(args Args) (*Loader, error) { // Pass k as the platform since it is savable, unlike the actual platform. vdso, err := loader.PrepareVDSO(k) if err != nil { - return nil, fmt.Errorf("error creating vdso: %v", err) + return nil, fmt.Errorf("creating vdso: %v", err) } // Create timekeeper. tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) if err != nil { - return nil, fmt.Errorf("error creating timekeeper: %v", err) + return nil, fmt.Errorf("creating timekeeper: %v", err) } tk.SetClocks(time.NewCalibratedClocks()) if err := enableStrace(args.Conf); err != nil { - return nil, fmt.Errorf("failed to enable strace: %v", err) + return nil, fmt.Errorf("enabling strace: %v", err) } // Create an empty network stack because the network namespace may be empty at @@ -214,13 +214,13 @@ func New(args Args) (*Loader, error) { // Run(). networkStack, err := newEmptyNetworkStack(args.Conf, k) if err != nil { - return nil, fmt.Errorf("failed to create network: %v", err) + return nil, fmt.Errorf("creating network: %v", err) } // Create capabilities. caps, err := specutils.Capabilities(args.Spec.Process.Capabilities) if err != nil { - return nil, fmt.Errorf("error creating capabilities: %v", err) + return nil, fmt.Errorf("converting capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. @@ -262,7 +262,7 @@ func New(args Args) (*Loader, error) { RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), }); err != nil { - return nil, fmt.Errorf("error initializing kernel: %v", err) + return nil, fmt.Errorf("initializing kernel: %v", err) } // Turn on packet logging if enabled. @@ -279,11 +279,11 @@ func New(args Args) (*Loader, error) { procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { - return nil, fmt.Errorf("failed to create init process for root container: %v", err) + return nil, fmt.Errorf("creating init process for root container: %v", err) } if err := initCompatLogs(args.UserLogFD); err != nil { - return nil, fmt.Errorf("init compat logs: %v", err) + return nil, fmt.Errorf("initializing compat logs: %v", err) } eid := execID{cid: args.ID} @@ -303,7 +303,7 @@ func New(args Args) (*Loader, error) { // We don't care about child signals; some platforms can generate a // tremendous number of useless ones (I'm looking at you, ptrace). if err := sighandling.IgnoreChildStop(); err != nil { - return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + return nil, fmt.Errorf("ignore child stop signals failed: %v", err) } // Handle signals by forwarding them to the root container process @@ -353,7 +353,7 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel. // Create initial limits. ls, err := createLimitSet(spec) if err != nil { - return kernel.CreateProcessArgs{}, fmt.Errorf("error creating limits: %v", err) + return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) } // Create the process arguments. 
@@ -441,7 +441,7 @@ func (l *Loader) run() error { ControllerFD: l.ctrl.srv.FD(), } if err := filter.Install(opts); err != nil { - return fmt.Errorf("Failed to install seccomp filters: %v", err) + return fmt.Errorf("installing seccomp filters: %v", err) } } @@ -465,13 +465,13 @@ func (l *Loader) run() error { rootCtx := l.rootProcArgs.NewContext(l.k) rootMns := l.k.RootMountNamespace() if err := setExecutablePath(rootCtx, rootMns, &l.rootProcArgs); err != nil { - return fmt.Errorf("error setting executable path for %+v: %v", l.rootProcArgs, err) + return err } // Create the root container init task. _, _, err := l.k.CreateProcess(l.rootProcArgs) if err != nil { - return fmt.Errorf("failed to create init process: %v", err) + return fmt.Errorf("creating init process: %v", err) } // CreateProcess takes a reference on FDMap if successful. @@ -521,7 +521,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { - return fmt.Errorf("error creating capabilities: %v", err) + return fmt.Errorf("creating capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. @@ -544,7 +544,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config procArgs, err := newProcess(cid, spec, creds, l.k) if err != nil { - return fmt.Errorf("failed to create new process: %v", err) + return fmt.Errorf("creating new process: %v", err) } // Can't take ownership away from os.File. dup them to get a new FDs. @@ -570,20 +570,20 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config procArgs.Limits, k, cid); err != nil { - return fmt.Errorf("failed to create new process: %v", err) + return fmt.Errorf("configuring container FS: %v", err) } // setFileSystemForProcess dup'd stdioFDs, so we can close them. for i, fd := range stdioFDs { if err := syscall.Close(fd); err != nil { - return fmt.Errorf("failed to close stdioFD #%d: %v", i, fd) + return fmt.Errorf("closing stdio FD #%d: %v", i, fd) } } ctx := procArgs.NewContext(l.k) mns := k.RootMountNamespace() if err := setExecutablePath(ctx, mns, &procArgs); err != nil { - return fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) + return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) } l.mu.Lock() @@ -596,7 +596,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config tg, _, err := l.k.CreateProcess(procArgs) if err != nil { - return fmt.Errorf("failed to create process in sentry: %v", err) + return fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDMap if successful. procArgs.FDMap.DecRef() @@ -615,7 +615,7 @@ func (l *Loader) destroyContainer(cid string) error { if _, _, err := l.threadGroupFromIDLocked(execID{cid: cid}); err == nil { // If the container has started, kill and wait for all processes. if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { - return fmt.Errorf("failed to SIGKILL all container processes: %v", err) + return fmt.Errorf("sending SIGKILL to all container processes: %v", err) } } @@ -628,7 +628,7 @@ func (l *Loader) destroyContainer(cid string) error { ctx := l.rootProcArgs.NewContext(l.k) if err := destroyContainerFS(ctx, cid, l.k); err != nil { - return fmt.Errorf("failed to destroy filesystem for container %q: %v", cid, err) + return fmt.Errorf("destroying filesystem for container %q: %v", cid, err) } // We made it! 
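Most of this commit is mechanical: each layer drops the "error"/"failed to" prefix and adds only its own context, so the final chain reads top-down like the docker example quoted in the commit message. A tiny stdlib illustration of the convention (the function names here are made up for the example):

    package main

    import (
        "errors"
        "fmt"
    )

    // Each layer prefixes only what it was doing; no layer repeats
    // "failed to" or "error".
    func findExecutable(name string) error {
        return fmt.Errorf("searching for executable %q, $PATH=%q: %v",
            name, "/usr/bin:/bin", errors.New("no such file or directory"))
    }

    func startRootContainer(name string) error {
        if err := findExecutable(name); err != nil {
            return fmt.Errorf("starting root container [%s]: %v", name, err)
        }
        return nil
    }

    func main() {
        // Prints one readable chain, much like the message shown in the
        // commit description.
        fmt.Println(fmt.Errorf("starting sandbox: %v", startRootContainer("foobar")))
    }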
@@ -715,11 +715,11 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai // In this case, find the process in the container's PID namespace. initTG, _, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { - return fmt.Errorf("failed to wait for PID %d: %v", tgid, err) + return fmt.Errorf("waiting for PID %d: %v", tgid, err) } tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) if tg == nil { - return fmt.Errorf("failed to wait for PID %d: no such process", tgid) + return fmt.Errorf("waiting for PID %d: no such process", tgid) } if tg.Leader().ContainerID() != cid { return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) @@ -778,15 +778,21 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { // processes in the container, or to the foreground process group. func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { if pid < 0 { - return fmt.Errorf("failed to signal container %q PID %d: PID must be positive", cid, pid) + return fmt.Errorf("PID (%d) must be positive", pid) } switch mode { case DeliverToProcess: - return l.signalProcess(cid, kernel.ThreadID(pid), signo) + if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err) + } + return nil case DeliverToForegroundProcessGroup: - return l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo) + if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err) + } + return nil case DeliverToAllProcesses: if pid != 0 { @@ -795,12 +801,15 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e // Check that the container has actually started before signaling it. _, _, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { - return fmt.Errorf("failed to signal container %q: %v", cid, err) + return err } - return l.signalAllProcesses(cid, signo) + if err := l.signalAllProcesses(cid, signo); err != nil { + return fmt.Errorf("signaling all processes in container %q: %v", cid, err) + } + return nil default: - panic(fmt.Sprintf("unknown signal signal delivery mode %v", mode)) + panic(fmt.Sprintf("unknown signal delivery mode %v", mode)) } } @@ -816,11 +825,11 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er // signal it. initTG, _, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { - return fmt.Errorf("failed to signal container %q: %v", cid, err) + return fmt.Errorf("no thread group found: %v", err) } tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) if tg == nil { - return fmt.Errorf("failed to signal container %q PID %d: no such process", cid, tgid) + return fmt.Errorf("no such process with PID %d", tgid) } if tg.Leader().ContainerID() != cid { return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) @@ -833,10 +842,10 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s // and send the signal to it. 
tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) if err != nil { - return fmt.Errorf("failed to signal foreground process group for container %q PID %d: %v", cid, tgid, err) + return fmt.Errorf("no thread group found: %v", err) } if tty == nil { - return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, tgid) + return fmt.Errorf("no TTY attached") } pg := tty.ForegroundProcessGroup() if pg == nil { diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 65a0b6d7a..87f051e79 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -236,7 +236,7 @@ func (c *Cgroup) Uninstall() error { } return err }, b); err != nil { - return fmt.Errorf("error removing cgroup path %q: %v", path, err) + return fmt.Errorf("removing cgroup path %q: %v", path, err) } } return nil diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index fb1fd3e70..7f87b2623 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -154,7 +154,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) defer specFile.Close() spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile) if err != nil { - Fatalf("error reading spec: %v", err) + Fatalf("reading spec: %v", err) } specutils.LogSpec(spec) @@ -208,7 +208,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } l, err := boot.New(bootArgs) if err != nil { - Fatalf("error creating loader: %v", err) + Fatalf("creating loader: %v", err) } // Fatalf exits the process and doesn't run defers. @@ -220,7 +220,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) buf := make([]byte, 1) if w, err := startSyncFile.Write(buf); err != nil || w != 1 { l.Destroy() - Fatalf("Unable to write into the start-sync descriptor: %v", err) + Fatalf("unable to write into the start-sync descriptor: %v", err) } // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits. startSyncFile.Close() @@ -231,7 +231,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Run the application and wait for it to finish. if err := l.Run(); err != nil { l.Destroy() - Fatalf("error running sandbox: %v", err) + Fatalf("running sandbox: %v", err) } ws := l.WaitExit() diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 4f4771da2..d8f748aa0 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -77,7 +77,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa cont, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } if c.imagePath == "" { @@ -85,7 +85,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa } if err := os.MkdirAll(c.imagePath, 0755); err != nil { - Fatalf("error making directories at path provided: %v", err) + Fatalf("making directories at path provided: %v", err) } fullImagePath := filepath.Join(c.imagePath, checkpointFileName) @@ -115,12 +115,12 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa // Restore into new container with same ID. 
bundleDir := cont.BundleDir if bundleDir == "" { - Fatalf("error setting bundleDir") + Fatalf("setting bundleDir") } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("error reading spec: %v", err) + Fatalf("reading spec: %v", err) } specutils.LogSpec(spec) @@ -130,17 +130,17 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa } if err := cont.Destroy(); err != nil { - Fatalf("error destroying container: %v", err) + Fatalf("destroying container: %v", err) } cont, err = container.Create(id, spec, conf, bundleDir, "", "", "") if err != nil { - Fatalf("error restoring container: %v", err) + Fatalf("restoring container: %v", err) } defer cont.Destroy() if err := cont.Restore(spec, conf, fullImagePath); err != nil { - Fatalf("error starting container: %v", err) + Fatalf("starting container: %v", err) } ws, err := cont.Wait() diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index d187b8592..30c8fa283 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -89,7 +89,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("error reading spec: %v", err) + Fatalf("reading spec: %v", err) } specutils.LogSpec(spec) @@ -97,7 +97,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} // container unless the metadata specifies that it should be run in an // existing container. if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, c.userLog); err != nil { - Fatalf("error creating container: %v", err) + Fatalf("creating container: %v", err) } return subcommands.ExitSuccess } diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index de530c068..e10326754 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -68,7 +68,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) var err error c, err = container.Load(conf.RootDir, f.Arg(0)) if err != nil { - Fatalf("error loading container %q: %v", f.Arg(0), err) + Fatalf("loading container %q: %v", f.Arg(0), err) } } else { if f.NArg() != 0 { @@ -78,12 +78,12 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Go over all sandboxes and find the one that matches PID. 
ids, err := container.List(conf.RootDir) if err != nil { - Fatalf("error listing containers: %v", err) + Fatalf("listing containers: %v", err) } for _, id := range ids { candidate, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container %q: %v", id, err) + Fatalf("loading container %q: %v", id, err) } if candidate.SandboxPid() == d.pid { c = candidate @@ -110,7 +110,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) log.Infof("Retrieving sandbox stacks") stacks, err := c.Sandbox.Stacks() if err != nil { - Fatalf("error retrieving stacks: %v", err) + Fatalf("retrieving stacks: %v", err) } log.Infof(" *** Stack dump ***\n%s", stacks) } diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 8c7c7a5cd..3206b267a 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -74,13 +74,13 @@ func (d *Delete) execute(ids []string, conf *boot.Config) error { log.Warningf("couldn't find container %q: %v", id, err) return nil } - return fmt.Errorf("error loading container %q: %v", id, err) + return fmt.Errorf("loading container %q: %v", id, err) } if !d.force && c.Status != container.Created && c.Status != container.Stopped { return fmt.Errorf("cannot delete container that is not stopped without --force flag") } if err := c.Destroy(); err != nil { - return fmt.Errorf("error destroying container: %v", err) + return fmt.Errorf("destroying container: %v", err) } } return nil diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index a54856fb4..208d2f74b 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -76,7 +76,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox: %v", err) + Fatalf("loading sandbox: %v", err) } // Repeatedly get stats from the container. @@ -84,13 +84,13 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa // Get the event and print it as JSON. ev, err := c.Event() if err != nil { - log.Warningf("error getting events for container: %v", err) + log.Warningf("Error getting events for container: %v", err) } // err must be preserved because it is used below when breaking // out of the loop. b, err := json.Marshal(ev) if err != nil { - log.Warningf("error while marshalling event %v: %v", ev, err) + log.Warningf("Error while marshalling event %v: %v", ev, err) } else { os.Stdout.Write(b) } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 548207222..13584d800 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -111,14 +111,14 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { e, id, err := ex.parseArgs(f) if err != nil { - Fatalf("error parsing process spec: %v", err) + Fatalf("parsing process spec: %v", err) } conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox: %v", err) + Fatalf("loading sandbox: %v", err) } // Replace empty settings with defaults from container. 
@@ -128,13 +128,13 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if e.Envv == nil { e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env) if err != nil { - Fatalf("error getting environment variables: %v", err) + Fatalf("getting environment variables: %v", err) } } if e.Capabilities == nil { e.Capabilities, err = specutils.Capabilities(c.Spec.Process.Capabilities) if err != nil { - Fatalf("error creating capabilities: %v", err) + Fatalf("creating capabilities: %v", err) } } @@ -149,7 +149,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Start the new process and get it pid. pid, err := c.Execute(e) if err != nil { - Fatalf("error getting processes for container: %v", err) + Fatalf("getting processes for container: %v", err) } if e.StdioIsPty { @@ -163,7 +163,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if ex.internalPidFile != "" { pidStr := []byte(strconv.Itoa(int(pid))) if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil { - Fatalf("error writing internal pid file %q: %v", ex.internalPidFile, err) + Fatalf("writing internal pid file %q: %v", ex.internalPidFile, err) } } @@ -172,14 +172,14 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // returns. if ex.pidFile != "" { if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { - Fatalf("error writing pid file: %v", err) + Fatalf("writing pid file: %v", err) } } // Wait for the process to exit. ws, err := c.WaitPID(pid, ex.clearStatus) if err != nil { - Fatalf("error waiting on pid %d: %v", pid, err) + Fatalf("waiting on pid %d: %v", pid, err) } *waitStatus = ws return subcommands.ExitSuccess @@ -188,7 +188,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { binPath, err := specutils.BinPath() if err != nil { - Fatalf("error getting bin path: %v", err) + Fatalf("getting bin path: %v", err) } var args []string @@ -199,7 +199,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat if pidFile == "" { tmpDir, err := ioutil.TempDir("", "exec-pid-") if err != nil { - Fatalf("error creating TempDir: %v", err) + Fatalf("creating TempDir: %v", err) } defer os.RemoveAll(tmpDir) pidFile = filepath.Join(tmpDir, "pid") @@ -232,7 +232,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // socket. 
tty, err := console.NewWithSocket(ex.consoleSocket) if err != nil { - Fatalf("error setting up console with socket %q: %v", ex.consoleSocket, err) + Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err) } defer tty.Close() @@ -307,7 +307,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { for _, s := range ex.extraKGIDs { kgid, err := strconv.Atoi(s) if err != nil { - Fatalf("error parsing GID: %s, %v", s, err) + Fatalf("parsing GID: %s, %v", s, err) } extraKGIDs = append(extraKGIDs, auth.KGID(kgid)) } diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 7276f3f26..43286a2e5 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -101,12 +101,12 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) specFile, err := specutils.OpenCleanSpec(g.bundleDir) if err != nil { - Fatalf("error opening spec: %v", err) + Fatalf("opening spec: %v", err) } spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile) specFile.Close() if err != nil { - Fatalf("error reading spec: %v", err) + Fatalf("reading spec: %v", err) } specutils.LogSpec(spec) @@ -120,7 +120,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("failed to chroot to %q: %v", root, err) } if err := syscall.Chdir("/"); err != nil { - Fatalf("failed to change working dir: %v", err) + Fatalf("changing working dir: %v", err) } log.Infof("Process chroot'd to %q", root) @@ -131,7 +131,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) PanicOnWrite: g.panicOnWrite, }) if err != nil { - Fatalf("Error creating attach point: %v", err) + Fatalf("creating attach point: %v", err) } ats = append(ats, ap) log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly) @@ -145,12 +145,12 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) if err != nil { - Fatalf("Error creating attach point: %v", err) + Fatalf("creating attach point: %v", err) } ats = append(ats, ap) if mountIdx >= len(g.ioFDs) { - Fatalf("No FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) + Fatalf("no FD found for mount. Did you forget --io-fd? 
mount: %d, %v", len(g.ioFDs), m) } log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount) mountIdx++ @@ -161,7 +161,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } if err := filter.Install(); err != nil { - Fatalf("Failed to install seccomp filters: %v", err) + Fatalf("installing seccomp filters: %v", err) } runServers(ats, g.ioFDs) @@ -176,7 +176,7 @@ func runServers(ats []p9.Attacher, ioFDs []int) { go func(ioFD int, at p9.Attacher) { socket, err := unet.NewSocket(ioFD) if err != nil { - Fatalf("err creating server on FD %d: %v", ioFD, err) + Fatalf("creating server on FD %d: %v", ioFD, err) } s := p9.NewServer(at) if err := s.Handle(socket); err != nil { diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 7d86bb043..e67f82473 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -71,7 +71,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } // The OCI command-line spec says that the signal should be specified diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index acefcb2db..481066225 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -81,7 +81,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, id := range ids { c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container %q: %v", id, err) + Fatalf("loading container %q: %v", id, err) } containers = append(containers, c) } @@ -108,7 +108,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) states = append(states, c.State()) } if err := json.NewEncoder(os.Stdout).Encode(states); err != nil { - Fatalf("error marshaling container state: %v", err) + Fatalf("marshaling container state: %v", err) } default: Fatalf("unknown list format %q", l.format) diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go index baba937a8..1276f0dbd 100644 --- a/runsc/cmd/path.go +++ b/runsc/cmd/path.go @@ -22,7 +22,7 @@ import ( func getwdOrDie() string { wd, err := os.Getwd() if err != nil { - Fatalf("error getting current working directory: %v", err) + Fatalf("getting current working directory: %v", err) } return wd } diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index ee608faba..2c93e5f3e 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -57,7 +57,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s cont, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } if err := cont.Pause(); err != nil { diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index fd76cf975..060d796f2 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -62,11 +62,11 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading sandbox: %v", err) + Fatalf("loading sandbox: %v", err) } pList, err := c.Processes() if err != nil { - Fatalf("error getting processes for container: %v", err) + Fatalf("getting processes for container: %v", err) } switch ps.format { @@ -75,11 +75,11 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) case "json": o, err := control.PrintPIDsJSON(pList) if err != nil { - Fatalf("error generating JSON: %v", err) + 
Fatalf("generating JSON: %v", err) } fmt.Println(o) default: - Fatalf("Unsupported format: %s", ps.format) + Fatalf("unsupported format: %s", ps.format) } return subcommands.ExitSuccess diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 64b302b0c..66b23c38e 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -84,7 +84,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("error reading spec: %v", err) + Fatalf("reading spec: %v", err) } specutils.LogSpec(spec) @@ -96,15 +96,15 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } if err := c.Restore(spec, conf, restoreFile); err != nil { - Fatalf("error restoring container: %v", err) + Fatalf("restoring container: %v", err) } ws, err := c.Wait() if err != nil { - Fatalf("error running container: %v", err) + Fatalf("running container: %v", err) } *waitStatus = ws diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index e684aeb5c..5551d1450 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -58,7 +58,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} cont, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } if err := cont.Resume(); err != nil { diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 9a574679f..be1c1b678 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -69,13 +69,13 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } spec, err := specutils.ReadSpec(bundleDir) if err != nil { - Fatalf("error reading spec: %v", err) + Fatalf("reading spec: %v", err) } specutils.LogSpec(spec) ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog) if err != nil { - Fatalf("error running container: %v", err) + Fatalf("running container: %v", err) } *waitStatus = ws diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index ee306bfa6..063bd39c5 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -175,7 +175,7 @@ func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } if err := ioutil.WriteFile(confPath, specTemplate, 0664); err != nil { - Fatalf("error writing to %q: %v", confPath, err) + Fatalf("writing to %q: %v", confPath, err) } return subcommands.ExitSuccess diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 065efec06..9e2e0c11d 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -56,10 +56,10 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } if err := c.Start(conf); err != nil { - Fatalf("error starting container: %v", err) + Fatalf("starting container: %v", err) } return subcommands.ExitSuccess } diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 15e27b250..c3ef65ab5 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -59,7 +59,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } log.Debugf("Returning state for container 
%+v", c) @@ -69,7 +69,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s // Write json-encoded state directly to stdout. b, err := json.MarshalIndent(state, "", " ") if err != nil { - Fatalf("error marshaling container state: %v", err) + Fatalf("marshaling container state: %v", err) } os.Stdout.Write(b) return subcommands.ExitSuccess diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 1e1c1fe17..6498dd15c 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -66,7 +66,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // You can't specify both -pid and -rootpid. if wt.rootPID != unsetPID && wt.pid != unsetPID { - Fatalf("only up to one of -pid and -rootPid can be set") + Fatalf("only one of -pid and -rootPid can be set") } id := f.Arg(0) @@ -74,7 +74,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) c, err := container.Load(conf.RootDir, id) if err != nil { - Fatalf("error loading container: %v", err) + Fatalf("loading container: %v", err) } var waitStatus syscall.WaitStatus @@ -83,21 +83,21 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) case wt.rootPID == unsetPID && wt.pid == unsetPID: ws, err := c.Wait() if err != nil { - Fatalf("error waiting on container %q: %v", c.ID, err) + Fatalf("waiting on container %q: %v", c.ID, err) } waitStatus = ws // Wait on a PID in the root PID namespace. case wt.rootPID != unsetPID: ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */) if err != nil { - Fatalf("error waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) + Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) } waitStatus = ws // Wait on a PID in the container's PID namespace. case wt.pid != unsetPID: ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */) if err != nil { - Fatalf("error waiting on PID %d in container %q: %v", wt.pid, c.ID, err) + Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err) } waitStatus = ws } @@ -107,7 +107,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Write json-encoded wait result directly to stdout. if err := json.NewEncoder(os.Stdout).Encode(result); err != nil { - Fatalf("error marshaling wait result: %v", err) + Fatalf("marshaling wait result: %v", err) } return subcommands.ExitSuccess } diff --git a/runsc/console/console.go b/runsc/console/console.go index 9f4f9214d..2eb9a8807 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -30,7 +30,7 @@ func NewWithSocket(socketPath string) (*os.File, error) { // Create a new pty master and slave. 
ptyMaster, ptySlave, err := pty.Open() if err != nil { - return nil, fmt.Errorf("error opening pty: %v", err) + return nil, fmt.Errorf("opening pty: %v", err) } defer ptyMaster.Close() @@ -38,7 +38,7 @@ func NewWithSocket(socketPath string) (*os.File, error) { conn, err := net.Dial("unix", socketPath) if err != nil { ptySlave.Close() - return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) + return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err) } defer conn.Close() uc, ok := conn.(*net.UnixConn) @@ -49,7 +49,7 @@ func NewWithSocket(socketPath string) (*os.File, error) { socket, err := uc.File() if err != nil { ptySlave.Close() - return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) + return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err) } defer socket.Close() @@ -57,7 +57,7 @@ func NewWithSocket(socketPath string) (*os.File, error) { msg := unix.UnixRights(int(ptyMaster.Fd())) if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { ptySlave.Close() - return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) + return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err) } return ptySlave, nil } diff --git a/runsc/container/container.go b/runsc/container/container.go index 544e7a250..2d4b85d9f 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -137,7 +137,7 @@ type Container struct { func Load(rootDir, id string) (*Container, error) { log.Debugf("Load container %q %q", rootDir, id) if err := validateID(id); err != nil { - return nil, fmt.Errorf("error validating id: %v", err) + return nil, fmt.Errorf("validating id: %v", err) } cRoot, err := findContainerRoot(rootDir, id) @@ -162,11 +162,11 @@ func Load(rootDir, id string) (*Container, error) { // Preserve error so that callers can distinguish 'not found' errors. 
return nil, err } - return nil, fmt.Errorf("error reading container metadata file %q: %v", metaFile, err) + return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err) } var c Container if err := json.Unmarshal(metaBytes, &c); err != nil { - return nil, fmt.Errorf("error unmarshaling container metadata from %q: %v", metaFile, err) + return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err) } // If the status is "Running" or "Created", check that the sandbox @@ -225,7 +225,7 @@ func List(rootDir string) ([]string, error) { log.Debugf("List containers %q", rootDir) fs, err := ioutil.ReadDir(rootDir) if err != nil { - return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err) + return nil, fmt.Errorf("reading dir %q: %v", rootDir, err) } var out []string for _, f := range fs { @@ -257,7 +257,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil { return nil, fmt.Errorf("container with id %q already exists", id) } else if !os.IsNotExist(err) { - return nil, fmt.Errorf("error looking for existing container in %q: %v", containerRoot, err) + return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err) } c := &Container{ @@ -446,14 +446,14 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke log.Debugf("Run container %q in root dir: %s", id, conf.RootDir) c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, userLog) if err != nil { - return 0, fmt.Errorf("error creating container: %v", err) + return 0, fmt.Errorf("creating container: %v", err) } // Clean up partially created container if an error ocurrs. // Any errors returned by Destroy() itself are ignored. 
defer c.Destroy() if err := c.Start(conf); err != nil { - return 0, fmt.Errorf("error starting container: %v", err) + return 0, fmt.Errorf("starting container: %v", err) } return c.Wait() } @@ -595,7 +595,7 @@ func (c *Container) Pause() error { } if err := c.Sandbox.Pause(c.ID); err != nil { - return fmt.Errorf("error pausing container: %v", err) + return fmt.Errorf("pausing container: %v", err) } c.changeStatus(Paused) return c.save() @@ -615,7 +615,7 @@ func (c *Container) Resume() error { return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) } if err := c.Sandbox.Resume(c.ID); err != nil { - return fmt.Errorf("error resuming container: %v", err) + return fmt.Errorf("resuming container: %v", err) } c.changeStatus(Running) return c.save() @@ -657,19 +657,19 @@ func (c *Container) Destroy() error { var errs []string if err := c.stop(); err != nil { - err = fmt.Errorf("error stopping container: %v", err) + err = fmt.Errorf("stopping container: %v", err) log.Warningf("%v", err) errs = append(errs, err.Error()) } if err := destroyFS(c.Spec); err != nil { - err = fmt.Errorf("error destroying container fs: %v", err) + err = fmt.Errorf("destroying container fs: %v", err) log.Warningf("%v", err) errs = append(errs, err.Error()) } if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { - err = fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) + err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err) log.Warningf("%v", err) errs = append(errs, err.Error()) } @@ -702,10 +702,10 @@ func (c *Container) save() error { metaFile := filepath.Join(c.Root, metadataFilename) meta, err := json.Marshal(c) if err != nil { - return fmt.Errorf("error marshaling container metadata: %v", err) + return fmt.Errorf("invalid container metadata: %v", err) } if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { - return fmt.Errorf("error writing container metadata: %v", err) + return fmt.Errorf("writing container metadata: %v", err) } return nil } @@ -719,7 +719,7 @@ func (c *Container) stop() error { if c.Sandbox != nil { log.Debugf("Destroying container %q", c.ID) if err := c.Sandbox.DestroyContainer(c.ID); err != nil { - return fmt.Errorf("error destroying container %q: %v", c.ID, err) + return fmt.Errorf("destroying container %q: %v", c.ID, err) } cgroup = c.Sandbox.Cgroup // Only set sandbox to nil after it has been told to destroy the container. @@ -917,12 +917,12 @@ func (c *Container) lock() (func() error, error) { // given container root directory. 
func lockContainerMetadata(containerRootDir string) (func() error, error) { if err := os.MkdirAll(containerRootDir, 0711); err != nil { - return nil, fmt.Errorf("error creating container root directory %q: %v", containerRootDir, err) + return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err) } f := filepath.Join(containerRootDir, metadataLockFilename) l := flock.NewFlock(f) if err := l.Lock(); err != nil { - return nil, fmt.Errorf("error acquiring lock on container lock file %q: %v", f, err) + return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", f, err) } return l.Unlock, nil } diff --git a/runsc/container/fs.go b/runsc/container/fs.go index 97195550f..998160487 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -94,14 +94,14 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mou flags |= syscall.MS_BIND log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { - return nil, fmt.Errorf("failed to mount %v: %v", m, err) + return nil, fmt.Errorf("mounting %v: %v", m, err) } // Make the mount a slave, so that for recursive bind mount, umount won't // propagate to the source. flags = syscall.MS_SLAVE | syscall.MS_REC if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { - return nil, fmt.Errorf("failed to rslave mount dst: %q, flags: %#x, err: %v", dst, flags, err) + return nil, fmt.Errorf("mount rslave dst: %q, flags: %#x, err: %v", dst, flags, err) } cpy := m @@ -146,7 +146,7 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mou flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) src := spec.Root.Path if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { - return nil, fmt.Errorf("failed to remount root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) + return nil, fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) } } return rv, nil diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 52fe8fc0f..8ec320d09 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -61,19 +61,19 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi case boot.NetworkNone: log.Infof("Network is disabled, create loopback interface only") if err := createDefaultLoopbackInterface(conn); err != nil { - return fmt.Errorf("error creating default loopback interface: %v", err) + return fmt.Errorf("creating default loopback interface: %v", err) } case boot.NetworkSandbox: // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") if err := createInterfacesAndRoutesFromNS(conn, nsPath); err != nil { - return fmt.Errorf("error creating interfaces from net namespace %q: %v", nsPath, err) + return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: // Nothing to do here. 
default: - return fmt.Errorf("Invalid network type: %d", conf.Network) + return fmt.Errorf("invalid network type: %d", conf.Network) } return nil } @@ -99,7 +99,7 @@ func createDefaultLoopbackInterface(conn *urpc.Client) error { if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ LoopbackLinks: []boot.LoopbackLink{link}, }, nil); err != nil { - return fmt.Errorf("error creating loopback link and routes: %v", err) + return fmt.Errorf("creating loopback link and routes: %v", err) } return nil } @@ -112,7 +112,7 @@ func joinNetNS(nsPath string) (func(), error) { }) if err != nil { runtime.UnlockOSThread() - return nil, fmt.Errorf("error joining net namespace %q: %v", nsPath, err) + return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) } return func() { restoreNS() @@ -147,7 +147,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { // Get all interfaces in the namespace. ifaces, err := net.Interfaces() if err != nil { - return fmt.Errorf("error querying interfaces: %v", err) + return fmt.Errorf("querying interfaces: %v", err) } if isRootNS(ifaces) { @@ -164,14 +164,14 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { allAddrs, err := iface.Addrs() if err != nil { - return fmt.Errorf("error fetching interface addresses for %q: %v", iface.Name, err) + return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err) } // We build our own loopback devices. if iface.Flags&net.FlagLoopback != 0 { links, err := loopbackLinks(iface, allAddrs) if err != nil { - return fmt.Errorf("error getting loopback routes and links for iface %q: %v", iface.Name, err) + return fmt.Errorf("getting loopback routes and links for iface %q: %v", iface.Name, err) } args.LoopbackLinks = append(args.LoopbackLinks, links...) continue @@ -218,7 +218,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { // will remove the routes as well. routes, def, err := routesForIface(iface) if err != nil { - return fmt.Errorf("error getting routes for interface %q: %v", iface.Name, err) + return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) } if def != nil { if !args.DefaultGateway.Route.Empty() { @@ -237,7 +237,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { // Get the link for the interface. ifaceLink, err := netlink.LinkByName(iface.Name) if err != nil { - return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err) + return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) } // Collect the addresses for the interface, enable forwarding, @@ -247,7 +247,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { // Steal IP address from NIC. 
if err := removeAddress(ifaceLink, addr.String()); err != nil { - return fmt.Errorf("error removing address %v from device %q: %v", iface.Name, addr, err) + return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err) } } @@ -257,7 +257,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { log.Debugf("Setting up network, config: %+v", args) if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { - return fmt.Errorf("error creating links and routes: %v", err) + return fmt.Errorf("creating links and routes: %v", err) } return nil } @@ -291,7 +291,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { } rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) if err != nil { - return nil, nil, fmt.Errorf("error getting routes from %q: %v", iface.Name, err) + return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) } var def *boot.Route diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index df4c3c787..53cb464d2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -100,7 +100,7 @@ func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke // Wait until the sandbox has booted. b := make([]byte, 1) if l, err := clientSyncFile.Read(b); err != nil || l != 1 { - return nil, fmt.Errorf("error reading from the start-sync descriptor: %v", err) + return nil, fmt.Errorf("reading from the start-sync descriptor: %v", err) } c.Release() @@ -133,13 +133,13 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { // Configure the network. if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { - return fmt.Errorf("error setting up network: %v", err) + return fmt.Errorf("setting up network: %v", err) } // Send a message to the sandbox control server to start the root // container. if err := conn.Call(boot.RootContainerStart, &s.ID, nil); err != nil { - return fmt.Errorf("error starting root container %v: %v", spec.Process.Args, err) + return fmt.Errorf("starting root container: %v", err) } return nil @@ -169,18 +169,18 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string FilePayload: urpc.FilePayload{Files: files}, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { - return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) + return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err) } return nil } // Restore sends the restore call for a container in the sandbox. -func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f string) error { +func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error { log.Debugf("Restore sandbox %q", s.ID) - rf, err := os.Open(f) + rf, err := os.Open(filename) if err != nil { - return fmt.Errorf("os.Open(%q) failed: %v", f, err) + return fmt.Errorf("opening restore file %q failed: %v", filename, err) } defer rf.Close() @@ -207,12 +207,12 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str // Configure the network. if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { - return fmt.Errorf("error setting up network: %v", err) + return fmt.Errorf("setting up network: %v", err) } // Restore the container and start the root container. 
if err := conn.Call(boot.ContainerRestore, &opt, nil); err != nil { - return fmt.Errorf("error restoring container %q: %v", cid, err) + return fmt.Errorf("restoring container %q: %v", cid, err) } return nil @@ -230,7 +230,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { var pl []*control.Process if err := conn.Call(boot.ContainerProcesses, &cid, &pl); err != nil { - return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) + return nil, fmt.Errorf("retrieving process data from sandbox: %v", err) } return pl, nil } @@ -248,7 +248,7 @@ func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { // Send a message to the sandbox control server to start the container. var pid int32 if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil { - return 0, fmt.Errorf("error executing command %q in sandbox: %v", args, err) + return 0, fmt.Errorf("executing command %q in sandbox: %v", args, err) } return pid, nil } @@ -266,7 +266,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { // TODO: Pass in the container id (cid) here. The sandbox // should return events only for that container. if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil { - return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err) + return nil, fmt.Errorf("retrieving event data from sandbox: %v", err) } e.ID = cid return &e, nil @@ -282,7 +282,7 @@ func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { } func (s *Sandbox) connError(err error) error { - return fmt.Errorf("error connecting to control server at PID %d: %v", s.Pid, err) + return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid, err) } // createSandboxProcess starts the sandbox as a subprocess by running the "boot" @@ -305,7 +305,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund if conf.LogFilename != "" { logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) if err != nil { - return fmt.Errorf("error opening log file %q: %v", conf.LogFilename, err) + return fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) } defer logFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, logFile) @@ -315,7 +315,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund if conf.DebugLog != "" { debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot") if err != nil { - return fmt.Errorf("error opening debug log file in %q: %v", conf.DebugLog, err) + return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) } defer debugLogFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile) @@ -333,7 +333,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund sockFD, err := server.CreateSocket(addr) log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". if err != nil { - return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + return fmt.Errorf("creating control server socket for sandbox %q: %v", s.ID, err) } controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket") defer controllerFile.Close() @@ -391,7 +391,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // socket, and return the slave. 
tty, err := console.NewWithSocket(consoleSocket) if err != nil { - return fmt.Errorf("error setting up console with socket %q: %v", consoleSocket, err) + return fmt.Errorf("setting up console with socket %q: %v", consoleSocket, err) } defer tty.Close() @@ -558,13 +558,13 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund if s.Cgroup != nil { cpuNum, err := s.Cgroup.NumCPU() if err != nil { - return fmt.Errorf("error getting cpu count from cgroups: %v", err) + return fmt.Errorf("getting cpu count from cgroups: %v", err) } cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) mem, err := s.Cgroup.MemoryLimit() if err != nil { - return fmt.Errorf("error getting memory limit from cgroups: %v", err) + return fmt.Errorf("getting memory limit from cgroups: %v", err) } // When memory limit is unset, a "large" number is returned. In that case, // just stick with the default. @@ -636,7 +636,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { // "On Unix systems, FindProcess always succeeds and returns a // Process for the given pid, regardless of whether the process // exists." - return ws, fmt.Errorf("FindProcess(%d) failed: %v", s.Pid, err) + return ws, fmt.Errorf("Find process %d: %v", s.Pid, err) } ps, err := p.Wait() if err != nil { @@ -662,7 +662,7 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait ClearStatus: clearStatus, } if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { - return ws, fmt.Errorf("error waiting on PID %d in sandbox %q: %v", pid, s.ID, err) + return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err) } return ws, nil } @@ -680,10 +680,10 @@ func (s *Sandbox) destroy() error { if s.Pid != 0 { log.Debugf("Killing sandbox %q", s.ID) if err := syscall.Kill(s.Pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH { - return fmt.Errorf("error killing sandbox %q PID %q: %v", s.ID, s.Pid, err) + return fmt.Errorf("killing sandbox %q PID %q: %v", s.ID, s.Pid, err) } if err := s.waitForStopped(); err != nil { - return fmt.Errorf("error waiting sandbox %q stop: %v", s.ID, err) + return fmt.Errorf("waiting sandbox %q stop: %v", s.ID, err) } } @@ -712,7 +712,7 @@ func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) erro Mode: mode, } if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { - return fmt.Errorf("err signaling container %q: %v", cid, err) + return fmt.Errorf("signaling container %q: %v", cid, err) } return nil } @@ -741,7 +741,7 @@ func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgPro Mode: mode, } if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { - return fmt.Errorf("err signaling container %q PID %d: %v", cid, pid, err) + return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err) } return nil } @@ -763,7 +763,7 @@ func (s *Sandbox) Checkpoint(cid string, f *os.File) error { } if err := conn.Call(boot.ContainerCheckpoint, &opt, nil); err != nil { - return fmt.Errorf("err checkpointing container %q: %v", cid, err) + return fmt.Errorf("checkpointing container %q: %v", cid, err) } return nil } @@ -778,7 +778,7 @@ func (s *Sandbox) Pause(cid string) error { defer conn.Close() if err := conn.Call(boot.ContainerPause, nil, nil); err != nil { - return fmt.Errorf("err pausing container %q: %v", cid, err) + return fmt.Errorf("pausing container %q: %v", cid, err) } return nil } @@ -793,7 +793,7 @@ func (s *Sandbox) Resume(cid string) error { defer conn.Close() if 
err := conn.Call(boot.ContainerResume, nil, nil); err != nil { - return fmt.Errorf("err resuming container %q: %v", cid, err) + return fmt.Errorf("resuming container %q: %v", cid, err) } return nil } @@ -821,7 +821,7 @@ func (s *Sandbox) Stacks() (string, error) { var stacks string if err := conn.Call(boot.SandboxStacks, nil, &stacks); err != nil { - return "", fmt.Errorf("err getting sandbox %q stacks: %v", s.ID, err) + return "", fmt.Errorf("getting sandbox %q stacks: %v", s.ID, err) } return stacks, nil } @@ -846,7 +846,7 @@ func (s *Sandbox) DestroyContainer(cid string) error { } defer conn.Close() if err := conn.Call(boot.ContainerDestroy, &cid, nil); err != nil { - return fmt.Errorf("error destroying container %q: %v", cid, err) + return fmt.Errorf("destroying container %q: %v", cid, err) } return nil } @@ -889,7 +889,7 @@ func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { return nil, nil } if err != nil { - return nil, fmt.Errorf("error opening device file for platform %q: %v", p, err) + return nil, fmt.Errorf("opening device file for platform %q: %v", p, err) } return f, err } -- cgit v1.2.3 From 5f08f8fd8162fa2fc2ca7b862263081d8d07b206 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 22 Jan 2019 16:45:45 -0800 Subject: Don't bind-mount runsc into a sandbox mntns PiperOrigin-RevId: 230437407 Change-Id: Id9d8ceeb018aad2fe317407c78c6ee0f4b47aa2b --- runsc/cmd/boot.go | 1 - runsc/cmd/chroot.go | 8 -------- runsc/cmd/cmd.go | 9 +++------ runsc/cmd/exec.go | 6 ++---- runsc/container/container.go | 6 ++---- runsc/sandbox/sandbox.go | 5 +---- runsc/specutils/specutils.go | 10 ---------- runsc/test/root/chroot_test.go | 13 ++++--------- 8 files changed, 12 insertions(+), 46 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 7f87b2623..3039b389f 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -129,7 +129,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("error setting up chroot: %v", err) } - specutils.ExePath = "/runsc" if !b.applyCaps { // Remove --setup-root arg to call myself. var args []string diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index ec539a11c..c1acbf26b 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -24,10 +24,6 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) -// chrootBinPath is the location inside the chroot where the runsc binary will -// be mounted. -const chrootBinPath = "/runsc" - // mountInChroot creates the destination mount point in the given chroot and // mounts the source. 
func mountInChroot(chroot, src, dst, typ string, flags uint32) error { @@ -70,10 +66,6 @@ func setUpChroot(pidns bool) error { } } - if err := mountInChroot(chroot, specutils.ExePath, chrootBinPath, "bind", syscall.MS_BIND|syscall.MS_RDONLY); err != nil { - return fmt.Errorf("error mounting runsc in chroot: %v", err) - } - if err := os.Chdir(chroot); err != nil { return fmt.Errorf("error changing working directory: %v", err) } diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index fbfc18fc9..208cf5304 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -80,13 +80,10 @@ func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error { if err := applyCaps(caps); err != nil { return fmt.Errorf("applyCaps() failed: %v", err) } - binPath, err := specutils.BinPath() - if err != nil { - return err - } + binPath := specutils.ExePath log.Infof("Execve %q again, bye!", binPath) - err = syscall.Exec(binPath, args, []string{}) + err := syscall.Exec(binPath, args, []string{}) return fmt.Errorf("error executing %s: %v", binPath, err) } @@ -105,7 +102,7 @@ func callSelfAsNobody(args []string) error { return fmt.Errorf("error setting gid: %v", err) } - binPath := "/runsc" + binPath := specutils.ExePath log.Infof("Execve %q again, bye!", binPath) err := syscall.Exec(binPath, args, []string{}) diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 13584d800..9e058ad97 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -186,10 +186,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { - binPath, err := specutils.BinPath() - if err != nil { - Fatalf("getting bin path: %v", err) - } + binPath := specutils.ExePath var args []string // The command needs to write a pid file so that execAndWait can tell @@ -219,6 +216,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat } cmd := exec.Command(binPath, args...) + cmd.Args[0] = "runsc-exec" // Exec stdio defaults to current process stdio. cmd.Stdin = os.Stdin diff --git a/runsc/container/container.go b/runsc/container/container.go index 2d4b85d9f..6d88dff7f 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -818,12 +818,10 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) } - binPath, err := specutils.BinPath() - if err != nil { - return nil, err - } + binPath := specutils.ExePath cmd := exec.Command(binPath, args...) cmd.ExtraFiles = goferEnds + cmd.Args[0] = "runsc-gofer" // Enter new namespaces to isolate from the rest of the system. Don't unshare // cgroup because gofer is added to a cgroup in the caller's namespace. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 53cb464d2..721a49141 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -292,10 +292,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 - binPath, err := specutils.BinPath() - if err != nil { - return err - } + binPath := specutils.ExePath cmd := exec.Command(binPath, conf.ToFlags()...) 
cmd.SysProcAttr = &syscall.SysProcAttr{} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 7b0dcf231..4e7893ab4 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -315,16 +315,6 @@ func IsSupportedDevMount(m specs.Mount) bool { return true } -// BinPath returns the real path to self, resolving symbolink links. This is done -// to make the process name appears as 'runsc', instead of 'exe'. -func BinPath() (string, error) { - binPath, err := filepath.EvalSymlinks(ExePath) - if err != nil { - return "", fmt.Errorf(`error resolving %q symlink: %v`, ExePath, err) - } - return binPath, nil -} - const ( // ContainerdContainerTypeAnnotation is the OCI annotation set by // containerd to indicate whether the container to create should have diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 04124703d..89f90c3e0 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -26,8 +26,6 @@ import ( "os" "os/exec" "path/filepath" - "reflect" - "sort" "strconv" "strings" "testing" @@ -73,16 +71,13 @@ func TestChroot(t *testing.T) { if err != nil { t.Fatalf("error listing %q: %v", chroot, err) } - if want, got := 2, len(fi); want != got { + if want, got := 1, len(fi); want != got { t.Fatalf("chroot dir got %d entries, want %d", got, want) } - // chroot dir is prepared by runsc and should contains only the executable - // and /proc. - files := []string{fi[0].Name(), fi[1].Name()} - sort.Strings(files) - if want := []string{"proc", "runsc"}; !reflect.DeepEqual(files, want) { - t.Errorf("chroot got children %v, want %v", files, want) + // chroot dir is prepared by runsc and should contains only /proc. + if fi[0].Name() != "proc" { + t.Errorf("chroot got children %v, want %v", fi[0].Name(), "proc") } d.CleanUp() -- cgit v1.2.3 From c28f886c0bb0ff996e07fc133e0ebe1d842b496a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 25 Jan 2019 14:38:10 -0800 Subject: Execute statically linked binary Mounting lib and lib64 are not necessary anymore and simplifies the test. PiperOrigin-RevId: 230971195 Change-Id: Ib91a3ffcec4b322cd3687c337eedbde9641685ed --- runsc/container/BUILD | 2 ++ runsc/container/container_test.go | 27 +++++++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index d9534cbcc..5dfff5c5e 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -65,7 +65,9 @@ go_test( go_binary( name = "test_app", + testonly = 1, srcs = ["test_app.go"], + pure = "on", deps = [ "//runsc/test/testutil", "@com_github_google_subcommands//:go_default_library", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index bd8655f3e..9f3d6b454 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1526,19 +1526,30 @@ func TestGoferExits(t *testing.T) { } func TestRootNotMount(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/true") + if testutil.RaceEnabled { + // Requires statically linked binary, since it's mapping the root to a + // random dir, libs cannot be located. 
+ t.Skip("race makes test_app not statically linked") + } - root, err := ioutil.TempDir(testutil.TmpDir(), "root") + appSym, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + app, err := filepath.EvalSymlinks(appSym) if err != nil { - t.Fatalf("failure to create tmp dir: %v", err) + t.Fatalf("error resolving %q symlink: %v", appSym, err) } + log.Infof("App path %q is a symlink to %q", appSym, app) + + root := filepath.Dir(app) + exe := "/" + filepath.Base(app) + log.Infof("Executing %q in %q", exe, root) + + spec := testutil.NewSpecWithArgs(exe, "help") spec.Root.Path = root spec.Root.Readonly = true - spec.Mounts = []specs.Mount{ - {Destination: "/bin", Source: "/bin", Type: "bind", Options: []string{"ro"}}, - {Destination: "/lib", Source: "/lib", Type: "bind", Options: []string{"ro"}}, - {Destination: "/lib64", Source: "/lib64", Type: "bind", Options: []string{"ro"}}, - } + spec.Mounts = nil conf := testutil.TestConfig() if err := run(spec, conf); err != nil { -- cgit v1.2.3 From c6facd0358ae61849786dbbc0f4f5a07a25cb6f1 Mon Sep 17 00:00:00 2001 From: ShiruRen Date: Fri, 25 Jan 2019 15:01:55 -0800 Subject: Fix a nil pointer dereference bug in Container.Destroy() In Container.Destroy(), we call c.stop() before calling executeHooksBestEffort(), therefore, when we call executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) to execute the poststop hook, it results in a nil pointer dereference since it reads c.Sandbox.Pid in c.State() after the sandbox has been destroyed. To fix this bug, we can change container's status to "stopped" before executing the poststop hook. Signed-off-by: ShiruRen Change-Id: I4d835e430066fab7e599e188f945291adfc521ef PiperOrigin-RevId: 230975505 --- runsc/container/container.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 6d88dff7f..1b410c63a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -674,6 +674,8 @@ func (c *Container) Destroy() error { errs = append(errs, err.Error()) } + c.changeStatus(Stopped) + // "If any poststop hook fails, the runtime MUST log a warning, but the // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. // Based on the OCI, "The post-stop hooks MUST be called after the container is @@ -686,8 +688,6 @@ func (c *Container) Destroy() error { executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } - c.changeStatus(Stopped) - if len(errs) == 0 { return nil } -- cgit v1.2.3 From 55e8eb775b422a7485d6d1dc4f8e4c8fd32096da Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 25 Jan 2019 17:22:04 -0800 Subject: Make cacheRemoteRevalidating detect changes to file size When file size changes outside the sandbox, page cache was not refreshing file size which is required for cacheRemoteRevalidating. In fact, cacheRemoteRevalidating should be skipping the cache completely since it's not really benefiting from it. The cache is cache is already bypassed for unstable attributes (see cachePolicy.cacheUAttrs). And althought the cache is called to map pages, they will always miss the cache and map directly from the host. Created a HostMappable struct that maps directly to the host and use it for files with cacheRemoteRevalidating. 
Closes #124 PiperOrigin-RevId: 230998440 Change-Id: Ic5f632eabe33b47241e05e98c95e9b2090ae08fc --- pkg/sentry/fs/fsutil/BUILD | 2 + pkg/sentry/fs/fsutil/host_mappable.go | 136 ++++++++++++++ pkg/sentry/fs/fsutil/host_mappable_state.go | 22 +++ pkg/sentry/fs/gofer/cache_policy.go | 22 ++- pkg/sentry/fs/gofer/file.go | 9 +- pkg/sentry/fs/gofer/inode.go | 30 +++- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 14 +- runsc/container/BUILD | 1 + runsc/container/container_test.go | 137 -------------- runsc/container/shared_volume_test.go | 267 ++++++++++++++++++++++++++++ 11 files changed, 486 insertions(+), 156 deletions(-) create mode 100644 pkg/sentry/fs/fsutil/host_mappable.go create mode 100644 pkg/sentry/fs/fsutil/host_mappable_state.go create mode 100644 runsc/container/shared_volume_test.go (limited to 'runsc') diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 4965e1a5f..d4767642b 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -70,6 +70,8 @@ go_library( "host_file_mapper.go", "host_file_mapper_state.go", "host_file_mapper_unsafe.go", + "host_mappable.go", + "host_mappable_state.go", "inode.go", "inode_cached.go", ], diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go new file mode 100644 index 000000000..4e4bcf4a4 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -0,0 +1,136 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// HostMappable implements memmap.Mappable and platform.File over an arbitrary +// host file descriptor. +// +// +stateify savable +type HostMappable struct { + hostFileMapper *HostFileMapper + + mu sync.Mutex `state:"nosave"` + + // fd is the file descriptor to the host. Protected by mu. + fd int `state:"nosave"` + + // mappings tracks mappings of the cached file object into + // memmap.MappingSpaces so it can invalidated upon save. Protected by mu. + mappings memmap.MappingSet +} + +// NewHostMappable creates a new mappable that maps directly to host FD. +func NewHostMappable() *HostMappable { + return &HostMappable{ + hostFileMapper: NewHostFileMapper(), + fd: -1, + } +} + +func (h *HostMappable) getFD() int { + h.mu.Lock() + defer h.mu.Unlock() + if h.fd < 0 { + panic("HostMappable FD isn't set") + } + return h.fd +} + +// UpdateFD sets the host FD iff FD hasn't been set before or if there are +// no mappings. +func (h *HostMappable) UpdateFD(fd int) { + h.mu.Lock() + defer h.mu.Unlock() + h.fd = fd +} + +// AddMapping implements memmap.Mappable.AddMapping. 
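+// It records the mapping and takes a reference on each newly mapped range so
+// that the corresponding host pages stay mapped while they are in use.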
+func (h *HostMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + // Hot path. Avoid defers. + h.mu.Lock() + mapped := h.mappings.AddMapping(ms, ar, offset, writable) + for _, r := range mapped { + h.hostFileMapper.IncRefOn(r) + } + h.mu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (h *HostMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + // Hot path. Avoid defers. + h.mu.Lock() + unmapped := h.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + h.hostFileMapper.DecRefOn(r) + } + h.mu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (h *HostMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return h.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (h *HostMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + return []memmap.Translation{ + { + Source: optional, + File: h, + Offset: optional.Start, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (h *HostMappable) InvalidateUnsavable(ctx context.Context) error { + h.mu.Lock() + h.mappings.InvalidateAll(memmap.InvalidateOpts{}) + h.mu.Unlock() + return nil +} + +// MapInto implements platform.File.MapInto. +func (h *HostMappable) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + return as.MapFile(addr, h.getFD(), fr, at, precommit) +} + +// MapInternal implements platform.File.MapInternal. +func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return h.hostFileMapper.MapInternal(fr, h.getFD(), at.Write) +} + +// IncRef implements platform.File.IncRef. +func (h *HostMappable) IncRef(fr platform.FileRange) { + mr := memmap.MappableRange{Start: fr.Start, End: fr.End} + h.hostFileMapper.IncRefOn(mr) +} + +// DecRef implements platform.File.DecRef. +func (h *HostMappable) DecRef(fr platform.FileRange) { + mr := memmap.MappableRange{Start: fr.Start, End: fr.End} + h.hostFileMapper.DecRefOn(mr) +} diff --git a/pkg/sentry/fs/fsutil/host_mappable_state.go b/pkg/sentry/fs/fsutil/host_mappable_state.go new file mode 100644 index 000000000..765f1ec87 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_mappable_state.go @@ -0,0 +1,22 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +// afterLoad is invoked by stateify. 
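+// The host FD is not saved, so it is reset to -1 here and must be
+// re-established with UpdateFD before the mappable is used after restore.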
+func (h *HostMappable) afterLoad() { + h.mu.Lock() + defer h.mu.Unlock() + h.fd = -1 +} diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 3d380f0e8..507d6900f 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -90,17 +90,29 @@ func (cp cachePolicy) cacheReaddir() bool { return cp == cacheAll || cp == cacheAllWritethrough } -// usePageCache determines whether the page cache should be used for the given -// inode. If the remote filesystem donates host FDs to the sentry, then the -// host kernel's page cache will be used, otherwise we will use a +// useCachingInodeOps determines whether the page cache should be used for the +// given inode. If the remote filesystem donates host FDs to the sentry, then +// the host kernel's page cache will be used, otherwise we will use a // sentry-internal page cache. -func (cp cachePolicy) usePageCache(inode *fs.Inode) bool { +func (cp cachePolicy) useCachingInodeOps(inode *fs.Inode) bool { // Do cached IO for regular files only. Some "character devices" expect // no caching. if !fs.IsFile(inode.StableAttr) { return false } - return cp == cacheAll || cp == cacheAllWritethrough || cp == cacheRemoteRevalidating + return cp == cacheAll || cp == cacheAllWritethrough +} + +// cacheHandles determine whether handles need to be cached with the given +// inode. Handles must be cached when inode can be mapped into memory to +// implement InodeOperations.Mappable with stable handles. +func (cp cachePolicy) cacheHandles(inode *fs.Inode) bool { + // Do cached IO for regular files only. Some "character devices" expect + // no caching. + if !fs.IsFile(inode.StableAttr) { + return false + } + return cp.useCachingInodeOps(inode) || cp == cacheRemoteRevalidating } // writeThough indicates whether writes to the file should be synced to the diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 3578b07a0..2181ddc68 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -204,7 +204,7 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I return 0, syserror.EISDIR } cp := f.inodeOperations.session().cachePolicy - if cp.usePageCache(file.Dirent.Inode) { + if cp.useCachingInodeOps(file.Dirent.Inode) { n, err := f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) if err != nil { return n, err @@ -225,7 +225,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO return 0, syserror.EISDIR } - if f.inodeOperations.session().cachePolicy.usePageCache(file.Dirent.Inode) { + if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) } return dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) @@ -267,10 +267,7 @@ func (f *fileOperations) Flush(ctx context.Context, file *fs.File) error { // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - if !f.inodeOperations.session().cachePolicy.usePageCache(file.Dirent.Inode) { - return syserror.ENODEV - } - return fsutil.GenericConfigureMMap(file, f.inodeOperations.cachingInodeOps, opts) + return f.inodeOperations.configureMMap(file, opts) } // Seek implements fs.FileOperations.Seek. 
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index f0dc99fd0..043705c58 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -125,6 +125,10 @@ type inodeFileState struct { // failures. S/R is transparent to Sentry and the latter will continue // using its cached values after restore. savedUAttr *fs.UnstableAttr + + // hostMappable is created when using 'cacheRemoteRevalidating' to map pages + // directly from host. + hostMappable *fsutil.HostMappable } // Release releases file handles. @@ -166,6 +170,9 @@ func (i *inodeFileState) setHandlesForCachedIO(flags fs.FileFlags, h *handles) { i.writebackRW = true } } + if i.hostMappable != nil { + i.hostMappable.UpdateFD(i.fdLocked()) + } } // getCachedHandles returns any cached handles which would accelerate @@ -287,7 +294,10 @@ func (i *inodeFileState) Sync(ctx context.Context) error { func (i *inodeFileState) FD() int { i.handlesMu.RLock() defer i.handlesMu.RUnlock() + return i.fdLocked() +} +func (i *inodeFileState) fdLocked() int { // Assert that the file was actually opened. if i.writeback == nil && i.readthrough == nil { panic("cannot get host FD for a file that was never opened") @@ -344,9 +354,13 @@ func (i *inodeOperations) Release(ctx context.Context) { // Mappable implements fs.InodeOperations.Mappable. func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { - if i.session().cachePolicy.usePageCache(inode) { + if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps } + // This check is necessary because it's returning an interface type. + if i.fileState.hostMappable != nil { + return i.fileState.hostMappable + } return nil } @@ -434,7 +448,7 @@ func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (* } func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - if !i.session().cachePolicy.usePageCache(d.Inode) { + if !i.session().cachePolicy.cacheHandles(d.Inode) { h, err := newHandles(ctx, i.fileState.file, flags) if err != nil { return nil, err @@ -503,7 +517,7 @@ func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts // Truncate implements fs.InodeOperations.Truncate. func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error { // This can only be called for files anyway. 
- if i.session().cachePolicy.usePageCache(inode) { + if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps.Truncate(ctx, inode, length) } @@ -561,6 +575,16 @@ func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { return info, nil } +func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) error { + if i.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { + return fsutil.GenericConfigureMMap(file, i.cachingInodeOps, opts) + } + if i.fileState.hostMappable != nil { + return fsutil.GenericConfigureMMap(file, i.fileState.hostMappable, opts) + } + return syserror.ENODEV +} + func init() { syserror.AddErrorUnwrapper(func(err error) (syscall.Errno, bool) { if _, ok := err.(p9.ErrSocket); ok { diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index a324dc990..faedfb81c 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -128,7 +128,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string File: newFile, Host: hostFile, } - if iops.session().cachePolicy.usePageCache(d.Inode) { + if iops.session().cachePolicy.cacheHandles(d.Inode) { iops.fileState.setHandlesForCachedIO(flags, h) } return NewFile(ctx, d, name, flags, iops, h), nil diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index f76a83cd9..b5b1c8202 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -197,11 +197,17 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p } } + var hm *fsutil.HostMappable + if s.cachePolicy == cacheRemoteRevalidating && fs.IsFile(sattr) { + hm = fsutil.NewHostMappable() + } + fileState := &inodeFileState{ - s: s, - file: file, - sattr: sattr, - key: deviceKey, + s: s, + file: file, + sattr: sattr, + key: deviceKey, + hostMappable: hm, } uattr := unstable(ctx, valid, attr, s.mounter, s.client) diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 5dfff5c5e..354ce2661 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -36,6 +36,7 @@ go_test( "container_test.go", "fs_test.go", "multi_container_test.go", + "shared_volume_test.go", ], data = [ ":test_app", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9f3d6b454..06a25de6d 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1353,143 +1353,6 @@ func TestAbbreviatedIDs(t *testing.T) { } } -// Check that modifications to a volume mount are propigated into and out of -// the sandbox. -func TestContainerVolumeContentsShared(t *testing.T) { - // Only run this test with shared file access, since that is the only - // behavior it is testing. - conf := testutil.TestConfig() - conf.FileAccess = boot.FileAccessShared - t.Logf("Running test with conf: %+v", conf) - - // Main process just sleeps. We will use "exec" to probe the state of - // the filesystem. - spec := testutil.NewSpecWithArgs("sleep", "1000") - - dir, err := ioutil.TempDir(testutil.TmpDir(), "root-fs-test") - if err != nil { - t.Fatalf("TempDir failed: %v", err) - } - - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create and start the container. 
- c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - // File that will be used to check consistency inside/outside sandbox. - filename := filepath.Join(dir, "file") - - // File does not exist yet. Reading from the sandbox should fail. - argsTestFile := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", filename}, - } - if ws, err := c.executeSync(argsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", filename, err) - } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) - } - - // Create the file from outside of the sandbox. - if err := ioutil.WriteFile(filename, []byte("foobar"), 0777); err != nil { - t.Fatalf("error writing to file %q: %v", filename, err) - } - - // Now we should be able to test the file from within the sandbox. - if ws, err := c.executeSync(argsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", filename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) - } - - // Rename the file from outside of the sandbox. - newFilename := filepath.Join(dir, "newfile") - if err := os.Rename(filename, newFilename); err != nil { - t.Fatalf("os.Rename(%q, %q) failed: %v", filename, newFilename, err) - } - - // File should no longer exist at the old path within the sandbox. - if ws, err := c.executeSync(argsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", filename, err) - } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) - } - - // We should be able to test the new filename from within the sandbox. - argsTestNewFile := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", newFilename}, - } - if ws, err := c.executeSync(argsTestNewFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", newFilename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) - } - - // Delete the renamed file from outside of the sandbox. - if err := os.Remove(newFilename); err != nil { - t.Fatalf("error removing file %q: %v", filename, err) - } - - // Renamed file should no longer exist at the old path within the sandbox. - if ws, err := c.executeSync(argsTestNewFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", newFilename, err) - } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) - } - - // Now create the file from WITHIN the sandbox. - argsTouch := &control.ExecArgs{ - Filename: "/usr/bin/touch", - Argv: []string{"touch", filename}, - KUID: auth.KUID(os.Getuid()), - KGID: auth.KGID(os.Getgid()), - } - if ws, err := c.executeSync(argsTouch); err != nil { - t.Fatalf("unexpected error touching file %q: %v", filename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) - } - - // File should exist outside the sandbox. - if _, err := os.Stat(filename); err != nil { - t.Errorf("stat %q got error %v, wanted nil", filename, err) - } - - // File should exist outside the sandbox. 
- if _, err := os.Stat(filename); err != nil { - t.Errorf("stat %q got error %v, wanted nil", filename, err) - } - - // Delete the file from within the sandbox. - argsRemove := &control.ExecArgs{ - Filename: "/bin/rm", - Argv: []string{"rm", filename}, - } - if ws, err := c.executeSync(argsRemove); err != nil { - t.Fatalf("unexpected error removing file %q: %v", filename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) - } - - // File should not exist outside the sandbox. - if _, err := os.Stat(filename); !os.IsNotExist(err) { - t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err) - } -} - func TestGoferExits(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") conf := testutil.TestConfig() diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go new file mode 100644 index 000000000..8f81ed630 --- /dev/null +++ b/runsc/container/shared_volume_test.go @@ -0,0 +1,267 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// TestSharedVolume checks that modifications to a volume mount are propagated +// into and out of the sandbox. +func TestSharedVolume(t *testing.T) { + conf := testutil.TestConfig() + conf.FileAccess = boot.FileAccessShared + t.Logf("Running test with conf: %+v", conf) + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. + spec := testutil.NewSpecWithArgs("sleep", "1000") + + dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + filename := filepath.Join(dir, "file") + + // File does not exist yet. Reading from the sandbox should fail. 
+ argsTestFile := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", filename}, + } + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) + } + + // Create the file from outside of the sandbox. + if err := ioutil.WriteFile(filename, []byte("foobar"), 0777); err != nil { + t.Fatalf("error writing to file %q: %v", filename, err) + } + + // Now we should be able to test the file from within the sandbox. + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // Rename the file from outside of the sandbox. + newFilename := filepath.Join(dir, "newfile") + if err := os.Rename(filename, newFilename); err != nil { + t.Fatalf("os.Rename(%q, %q) failed: %v", filename, newFilename, err) + } + + // File should no longer exist at the old path within the sandbox. + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) + } + + // We should be able to test the new filename from within the sandbox. + argsTestNewFile := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", newFilename}, + } + if ws, err := c.executeSync(argsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) + } + + // Delete the renamed file from outside of the sandbox. + if err := os.Remove(newFilename); err != nil { + t.Fatalf("error removing file %q: %v", filename, err) + } + + // Renamed file should no longer exist at the old path within the sandbox. + if ws, err := c.executeSync(argsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) + } + + // Now create the file from WITHIN the sandbox. + argsTouch := &control.ExecArgs{ + Filename: "/usr/bin/touch", + Argv: []string{"touch", filename}, + KUID: auth.KUID(os.Getuid()), + KGID: auth.KGID(os.Getgid()), + } + if ws, err := c.executeSync(argsTouch); err != nil { + t.Fatalf("unexpected error touching file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) + } + + // Delete the file from within the sandbox. 
+ argsRemove := &control.ExecArgs{ + Filename: "/bin/rm", + Argv: []string{"rm", filename}, + } + if ws, err := c.executeSync(argsRemove); err != nil { + t.Fatalf("unexpected error removing file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // File should not exist outside the sandbox. + if _, err := os.Stat(filename); !os.IsNotExist(err) { + t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err) + } +} + +func checkFile(c *Container, filename string, want []byte) error { + cpy := filename + ".copy" + argsCp := &control.ExecArgs{ + Filename: "/bin/cp", + Argv: []string{"cp", "-f", filename, cpy}, + } + if _, err := c.executeSync(argsCp); err != nil { + return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err) + } + got, err := ioutil.ReadFile(cpy) + if err != nil { + return fmt.Errorf("Error reading file %q: %v", filename, err) + } + if !bytes.Equal(got, want) { + return fmt.Errorf("file content inside the sandbox is wrong, got: %q, want: %q", got, want) + } + return nil +} + +// TestSharedVolumeFile tests that changes to file content outside the sandbox +// is reflected inside. +func TestSharedVolumeFile(t *testing.T) { + conf := testutil.TestConfig() + conf.FileAccess = boot.FileAccessShared + t.Logf("Running test with conf: %+v", conf) + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. + spec := testutil.NewSpecWithArgs("sleep", "1000") + + dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + filename := filepath.Join(dir, "file") + + // Write file from outside the container and check that the same content is + // read inside. + want := []byte("host-") + if err := ioutil.WriteFile(filename, []byte(want), 0666); err != nil { + t.Fatalf("Error writing to %q: %v", filename, err) + } + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Append to file inside the container and check that content is not lost. + argsAppend := &control.ExecArgs{ + Filename: "/bin/bash", + Argv: []string{"bash", "-c", "echo -n sandbox- >> " + filename}, + } + if _, err := c.executeSync(argsAppend); err != nil { + t.Fatalf("unexpected error appending file %q: %v", filename, err) + } + want = []byte("host-sandbox-") + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Write again from outside the container and check that the same content is + // read inside. 
+ f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0) + if err != nil { + t.Fatalf("Error openning file %q: %v", filename, err) + } + defer f.Close() + if _, err := f.Write([]byte("host")); err != nil { + t.Fatalf("Error writing to file %q: %v", filename, err) + } + want = []byte("host-sandbox-host") + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Shrink file outside and check that the same content is read inside. + if err := f.Truncate(5); err != nil { + t.Fatalf("Error truncating file %q: %v", filename, err) + } + want = want[:5] + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } +} -- cgit v1.2.3 From 52b3cd873dafcc114c00fd95111bcf57818b7903 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 28 Jan 2019 11:57:22 -0800 Subject: runsc: Only uninstall cgroup for sandbox stop. PiperOrigin-RevId: 231263114 Change-Id: I57467a34fe94e395fdd3685462c4fe9776d040a3 --- runsc/container/container.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 1b410c63a..37969d8c5 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -721,7 +721,10 @@ func (c *Container) stop() error { if err := c.Sandbox.DestroyContainer(c.ID); err != nil { return fmt.Errorf("destroying container %q: %v", c.ID, err) } - cgroup = c.Sandbox.Cgroup + // Only uninstall cgroup for sandbox stop. + if c.Sandbox.IsRootContainer(c.ID) { + cgroup = c.Sandbox.Cgroup + } // Only set sandbox to nil after it has been told to destroy the container. c.Sandbox = nil } -- cgit v1.2.3 From b44699c5299bb0fc1b16d25a9ac2250cf0a7446d Mon Sep 17 00:00:00 2001 From: Shijiang Wei Date: Mon, 28 Jan 2019 17:19:18 -0800 Subject: check isRootNS by ns inode Signed-off-by: Shijiang Wei Change-Id: I032f834edae5c716fb2d3538285eec07aa11a902 PiperOrigin-RevId: 231318438 --- runsc/sandbox/network.go | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'runsc') diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 8ec320d09..ec0a252d1 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -121,16 +121,17 @@ func joinNetNS(nsPath string) (func(), error) { } // isRootNS determines whether we are running in the root net namespace. -// -// TODO: Find a better way to detect root network. -func isRootNS(ifaces []net.Interface) bool { - for _, iface := range ifaces { - if iface.Name == "docker0" { - return true - } +// /proc/sys/net/core/rmem_default only exists in root network namespace. +func isRootNS() (bool, error) { + err := syscall.Access("/proc/sys/net/core/rmem_default", syscall.F_OK) + switch err { + case nil: + return true, nil + case syscall.ENOENT: + return false, nil + default: + return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err) } - return false - } // createInterfacesAndRoutesFromNS scrapes the interface and routes from the @@ -150,8 +151,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { return fmt.Errorf("querying interfaces: %v", err) } - if isRootNS(ifaces) { - return fmt.Errorf("cannot run in with network enabled in root network namespace") + isRoot, err := isRootNS() + if err != nil { + return err + } + if isRoot { + + return fmt.Errorf("cannot run with network enabled in root network namespace") } // Collect addresses and routes from the interfaces. 
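The heuristic above can be exercised on its own; the sketch below is not part of the patch and assumes, as the patch does, that /proc/sys/net/core/rmem_default is only visible from the root network namespace.

package main

import (
	"fmt"
	"syscall"
)

func main() {
	switch err := syscall.Access("/proc/sys/net/core/rmem_default", syscall.F_OK); err {
	case nil:
		fmt.Println("root network namespace")
	case syscall.ENOENT:
		fmt.Println("non-root network namespace")
	default:
		fmt.Printf("access failed: %v\n", err)
	}
}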
-- cgit v1.2.3 From 24cb2c0a7256cdb515c2fc2cfc90d130e2a405ef Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Tue, 29 Jan 2019 01:37:54 -0800 Subject: Use recvmmsg() instead of readv() to read packets from NIC. This should reduce the number of syscalls required to process packets significantly and improve throughputs. PiperOrigin-RevId: 231366886 Change-Id: I8b38077262bf9c53176bc4a94b530188d3d7c0ca --- pkg/tcpip/link/fdbased/BUILD | 1 + pkg/tcpip/link/fdbased/endpoint.go | 166 ++++++++++++++++++++++++++----- pkg/tcpip/link/fdbased/endpoint_test.go | 24 ++++- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 31 +++++- runsc/boot/filter/config.go | 14 +-- runsc/boot/network.go | 1 + 6 files changed, 199 insertions(+), 38 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 94391433c..a4aa3feec 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -27,6 +27,7 @@ go_test( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/link/rawfile", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 3a79d13d4..87c8ab1fc 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -24,6 +24,7 @@ package fdbased import ( + "fmt" "syscall" "gvisor.googlesource.com/gvisor/pkg/tcpip" @@ -33,9 +34,19 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" ) +const ( + // MaxMsgsPerRecv is the maximum number of packets we want to retrieve + // in a single RecvMMsg call. + MaxMsgsPerRecv = 8 +) + // BufConfig defines the shape of the vectorised view used to read packets from the NIC. var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} +// linkDispatcher reads packets from the link FD and dispatches them to the +// NetworkDispatcher. +type linkDispatcher func() (bool, *tcpip.Error) + type endpoint struct { // fd is the file descriptor used to send and receive packets. fd int @@ -57,14 +68,25 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - iovecs []syscall.Iovec - views []buffer.View - dispatcher stack.NetworkDispatcher + views [][]buffer.View + iovecs [][]syscall.Iovec + msgHdrs []rawfile.MMsgHdr + inboundDispatcher linkDispatcher + dispatcher stack.NetworkDispatcher // handleLocal indicates whether packets destined to itself should be // handled by the netstack internally (true) or be forwarded to the FD // endpoint (false). handleLocal bool + + // useRecvMMsg enables use of recvmmsg() syscall instead of readv() to + // read inbound packets. This reduces # of syscalls needed to process + // packets. + // + // NOTE: recvmmsg() is only supported for sockets, so if the underlying + // FD is not a socket then the code will still fall back to the readv() + // path. + useRecvMMsg bool } // Options specify the details about the fd-based endpoint to be created. @@ -78,6 +100,7 @@ type Options struct { SaveRestore bool DisconnectOk bool HandleLocal bool + UseRecvMMsg bool } // New creates a new fd-based endpoint. @@ -85,7 +108,10 @@ type Options struct { // Makes fd non-blocking, but does not take ownership of fd, which must remain // open for the lifetime of the returned endpoint. func New(opts *Options) tcpip.LinkEndpointID { - syscall.SetNonblock(opts.FD, true) + if err := syscall.SetNonblock(opts.FD, true); err != nil { + // TODO : replace panic with an error return. 
+ panic(fmt.Sprintf("syscall.SetNonblock(%v) failed: %v", opts.FD, err)) + } caps := stack.LinkEndpointCapabilities(0) if opts.ChecksumOffload { @@ -113,13 +139,44 @@ func New(opts *Options) tcpip.LinkEndpointID { closed: opts.ClosedFunc, addr: opts.Address, hdrSize: hdrSize, - views: make([]buffer.View, len(BufConfig)), - iovecs: make([]syscall.Iovec, len(BufConfig)), handleLocal: opts.HandleLocal, + useRecvMMsg: opts.UseRecvMMsg, + } + // For non-socket FDs we read one packet a time (e.g. TAP devices) + msgsPerRecv := 1 + e.inboundDispatcher = e.dispatch + // If the provided FD is a socket then we optimize packet reads by + // using recvmmsg() instead of read() to read packets in a batch. + if isSocketFD(opts.FD) && e.useRecvMMsg { + e.inboundDispatcher = e.recvMMsgDispatch + msgsPerRecv = MaxMsgsPerRecv + } + + e.views = make([][]buffer.View, msgsPerRecv) + for i, _ := range e.views { + e.views[i] = make([]buffer.View, len(BufConfig)) + } + e.iovecs = make([][]syscall.Iovec, msgsPerRecv) + for i, _ := range e.iovecs { + e.iovecs[i] = make([]syscall.Iovec, len(BufConfig)) + } + e.msgHdrs = make([]rawfile.MMsgHdr, msgsPerRecv) + for i, _ := range e.msgHdrs { + e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0] + e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig)) } return stack.RegisterLinkEndpoint(e) } +func isSocketFD(fd int) bool { + var stat syscall.Stat_t + if err := syscall.Fstat(fd, &stat); err != nil { + // TODO : replace panic with an error return. + panic(fmt.Sprintf("syscall.Fstat(%v,...) failed: %v", fd, err)) + } + return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK +} + // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { @@ -191,12 +248,12 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b return rawfile.NonBlockingWrite2(e.fd, hdr.View(), payload.ToView()) } -func (e *endpoint) capViews(n int, buffers []int) int { +func (e *endpoint) capViews(k, n int, buffers []int) int { c := 0 for i, s := range buffers { c += s if c >= n { - e.views[i].CapLength(s - (c - n)) + e.views[k][i].CapLength(s - (c - n)) return i + 1 } } @@ -204,24 +261,26 @@ func (e *endpoint) capViews(n int, buffers []int) int { } func (e *endpoint) allocateViews(bufConfig []int) { - for i, v := range e.views { - if v != nil { - break - } - b := buffer.NewView(bufConfig[i]) - e.views[i] = b - e.iovecs[i] = syscall.Iovec{ - Base: &b[0], - Len: uint64(len(b)), + for k := 0; k < len(e.views); k++ { + for i := 0; i < len(bufConfig); i++ { + if e.views[k][i] != nil { + break + } + b := buffer.NewView(bufConfig[i]) + e.views[k][i] = b + e.iovecs[k][i] = syscall.Iovec{ + Base: &b[0], + Len: uint64(len(b)), + } } } } // dispatch reads one packet from the file descriptor and dispatches it. 
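+// It reads a single packet with readv() into e.views[0]/e.iovecs[0] and is
+// used whenever recvmmsg() batching is not in effect (e.g. non-socket FDs).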
-func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { +func (e *endpoint) dispatch() (bool, *tcpip.Error) { e.allocateViews(BufConfig) - n, err := rawfile.BlockingReadv(e.fd, e.iovecs) + n, err := rawfile.BlockingReadv(e.fd, e.iovecs[0]) if err != nil { return false, err } @@ -235,14 +294,14 @@ func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { remote, local tcpip.LinkAddress ) if e.hdrSize > 0 { - eth := header.Ethernet(e.views[0]) + eth := header.Ethernet(e.views[0][0]) p = eth.Type() remote = eth.SourceAddress() local = eth.DestinationAddress() } else { // We don't get any indication of what the packet is, so try to guess // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(e.views[0]) { + switch header.IPVersion(e.views[0][0]) { case header.IPv4Version: p = header.IPv4ProtocolNumber case header.IPv6Version: @@ -252,15 +311,71 @@ func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { } } - used := e.capViews(n, BufConfig) - vv := buffer.NewVectorisedView(n, e.views[:used]) + used := e.capViews(0, n, BufConfig) + vv := buffer.NewVectorisedView(n, e.views[0][:used]) vv.TrimFront(e.hdrSize) e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv) // Prepare e.views for another packet: release used views. for i := 0; i < used; i++ { - e.views[i] = nil + e.views[0][i] = nil + } + + return true, nil +} + +// recvMMsgDispatch reads more than one packet at a time from the file +// descriptor and dispatches it. +func (e *endpoint) recvMMsgDispatch() (bool, *tcpip.Error) { + e.allocateViews(BufConfig) + + nMsgs, err := rawfile.BlockingRecvMMsg(e.fd, e.msgHdrs) + if err != nil { + return false, err + } + // Process each of received packets. + for k := 0; k < nMsgs; k++ { + n := e.msgHdrs[k].Len + if n <= uint32(e.hdrSize) { + return false, nil + } + + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if e.hdrSize > 0 { + eth := header.Ethernet(e.views[k][0]) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(e.views[k][0]) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + used := e.capViews(k, int(n), BufConfig) + vv := buffer.NewVectorisedView(int(n), e.views[k][:used]) + vv.TrimFront(e.hdrSize) + e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv) + + // Prepare e.views for another packet: release used views. + for i := 0; i < used; i++ { + e.views[k][i] = nil + } + } + + for k := 0; k < nMsgs; k++ { + e.msgHdrs[k].Len = 0 } return true, nil @@ -269,9 +384,8 @@ func (e *endpoint) dispatch(largeV buffer.View) (bool, *tcpip.Error) { // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. 
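+// The read strategy is chosen once in New(): readv() for plain FDs, and
+// recvmmsg() batching for socket FDs when UseRecvMMsg is set, selected via
+// e.inboundDispatcher.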
func (e *endpoint) dispatchLoop() *tcpip.Error { - v := buffer.NewView(header.MaxIPPacketSize) for { - cont, err := e.dispatch(v) + cont, err := e.inboundDispatcher() if err != nil || !cont { if e.closed != nil { e.closed(err) diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 226639443..14abacdf2 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -28,6 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" ) @@ -309,9 +310,22 @@ func TestBufConfigFirst(t *testing.T) { func build(bufConfig []int) *endpoint { e := &endpoint{ - views: make([]buffer.View, len(bufConfig)), - iovecs: make([]syscall.Iovec, len(bufConfig)), + views: make([][]buffer.View, MaxMsgsPerRecv), + iovecs: make([][]syscall.Iovec, MaxMsgsPerRecv), + msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv), } + + for i, _ := range e.views { + e.views[i] = make([]buffer.View, len(bufConfig)) + } + for i := range e.iovecs { + e.iovecs[i] = make([]syscall.Iovec, len(bufConfig)) + } + for k, msgHdr := range e.msgHdrs { + msgHdr.Msg.Iov = &e.iovecs[k][0] + msgHdr.Msg.Iovlen = uint64(len(bufConfig)) + } + e.allocateViews(bufConfig) return e } @@ -356,12 +370,12 @@ var capLengthTestCases = []struct { func TestCapLength(t *testing.T) { for _, c := range capLengthTestCases { e := build(c.config) - used := e.capViews(c.n, c.config) + used := e.capViews(0, c.n, c.config) if used != c.wantUsed { t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) } - lengths := make([]int, len(e.views)) - for i, v := range e.views { + lengths := make([]int, len(e.views[0])) + for i, v := range e.views[0] { lengths[i] = len(v) } if !reflect.DeepEqual(lengths, c.wantLengths) { diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index be4a4fa9c..5deea093a 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -124,7 +124,7 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { // BlockingReadv reads from a file descriptor that is set up as non-blocking and // stores the data in a list of iovecs buffers. If no data is available, it will -// block in a poll() syscall until the file descirptor becomes readable. +// block in a poll() syscall until the file descriptor becomes readable. func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { for { n, _, e := syscall.RawSyscall(syscall.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs))) @@ -143,3 +143,32 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { } } } + +// MMsgHdr represents the mmsg_hdr structure required by recvmmsg() on linux. +type MMsgHdr struct { + Msg syscall.Msghdr + Len uint32 + _ [4]byte +} + +// BlockingRecvMMsg reads from a file descriptor that is set up as non-blocking +// and stores the received messages in a slice of MMsgHdr structures. If no data +// is available, it will block in a poll() syscall until the file descriptor +// becomes readable. 
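+// It returns the number of messages received; the kernel writes the length of
+// each received message into the corresponding MMsgHdr.Len field.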
+func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) { + for { + n, _, e := syscall.RawSyscall6(syscall.SYS_RECVMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0) + if e == 0 { + return int(n), nil + } + + event := pollEvent{ + fd: int32(fd), + events: 1, // POLLIN + } + + if _, e := blockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { + return 0, TranslateErrno(e) + } + } +} diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 83c1fbcce..bde749861 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -211,23 +211,25 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_PREAD64: {}, syscall.SYS_PWRITE64: {}, syscall.SYS_READ: {}, - syscall.SYS_READV: []seccomp.Rule{ + syscall.SYS_RECVMSG: []seccomp.Rule{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(len(fdbased.BufConfig)), + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), }, - }, - syscall.SYS_RECVMSG: []seccomp.Rule{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), }, + }, + syscall.SYS_RECVMMSG: []seccomp.Rule{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + seccomp.AllowValue(fdbased.MaxMsgsPerRecv), + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), }, }, syscall.SYS_RESTART_SYSCALL: {}, diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 89f186139..83d56f93a 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -140,6 +140,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct EthernetHeader: true, HandleLocal: true, Address: mac, + UseRecvMMsg: true, }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) -- cgit v1.2.3 From dd577f5410a90d31a927b7b0fd6c4bb32b34b9f9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 29 Jan 2019 17:14:26 -0800 Subject: runsc: reap a sandbox process only in sandbox.Wait() PiperOrigin-RevId: 231504064 Change-Id: I585b769aef04a3ad7e7936027958910a6eed9c8d --- runsc/boot/controller.go | 2 +- runsc/sandbox/sandbox.go | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 989f49388..23d476f7f 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -235,7 +235,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { - log.Debugf("containerManager.Start failed %q: %+v", args.CID, args) + log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) return err } log.Debugf("Container %q started", args.CID) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 721a49141..ce8c21681 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -21,6 +21,7 @@ import ( "os" "os/exec" "strconv" + "sync" "syscall" "time" @@ -64,6 +65,12 @@ type Sandbox struct { // This field isn't saved to json, because only a creator of sandbox // will have it as a child process. child bool + + // status is an exit status of a sandbox process. + status syscall.WaitStatus + + // statusMu protects status. 
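+ // status is only meaningful when the sandbox process is a child of the
+ // current process (s.child) and has been reaped by waitForStopped().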
+ statusMu sync.Mutex } // New creates the sandbox process. The caller must call Destroy() on the @@ -628,18 +635,13 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { // Wait RPC. The best we can do is ask Linux what the sandbox exit // status was, since in most cases that will be the same as the // container exit status. - p, err := os.FindProcess(s.Pid) - if err != nil { - // "On Unix systems, FindProcess always succeeds and returns a - // Process for the given pid, regardless of whether the process - // exists." - return ws, fmt.Errorf("Find process %d: %v", s.Pid, err) + if err := s.waitForStopped(); err != nil { + return ws, err } - ps, err := p.Wait() - if err != nil { - return ws, fmt.Errorf("sandbox no longer running, tried to get exit status, but Wait failed: %v", err) + if !s.child { + return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable") } - return ps.Sys().(syscall.WaitStatus), nil + return s.status, nil } // WaitPID waits for process 'pid' in the container's sandbox and returns its @@ -853,10 +855,15 @@ func (s *Sandbox) waitForStopped() error { defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if s.child && s.Pid != 0 { + if s.child { + s.statusMu.Lock() + defer s.statusMu.Unlock() + if s.Pid == 0 { + return nil + } // The sandbox process is a child of the current process, // so we can wait it and collect its zombie. - wpid, err := syscall.Wait4(int(s.Pid), nil, syscall.WNOHANG, nil) + wpid, err := syscall.Wait4(int(s.Pid), &s.status, syscall.WNOHANG, nil) if err != nil { return fmt.Errorf("error waiting the sandbox process: %v", err) } -- cgit v1.2.3 From 7e8a56087bfb4ab89e058cd9f9d2459a06275559 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 31 Jan 2019 10:33:09 -0800 Subject: runsc: check whether a container is deleted or not before setupContainerFS PiperOrigin-RevId: 231811387 Change-Id: Ib143fb9a4d0fa1f105d1a3a3bd533dfc44e792af --- runsc/boot/loader.go | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f3dc15f00..973578484 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -430,6 +430,15 @@ func (l *Loader) run() error { } } + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: l.sandboxID} + ep, ok := l.processes[eid] + if !ok { + return fmt.Errorf("trying to start deleted container %q", l.sandboxID) + } + // Finally done with all configuration. Setup filters before user code // is loaded. if l.conf.DisableSeccomp { @@ -478,14 +487,6 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } - l.mu.Lock() - defer l.mu.Unlock() - - eid := execID{cid: l.sandboxID} - ep := l.processes[eid] - if ep == nil { - return fmt.Errorf("trying to start deleted container %q", l.sandboxID) - } ep.tg = l.k.GlobalInit() if l.console { ttyFile := l.rootProcArgs.FDMap.GetFile(0) @@ -524,6 +525,14 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("creating capabilities: %v", err) } + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; !ok { + return fmt.Errorf("trying to start a deleted container %q", cid) + } + // Convert the spec's additional GIDs to KGIDs. 
extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) for _, GID := range spec.Process.User.AdditionalGids { @@ -586,14 +595,6 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) } - l.mu.Lock() - defer l.mu.Unlock() - - eid := execID{cid: cid} - if _, ok := l.processes[eid]; !ok { - return fmt.Errorf("trying to start a deleted container %q", cid) - } - tg, _, err := l.k.CreateProcess(procArgs) if err != nil { return fmt.Errorf("creating process: %v", err) -- cgit v1.2.3 From 2a0c69b19f4b55c3f9777f0098a72af123ccff3c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 31 Jan 2019 11:11:44 -0800 Subject: Remove license comments Nothing reads them and they can simply get stale. Generated with: $ sed -i "s/licenses(\(.*\)).*/licenses(\1)/" **/BUILD PiperOrigin-RevId: 231818945 Change-Id: Ibc3f9838546b7e94f13f217060d31f4ada9d4bf0 --- pkg/abi/BUILD | 2 +- pkg/abi/linux/BUILD | 2 +- pkg/amutex/BUILD | 2 +- pkg/atomicbitops/BUILD | 2 +- pkg/binary/BUILD | 2 +- pkg/bits/BUILD | 2 +- pkg/bpf/BUILD | 2 +- pkg/compressio/BUILD | 2 +- pkg/control/client/BUILD | 2 +- pkg/control/server/BUILD | 2 +- pkg/cpuid/BUILD | 2 +- pkg/dhcp/BUILD | 2 +- pkg/eventchannel/BUILD | 2 +- pkg/fd/BUILD | 2 +- pkg/fdnotifier/BUILD | 2 +- pkg/gate/BUILD | 2 +- pkg/ilist/BUILD | 2 +- pkg/linewriter/BUILD | 2 +- pkg/log/BUILD | 2 +- pkg/metric/BUILD | 2 +- pkg/p9/BUILD | 2 +- pkg/p9/local_server/BUILD | 2 +- pkg/p9/p9test/BUILD | 2 +- pkg/rand/BUILD | 2 +- pkg/refs/BUILD | 2 +- pkg/seccomp/BUILD | 2 +- pkg/secio/BUILD | 2 +- pkg/segment/BUILD | 2 +- pkg/segment/test/BUILD | 2 +- pkg/sentry/BUILD | 2 +- pkg/sentry/arch/BUILD | 2 +- pkg/sentry/context/BUILD | 2 +- pkg/sentry/context/contexttest/BUILD | 2 +- pkg/sentry/control/BUILD | 2 +- pkg/sentry/device/BUILD | 2 +- pkg/sentry/fs/BUILD | 2 +- pkg/sentry/fs/anon/BUILD | 2 +- pkg/sentry/fs/ashmem/BUILD | 2 +- pkg/sentry/fs/binder/BUILD | 2 +- pkg/sentry/fs/dev/BUILD | 2 +- pkg/sentry/fs/fdpipe/BUILD | 2 +- pkg/sentry/fs/filetest/BUILD | 2 +- pkg/sentry/fs/fsutil/BUILD | 2 +- pkg/sentry/fs/gofer/BUILD | 2 +- pkg/sentry/fs/host/BUILD | 2 +- pkg/sentry/fs/lock/BUILD | 2 +- pkg/sentry/fs/proc/BUILD | 2 +- pkg/sentry/fs/proc/device/BUILD | 2 +- pkg/sentry/fs/proc/seqfile/BUILD | 2 +- pkg/sentry/fs/ramfs/BUILD | 2 +- pkg/sentry/fs/sys/BUILD | 2 +- pkg/sentry/fs/timerfd/BUILD | 2 +- pkg/sentry/fs/tmpfs/BUILD | 2 +- pkg/sentry/fs/tty/BUILD | 2 +- pkg/sentry/hostcpu/BUILD | 2 +- pkg/sentry/inet/BUILD | 2 +- pkg/sentry/kernel/BUILD | 2 +- pkg/sentry/kernel/auth/BUILD | 2 +- pkg/sentry/kernel/contexttest/BUILD | 2 +- pkg/sentry/kernel/epoll/BUILD | 2 +- pkg/sentry/kernel/eventfd/BUILD | 2 +- pkg/sentry/kernel/fasync/BUILD | 2 +- pkg/sentry/kernel/futex/BUILD | 2 +- pkg/sentry/kernel/kdefs/BUILD | 2 +- pkg/sentry/kernel/memevent/BUILD | 2 +- pkg/sentry/kernel/pipe/BUILD | 2 +- pkg/sentry/kernel/sched/BUILD | 2 +- pkg/sentry/kernel/semaphore/BUILD | 2 +- pkg/sentry/kernel/shm/BUILD | 2 +- pkg/sentry/kernel/time/BUILD | 2 +- pkg/sentry/limits/BUILD | 2 +- pkg/sentry/loader/BUILD | 2 +- pkg/sentry/memmap/BUILD | 2 +- pkg/sentry/memutil/BUILD | 2 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/platform/BUILD | 2 +- pkg/sentry/platform/filemem/BUILD | 2 +- pkg/sentry/platform/interrupt/BUILD | 2 +- pkg/sentry/platform/kvm/BUILD | 2 +- pkg/sentry/platform/kvm/testutil/BUILD | 2 +- pkg/sentry/platform/procid/BUILD | 2 +- pkg/sentry/platform/ptrace/BUILD | 2 +- 
pkg/sentry/platform/ring0/BUILD | 2 +- pkg/sentry/platform/ring0/gen_offsets/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/BUILD | 2 +- pkg/sentry/platform/safecopy/BUILD | 2 +- pkg/sentry/safemem/BUILD | 2 +- pkg/sentry/sighandling/BUILD | 2 +- pkg/sentry/socket/BUILD | 2 +- pkg/sentry/socket/control/BUILD | 2 +- pkg/sentry/socket/epsocket/BUILD | 2 +- pkg/sentry/socket/hostinet/BUILD | 2 +- pkg/sentry/socket/netlink/BUILD | 2 +- pkg/sentry/socket/netlink/port/BUILD | 2 +- pkg/sentry/socket/netlink/route/BUILD | 2 +- pkg/sentry/socket/rpcinet/BUILD | 2 +- pkg/sentry/socket/rpcinet/conn/BUILD | 2 +- pkg/sentry/socket/rpcinet/notifier/BUILD | 2 +- pkg/sentry/socket/unix/BUILD | 2 +- pkg/sentry/socket/unix/transport/BUILD | 2 +- pkg/sentry/state/BUILD | 2 +- pkg/sentry/strace/BUILD | 2 +- pkg/sentry/syscalls/BUILD | 2 +- pkg/sentry/syscalls/linux/BUILD | 2 +- pkg/sentry/time/BUILD | 2 +- pkg/sentry/unimpl/BUILD | 2 +- pkg/sentry/uniqueid/BUILD | 2 +- pkg/sentry/usage/BUILD | 2 +- pkg/sentry/usermem/BUILD | 2 +- pkg/sentry/watchdog/BUILD | 2 +- pkg/sleep/BUILD | 2 +- pkg/state/BUILD | 2 +- pkg/state/statefile/BUILD | 2 +- pkg/sync/BUILD | 2 +- pkg/sync/atomicptrtest/BUILD | 2 +- pkg/sync/seqatomictest/BUILD | 2 +- pkg/syserr/BUILD | 2 +- pkg/syserror/BUILD | 2 +- pkg/tcpip/BUILD | 2 +- pkg/tcpip/adapters/gonet/BUILD | 2 +- pkg/tcpip/buffer/BUILD | 2 +- pkg/tcpip/checker/BUILD | 2 +- pkg/tcpip/hash/jenkins/BUILD | 2 +- pkg/tcpip/header/BUILD | 2 +- pkg/tcpip/link/channel/BUILD | 2 +- pkg/tcpip/link/fdbased/BUILD | 2 +- pkg/tcpip/link/loopback/BUILD | 2 +- pkg/tcpip/link/rawfile/BUILD | 2 +- pkg/tcpip/link/sharedmem/BUILD | 2 +- pkg/tcpip/link/sharedmem/pipe/BUILD | 2 +- pkg/tcpip/link/sharedmem/queue/BUILD | 2 +- pkg/tcpip/link/sniffer/BUILD | 2 +- pkg/tcpip/link/tun/BUILD | 2 +- pkg/tcpip/link/waitable/BUILD | 2 +- pkg/tcpip/network/BUILD | 2 +- pkg/tcpip/network/arp/BUILD | 2 +- pkg/tcpip/network/fragmentation/BUILD | 2 +- pkg/tcpip/network/hash/BUILD | 2 +- pkg/tcpip/network/ipv4/BUILD | 2 +- pkg/tcpip/network/ipv6/BUILD | 2 +- pkg/tcpip/ports/BUILD | 2 +- pkg/tcpip/sample/tun_tcp_connect/BUILD | 2 +- pkg/tcpip/sample/tun_tcp_echo/BUILD | 2 +- pkg/tcpip/seqnum/BUILD | 2 +- pkg/tcpip/stack/BUILD | 2 +- pkg/tcpip/transport/ping/BUILD | 2 +- pkg/tcpip/transport/tcp/BUILD | 2 +- pkg/tcpip/transport/tcp/testing/context/BUILD | 2 +- pkg/tcpip/transport/tcpconntrack/BUILD | 2 +- pkg/tcpip/transport/udp/BUILD | 2 +- pkg/tmutex/BUILD | 2 +- pkg/unet/BUILD | 2 +- pkg/urpc/BUILD | 2 +- pkg/waiter/BUILD | 2 +- runsc/boot/BUILD | 2 +- runsc/boot/filter/BUILD | 2 +- runsc/cgroup/BUILD | 2 +- runsc/cmd/BUILD | 2 +- runsc/console/BUILD | 2 +- runsc/container/BUILD | 2 +- runsc/fsgofer/BUILD | 2 +- runsc/fsgofer/filter/BUILD | 2 +- runsc/sandbox/BUILD | 2 +- runsc/specutils/BUILD | 2 +- runsc/test/image/BUILD | 2 +- runsc/test/integration/BUILD | 2 +- runsc/test/root/BUILD | 2 +- runsc/test/root/testdata/BUILD | 2 +- runsc/test/testutil/BUILD | 2 +- runsc/tools/dockercfg/BUILD | 2 +- test/syscalls/BUILD | 2 +- test/syscalls/gtest/BUILD | 2 +- test/syscalls/linux/BUILD | 2 +- test/util/BUILD | 2 +- tools/go_generics/BUILD | 2 +- tools/go_generics/globals/BUILD | 2 +- tools/go_generics/go_merge/BUILD | 2 +- tools/go_generics/rules_tests/BUILD | 2 +- tools/go_stateify/BUILD | 2 +- vdso/BUILD | 2 +- 180 files changed, 180 insertions(+), 180 deletions(-) (limited to 'runsc') diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index 1ba4f3a46..323263ebf 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,4 +1,4 @@ 
-package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index e6043abf4..7648c9469 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -2,7 +2,7 @@ # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix # when the host OS may not be Linux. -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 7cda07418..bdb6e8f2c 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "amutex", diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index 235188531..9555bf645 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "atomicbitops", diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD index 571151f72..bd37376b0 100644 --- a/pkg/binary/BUILD +++ b/pkg/binary/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "binary", diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD index 46794bdb8..5214b2c24 100644 --- a/pkg/bits/BUILD +++ b/pkg/bits/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD index 564df3af5..3c7ae3103 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index 72952d735..3a0ac64e6 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "compressio", diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD index 32853875d..22a4a4a5a 100644 --- a/pkg/control/client/BUILD +++ b/pkg/control/client/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "client", diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index ba2b1be9f..76b2e9787 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "server", diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index 46fc4703b..29cc38778 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/dhcp/BUILD b/pkg/dhcp/BUILD index c97dfc14b..003620b48 100644 --- a/pkg/dhcp/BUILD +++ b/pkg/dhcp/BUILD @@ -1,6 +1,6 @@ 
load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "dhcp", diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 18348ef54..5c2a44aa1 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "eventchannel", diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD index 06cfd445e..ab1109157 100644 --- a/pkg/fd/BUILD +++ b/pkg/fd/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fd", diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD index 27d378d5b..8c8d193cc 100644 --- a/pkg/fdnotifier/BUILD +++ b/pkg/fdnotifier/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fdnotifier", diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index 9a87a3a31..83679f2da 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "gate", diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index a67aa2cff..dbd65ab12 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,7 +1,7 @@ load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ilist", diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index 3f28ba867..d1aa2e7d6 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "linewriter", diff --git a/pkg/log/BUILD b/pkg/log/BUILD index 94ac66db3..b2d18eddb 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "log", diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index d96e5563b..4b2c7a00e 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "metric", diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 2c224e65b..5d972309d 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -2,7 +2,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:public"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) go_library( diff --git a/pkg/p9/local_server/BUILD b/pkg/p9/local_server/BUILD index b17ebb79d..aa6db186c 100644 --- a/pkg/p9/local_server/BUILD +++ b/pkg/p9/local_server/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "local_server", diff --git 
a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index 7c4b875ce..cf22edde8 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) alias( name = "mockgen", diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index 0c9efc709..4eec3a4dd 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "rand", diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 98150ba8f..fc562f821 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index 657f923ed..0e9c4692d 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "victim", diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD index 29f751725..2b4b87c61 100644 --- a/pkg/secio/BUILD +++ b/pkg/secio/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "secio", diff --git a/pkg/segment/BUILD b/pkg/segment/BUILD index 964d73af8..700385907 100644 --- a/pkg/segment/BUILD +++ b/pkg/segment/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) load("//tools/go_generics:defs.bzl", "go_template") diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD index bdf53e24e..81e929b8c 100644 --- a/pkg/segment/test/BUILD +++ b/pkg/segment/test/BUILD @@ -2,7 +2,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:private"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD index d18cf3555..53989301f 100644 --- a/pkg/sentry/BUILD +++ b/pkg/sentry/BUILD @@ -1,7 +1,7 @@ # This BUILD file defines a package_group that allows for interdependencies for # sentry-internal packages. 
-package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) package_group( name = "internal", diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 9bf04360a..0c044bc33 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD index 02d24defd..a3c8d0177 100644 --- a/pkg/sentry/context/BUILD +++ b/pkg/sentry/context/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "context", diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 01bb40b04..bed156b70 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index c3b682d6f..f54e01ee8 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "control", diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index bebdb2939..01de708d3 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "device", diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 6f368b0da..e58333da3 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index 4bd912e95..2111df2e8 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "anon", diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index e5bb661b5..dcf620dca 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index 27155819e..8a448175f 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 85371032a..e5b962c8c 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git 
a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 8a0937cda..098463e97 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index d137fee4c..05ca72aa0 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index d4767642b..7dff970ea 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 35ffadd13..f2c79b475 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 6877eb161..ea2ca11bf 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 3159ff1da..7164744b8 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 74954f213..f6bc90634 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD index ff7dacf07..64b0c5a3a 100644 --- a/pkg/sentry/fs/proc/device/BUILD +++ b/pkg/sentry/fs/proc/device/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "device", diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index b4ba64e10..6b44c0075 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 4a629e38e..f36e4a5e8 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 7de928e16..42e98230e 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,4 +1,4 @@ 
-package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index ffdd7e0dc..0e06a5028 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index c5ec85460..bf5b68869 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 011cb6955..bee2db3f3 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index 33197cf14..b5067ae6d 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "hostcpu", diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 159c50efb..e288d34e9 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 7d41626dc..b230aff98 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index a81085372..abd4f2dae 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index 391986291..5769a3b28 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 5e8b36ed6..1567d5050 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index d96803fc9..f2f1a1223 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) 
load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 17749c0de..5faf95909 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index afd35985f..da24c36c1 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD index 3f8fa206c..38aaca134 100644 --- a/pkg/sentry/kernel/kdefs/BUILD +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "kdefs", diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index dfd8dd062..347a69062 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "memevent", diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 19b23c6d2..011a3f349 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 52e226a39..184e8a35b 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sched", diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index bdcf4ce5c..840943ca8 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 40e641355..f45770eef 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 5d8db2273..584f7c7cc 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 90f4395d4..800166675 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses 
= ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 24e734b49..1ea260a4e 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index c9e0b95a0..9c2cbd18b 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD index 88738d65d..68b03d4cc 100644 --- a/pkg/sentry/memutil/BUILD +++ b/pkg/sentry/memutil/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "memutil", diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 0997ec0a7..f679262d0 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index af9ba5394..ac8a6cb7f 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD index 2a5982763..1a61cfaa5 100644 --- a/pkg/sentry/platform/filemem/BUILD +++ b/pkg/sentry/platform/filemem/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index dbafa3204..eeccd4d0e 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "interrupt", diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 1b71e629f..6e40b3177 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index 1dffe94a4..e10087e8e 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "testutil", diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD index 
20c8bc02c..277509624 100644 --- a/pkg/sentry/platform/procid/BUILD +++ b/pkg/sentry/platform/procid/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "procid", diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 2eb354ad4..f86790942 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ptrace", diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index c35d49f2d..ecb3e9a9c 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index b76d7974e..d7029d5a9 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index de1b920af..fe93d3030 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index cb8347dd8..05a6a61ae 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0, portions BSD, MIT +package(licenses = ["notice"]) go_library( name = "safecopy", diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index 87a9bff12..3ab453718 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "safemem", diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index 41313d334..cec3af92e 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sighandling", diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 3a8044b5f..076f953e7 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index d3a63f15f..9f4763906 100644 --- a/pkg/sentry/socket/control/BUILD +++ 
b/pkg/sentry/socket/control/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index da4aaf510..45e418db3 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index b8dceb102..a469af7ac 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index cff922cb8..148306329 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 3a7dbc5ed..a7370a4ec 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index e1bcfe252..be0419679 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 06e121946..4da14a1e0 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "rpcinet", diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD index a16977f29..4336ae9b4 100644 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # BSD +package(licenses = ["notice"]) go_library( name = "conn", diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index 2bab01774..b0b107ddb 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # BSD +package(licenses = ["notice"]) go_library( name = "notifier", diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index a12fa93db..fe6871cc6 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 5a90837bc..5a2de0c4c 100644 --- 
a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index f1f6fdb7d..42c459acc 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "state", diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 8517db1ac..552e79686 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "strace", diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 35192ff49..6b5469e45 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "syscalls", diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 7621bfdbd..846601881 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 1191010e6..c4b6dcc63 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0, portions BSD +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index 42e24ace5..b608867a9 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) proto_library( name = "unimplemented_syscall_proto", diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index 0929497c3..ccc5a28d3 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "uniqueid", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index 868dfd400..09198496b 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index dae41ed0e..1a560b6f3 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/watchdog/BUILD 
b/pkg/sentry/watchdog/BUILD index b2c687b20..0bbf3705c 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "watchdog", diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index 338fd9336..2b005bf66 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sleep", diff --git a/pkg/state/BUILD b/pkg/state/BUILD index dd0f250fa..0a975e162 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index 66c8f3807..5967781e8 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "statefile", diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 6ddc6e812..1624e681c 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -2,7 +2,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0, portions BSD + licenses = ["notice"], ) load("//tools/go_generics:defs.bzl", "go_template") diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD index 9cb7f66fe..198fbb895 100644 --- a/pkg/sync/atomicptrtest/BUILD +++ b/pkg/sync/atomicptrtest/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD index 54f8e59b1..23132650a 100644 --- a/pkg/sync/seqatomictest/BUILD +++ b/pkg/sync/seqatomictest/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 30ae20772..0d65115ef 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "syserr", diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD index d4c6da97a..ac478d0ff 100644 --- a/pkg/syserror/BUILD +++ b/pkg/syserror/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "syserror", diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index daff9a0a0..83524cc8a 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index 723ad668f..ee2417238 100644 --- 
a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "gonet", diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index 11a725423..648d12cdf 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index a1de808b9..f597d0b24 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "checker", diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD index bbb764db8..ce2194a4d 100644 --- a/pkg/tcpip/hash/jenkins/BUILD +++ b/pkg/tcpip/hash/jenkins/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "jenkins", diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index 8e455fe1e..a5c7290ee 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index 25f6c1457..ae285e495 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "channel", diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index a4aa3feec..0d78c9b15 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fdbased", diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index a46ba7f11..710a05ede 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "loopback", diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index 2746d4ced..f01bb2c07 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "rawfile", diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index d7f1e66ef..dc8f1543e 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sharedmem", diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index 12e813509..85deafa38 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ 
b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "pipe", diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD index 661037bb2..d7dc631eb 100644 --- a/pkg/tcpip/link/sharedmem/queue/BUILD +++ b/pkg/tcpip/link/sharedmem/queue/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "queue", diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 52e237c25..7d0d1781e 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sniffer", diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index 5ec01cec9..e54852d3f 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "tun", diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index ba495c437..89a9eee23 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "waitable", diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index a2a07f533..f36f49453 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_test( name = "ip_test", diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index f6fb7daf7..ef18bb93d 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "arp", diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index aaabfcb9a..bf0a7b99c 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD index 401dce646..ea520c6ed 100644 --- a/pkg/tcpip/network/hash/BUILD +++ b/pkg/tcpip/network/hash/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "hash", diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index e72317e9f..7a5341def 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ipv4", diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index 
808c37df3..000e00dba 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ipv6", diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index a2fa9b84a..3ee80c62b 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ports", diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index 32baf2115..996939581 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "tun_tcp_connect", diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD index 760445843..dad8ef399 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/BUILD +++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "tun_tcp_echo", diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index c5c889239..a63665efc 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 8a598c57d..551c3c73e 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD index 982b6795c..4d4241d4b 100644 --- a/pkg/tcpip/transport/ping/BUILD +++ b/pkg/tcpip/transport/ping/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 726107739..e5c05f8c0 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD index 814e5c1ea..1584e4095 100644 --- a/pkg/tcpip/transport/tcp/testing/context/BUILD +++ b/pkg/tcpip/transport/tcp/testing/context/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "context", diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD index ac1a94d4d..31a845dee 100644 --- a/pkg/tcpip/transport/tcpconntrack/BUILD +++ b/pkg/tcpip/transport/tcpconntrack/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 
+package(licenses = ["notice"]) go_library( name = "tcpconntrack", diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 4225e28dc..8ccb79c48 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index c20df7005..69035044d 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "tmutex", diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index f90e43c89..5e177e78e 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "unet", diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index 21008cf6c..36cae67e1 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "urpc", diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 5e611c54f..b748246da 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 15a7cdae1..540e99151 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "boot", diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 004222242..3b6020cf3 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "filter", diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index 4f9a25a25..620d33a19 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "cgroup", diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index a908172af..9e2be0d37 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "cmd", diff --git a/runsc/console/BUILD b/runsc/console/BUILD index ff4ccff69..3ff9eba27 100644 --- a/runsc/console/BUILD +++ b/runsc/console/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "console", diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 354ce2661..3b25ff79a 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 
+package(licenses = ["notice"]) go_library( name = "container", diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 756c20ad7..4adc9c1bc 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fsgofer", diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index c7848d10c..78c5b526c 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "filter", diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 899fd99de..2ed793333 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sandbox", diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 77a10e2b6..372799850 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "specutils", diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index 22b3ebd2a..e8b629c6a 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_test( name = "image_test", diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index e7204dc66..779d30ec9 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_test( name = "integration_test", diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD index 77dcbd79e..75826a521 100644 --- a/runsc/test/root/BUILD +++ b/runsc/test/root/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "root", diff --git a/runsc/test/root/testdata/BUILD b/runsc/test/root/testdata/BUILD index 6c9fe0aea..7f272dcd3 100644 --- a/runsc/test/root/testdata/BUILD +++ b/runsc/test/root/testdata/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "testdata", diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 8c3919320..ddec81444 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "testutil", diff --git a/runsc/tools/dockercfg/BUILD b/runsc/tools/dockercfg/BUILD index a80b3abab..fd406ab93 100644 --- a/runsc/tools/dockercfg/BUILD +++ b/runsc/tools/dockercfg/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "dockercfg", diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 
8c391c8a6..148d9c366 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1,7 +1,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("//test/syscalls:build_defs.bzl", "syscall_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) syscall_test(test = "//test/syscalls/linux:32bit_test") diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD index d078fd3d5..22e061652 100644 --- a/test/syscalls/gtest/BUILD +++ b/test/syscalls/gtest/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "gtest", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index e70742875..a311ca12c 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) cc_binary( diff --git a/test/util/BUILD b/test/util/BUILD index f2e563507..fac0730b4 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) cc_library( diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 2d97d99dc..39318b877 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "go_generics", diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD index c26ac56d2..6628132f5 100644 --- a/tools/go_generics/globals/BUILD +++ b/tools/go_generics/globals/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "globals", diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD index a60437962..02b09120e 100644 --- a/tools/go_generics/go_merge/BUILD +++ b/tools/go_generics/go_merge/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "go_merge", diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD index 23b2d656d..a6f8cdd3c 100644 --- a/tools/go_generics/rules_tests/BUILD +++ b/tools/go_generics/rules_tests/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index 68d37f5d7..bb53f8ae9 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "stateify", diff --git a/vdso/BUILD b/vdso/BUILD index fd395511c..c43d24070 100644 --- a/vdso/BUILD +++ b/vdso/BUILD @@ -3,7 +3,7 @@ # normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses # timekeeping parameters managed by the sandbox kernel. 
-package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) genrule( name = "vdso", -- cgit v1.2.3 From 4e695adcd0c739101c3d50431ca18b1b911c9238 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 31 Jan 2019 15:17:50 -0800 Subject: gvisor/gofer: Use pivot_root instead of chroot PiperOrigin-RevId: 231864273 Change-Id: I8545b72b615f5c2945df374b801b80be64ec3e13 --- runsc/cmd/chroot.go | 45 ++++++++++++++------------- runsc/cmd/gofer.go | 69 ++++++++++++++++++++++++++++++++++-------- runsc/container/container.go | 46 +++++++++++++++++++++++++--- runsc/main.go | 4 +-- runsc/test/root/chroot_test.go | 5 +-- runsc/test/testutil/docker.go | 9 ------ 6 files changed, 124 insertions(+), 54 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index c1acbf26b..ed1dafef1 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -36,6 +36,29 @@ func mountInChroot(chroot, src, dst, typ string, flags uint32) error { return nil } +func pivotRoot(root string) error { + if err := os.Chdir(root); err != nil { + return fmt.Errorf("error changing working directory: %v", err) + } + // pivot_root(new_root, put_old) moves the root filesystem (old_root) + // of the calling process to the directory put_old and makes new_root + // the new root filesystem of the calling process. + // + // pivot_root(".", ".") makes a mount of the working directory the new + // root filesystem, so it will be moved in "/" and then the old_root + // will be moved to "/" too. The parent mount of the old_root will be + // new_root, so after umounting the old_root, we will see only + // the new_root in "/". + if err := syscall.PivotRoot(".", "."); err != nil { + return fmt.Errorf("error changing root filesystem: %v", err) + } + + if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil { + return fmt.Errorf("error umounting the old root file system: %v", err) + } + return nil +} + // setUpChroot creates an empty directory with runsc mounted at /runsc and proc // mounted at /proc. func setUpChroot(pidns bool) error { @@ -66,29 +89,9 @@ func setUpChroot(pidns bool) error { } } - if err := os.Chdir(chroot); err != nil { - return fmt.Errorf("error changing working directory: %v", err) - } - if err := syscall.Mount("", chroot, "", syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_BIND, ""); err != nil { return fmt.Errorf("error remounting chroot in read-only: %v", err) } - // pivot_root(new_root, put_old) moves the root filesystem (old_root) - // of the calling process to the directory put_old and makes new_root - // the new root filesystem of the calling process. - // - // pivot_root(".", ".") makes a mount of the working directory the new - // root filesystem, so it will be moved in "/" and then the old_root - // will be moved to "/" too. The parent mount of the old_root will be - // new_root, so after umounting the old_root, we will see only - // the new_root in "/". 
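For reference, the pivot_root(".", ".") sequence only works when the process is in its own mount namespace and the directory being pivoted into is itself a mount point. A minimal standalone sketch of those preconditions (illustrative only, not part of this change; must run as root inside an unshared mount namespace):

package example

import "syscall"

// pivotInto sketches the preconditions the pivotRoot helper relies on:
// propagation back to the host is disabled and newRoot is a mount point.
func pivotInto(newRoot string) error {
	// Keep mount events from propagating back to the parent namespace.
	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
		return err
	}
	// pivot_root requires new_root to be a mount point; a self bind mount
	// is the usual way to guarantee that for an ordinary directory.
	if err := syscall.Mount(newRoot, newRoot, "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
		return err
	}
	if err := syscall.Chdir(newRoot); err != nil {
		return err
	}
	// new_root and put_old are both ".", so the old root ends up stacked
	// under "/" and can simply be detached afterwards.
	if err := syscall.PivotRoot(".", "."); err != nil {
		return err
	}
	return syscall.Unmount(".", syscall.MNT_DETACH)
}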
- if err := syscall.PivotRoot(".", "."); err != nil { - return fmt.Errorf("error changing root filesystem: %v", err) - } - if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil { - return fmt.Errorf("error umounting the old root file system: %v", err) - } - - return nil + return pivotRoot(chroot) } diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 43286a2e5..6f9711518 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -26,6 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/fsgofer/filter" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -54,8 +55,10 @@ type Gofer struct { bundleDir string ioFDs intFlags applyCaps bool + setUpRoot bool panicOnWrite bool + specFD int } // Name implements subcommands.Command. @@ -79,43 +82,83 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") + f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") + f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 { f.Usage() return subcommands.ExitUsageError } + specFile := os.NewFile(uintptr(g.specFD), "spec file") + defer specFile.Close() + spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile) + if err != nil { + Fatalf("reading spec: %v", err) + } + + // Find what path is going to be served by this gofer. + root := spec.Root.Path + + conf := args[0].(*boot.Config) + + if g.setUpRoot && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + // Convert all shared mounts into slave to be sure that nothing will be + // propagated outside of our namespace. + if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + Fatalf("error converting mounts: %v", err) + } + + // FIXME: runsc can't be re-executed without + // /proc, so we create a tmpfs mount, mount ./proc and ./root + // there, then move this mount to the root and after + // setCapsAndCallSelf, runsc will chroot into /root. + // + // We need a directory to construct a new root and we know that + // runsc can't start without /proc, so we can use it for this. 
+ flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC) + if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil { + Fatalf("error mounting tmpfs: %v", err) + } + os.Mkdir("/proc/proc", 0755) + os.Mkdir("/proc/root", 0755) + if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil { + Fatalf("error mounting proc: %v", err) + } + if err := syscall.Mount(root, "/proc/root", "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + Fatalf("error mounting root: %v", err) + } + if err := pivotRoot("/proc"); err != nil { + Fatalf("faild to change the root file system: %v", err) + } + if err := os.Chdir("/"); err != nil { + Fatalf("failed to change working directory") + } + } + if g.applyCaps { // Disable caps when calling myself again. // Note: minimal argument handling for the default case to keep it simple. args := os.Args - args = append(args, "--apply-caps=false") + args = append(args, "--apply-caps=false", "--setup-root=false") if err := setCapsAndCallSelf(args, goferCaps); err != nil { Fatalf("Unable to apply caps: %v", err) } panic("unreachable") } - specFile, err := specutils.OpenCleanSpec(g.bundleDir) - if err != nil { - Fatalf("opening spec: %v", err) - } - spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile) - specFile.Close() - if err != nil { - Fatalf("reading spec: %v", err) - } specutils.LogSpec(spec) // fsgofer should run with a umask of 0, because we want to preserve file // modes exactly as sent by the sandbox, which will have applied its own umask. syscall.Umask(0) - // Find what path is going to be served by this gofer. - root := spec.Root.Path + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + root = "/root" + } if err := syscall.Chroot(root); err != nil { Fatalf("failed to chroot to %q: %v", root, err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 37969d8c5..08a3725f5 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -787,11 +787,50 @@ func (c *Container) waitForStopped() error { func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { // Start with the general config flags. args := conf.ToFlags() + + var goferEnds []*os.File + + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. + nextFD := 3 + + if conf.LogFilename != "" { + logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) + } + defer logFile.Close() + goferEnds = append(goferEnds, logFile) + args = append(args, "--log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.DebugLog != "" { + debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer") + if err != nil { + return nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) + } + defer debugLogFile.Close() + goferEnds = append(goferEnds, debugLogFile) + args = append(args, "--debug-log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + args = append(args, "gofer", "--bundle", bundleDir) if conf.Overlay { args = append(args, "--panic-on-write=true") } + // Open the spec file to donate to the sandbox. 
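The nextFD bookkeeping above follows the usual Go convention for donating descriptors to a child process: if the collected files are handed to the child in order (for example via exec.Cmd.ExtraFiles, used here only as a sketch; the actual plumbing lives elsewhere in runsc), the i-th donated file appears as descriptor 3+i in the child, which is why nextFD starts at 3 and is incremented once per donated file. The flag name below is hypothetical.

package example

import (
	"fmt"
	"os"
	"os/exec"
	"strconv"
)

// startChildWithDonatedFiles shows the numbering rule: ExtraFiles[i]
// becomes file descriptor 3+i in the child (0-2 are stdin/stdout/stderr).
func startChildWithDonatedFiles(bin string, donated []*os.File, args []string) error {
	nextFD := 3
	for range donated {
		args = append(args, "--some-fd="+strconv.Itoa(nextFD)) // hypothetical flag
		nextFD++
	}
	cmd := exec.Command(bin, args...)
	cmd.ExtraFiles = donated
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("starting %q: %v", bin, err)
	}
	return nil
}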
+ specFile, err := specutils.OpenCleanSpec(bundleDir) + if err != nil { + return nil, fmt.Errorf("opening spec file: %v", err) + } + defer specFile.Close() + goferEnds = append(goferEnds, specFile) + args = append(args, "--spec-fd="+strconv.Itoa(nextFD)) + nextFD++ + // Add root mount and then add any other additional mounts. mountCount := 1 @@ -802,12 +841,8 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund } } sandEnds := make([]*os.File, 0, mountCount) - goferEnds := make([]*os.File, 0, mountCount) - // nextFD is the next available file descriptor for the gofer process. - // It starts at 3 because 0-2 are used by stdin/stdout/stderr. - nextFD := 3 - for ; nextFD-3 < mountCount; nextFD++ { + for i := 0; i < mountCount; i++ { fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) if err != nil { return nil, err @@ -819,6 +854,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund goferEnds = append(goferEnds, goferEnd) args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + nextFD++ } binPath := specutils.ExePath diff --git a/runsc/main.go b/runsc/main.go index e036abc44..472839bf0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -179,8 +179,8 @@ func main() { // Quick sanity check to make sure no other commands get passed // a log fd (they should use log dir instead). - if subcommand != "boot" { - cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' command, but was passed to %q", subcommand) + if subcommand != "boot" && subcommand != "gofer" { + cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand) } // If we are the boot process, then we own our stdio FDs and diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 89f90c3e0..0deca0532 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -118,10 +118,7 @@ func TestChrootGofer(t *testing.T) { // This where the root directory is mapped on the host and that's where the // gofer must have chroot'd to. - root, err := d.RootDirInHost() - if err != nil { - t.Fatalf("Docker.RootDirInHost(): %v", err) - } + root := "/root" for _, child := range children { childPID, err := strconv.Atoi(child) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 9a76397be..5a92a5835 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -297,15 +297,6 @@ func (d *Docker) SandboxPid() (int, error) { return pid, nil } -// RootDirInHost returns where the root directory is mapped on the host. -func (d *Docker) RootDirInHost() (string, error) { - out, err := do("inspect", "-f={{.GraphDriver.Data.MergedDir}}", d.Name) - if err != nil { - return "", fmt.Errorf("error retrieving pid: %v", err) - } - return strings.TrimSuffix(string(out), "\n"), nil -} - // ID returns the container ID. func (d *Docker) ID() (string, error) { out, err := do("inspect", "-f={{.Id}}", d.Name) -- cgit v1.2.3 From 92e85623a0cd7b2043a79b757e1874a67796dea9 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 1 Feb 2019 15:22:22 -0800 Subject: Factor the subtargets method into a helper method with tests. 
PiperOrigin-RevId: 232047515 Change-Id: I00f036816e320356219be7b2f2e6d5fe57583a60 --- pkg/sentry/fs/path.go | 27 ++++++++++++++++ pkg/sentry/fs/path_test.go | 78 ++++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/fs.go | 14 ++------- 3 files changed, 107 insertions(+), 12 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go index 91a9a8ffd..52139b648 100644 --- a/pkg/sentry/fs/path.go +++ b/pkg/sentry/fs/path.go @@ -14,6 +14,11 @@ package fs +import ( + "path/filepath" + "strings" +) + // TrimTrailingSlashes trims any trailing slashes. // // The returned boolean indicates whether any changes were made. @@ -90,3 +95,25 @@ func SplitFirst(path string) (current, remainder string) { return current, remainder } } + +// IsSubpath checks whether the first path is a (strict) descendent of the +// second. If it is a subpath, then true is returned along with a clean +// relative path from the second path to the first. Otherwise false is +// returned. +func IsSubpath(subpath, path string) (string, bool) { + cleanPath := filepath.Clean(path) + cleanSubpath := filepath.Clean(subpath) + + // Add a trailing slash to the path if it does not already have one. + if len(cleanPath) == 0 || cleanPath[len(cleanPath)-1] != '/' { + cleanPath += "/" + } + if cleanPath == cleanSubpath { + // Paths are equal, thus not a strict subpath. + return "", false + } + if strings.HasPrefix(cleanSubpath, cleanPath) { + return strings.TrimPrefix(cleanSubpath, cleanPath), true + } + return "", false +} diff --git a/pkg/sentry/fs/path_test.go b/pkg/sentry/fs/path_test.go index 391b010a7..4ba1498f6 100644 --- a/pkg/sentry/fs/path_test.go +++ b/pkg/sentry/fs/path_test.go @@ -209,3 +209,81 @@ func TestSplitFirst(t *testing.T) { } } } + +// TestIsSubpath tests the IsSubpath method. +func TestIsSubpath(t *testing.T) { + tcs := []struct { + // Two absolute paths. + pathA string + pathB string + + // Whether pathA is a subpath of pathB. + wantIsSubpath bool + + // Relative path from pathA to pathB. Only checked if + // wantIsSubpath is true. + wantRelpath string + }{ + { + pathA: "/foo/bar/baz", + pathB: "/foo", + wantIsSubpath: true, + wantRelpath: "bar/baz", + }, + { + pathA: "/foo", + pathB: "/foo/bar/baz", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/foo", + wantIsSubpath: false, + }, + { + pathA: "/foobar", + pathB: "/foo", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/foobar", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/foobar", + wantIsSubpath: false, + }, + { + pathA: "/", + pathB: "/foo", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/", + wantIsSubpath: true, + wantRelpath: "foo", + }, + { + pathA: "/foo/bar/../bar", + pathB: "/foo", + wantIsSubpath: true, + wantRelpath: "bar", + }, + { + pathA: "/foo/bar", + pathB: "/foo/../foo", + wantIsSubpath: true, + wantRelpath: "bar", + }, + } + + for _, tc := range tcs { + gotRelpath, gotIsSubpath := IsSubpath(tc.pathA, tc.pathB) + if gotRelpath != tc.wantRelpath || gotIsSubpath != tc.wantIsSubpath { + t.Errorf("IsSubpath(%q, %q) got %q %t, want %q %t", tc.pathA, tc.pathB, gotRelpath, gotIsSubpath, tc.wantRelpath, tc.wantIsSubpath) + } + } +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 5c5e650ca..ada292c9e 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -515,20 +515,10 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string // subtargets takes a set of Mounts and returns only the targets that are // children of the given root. 
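A short usage sketch of the new helper, with made-up paths, showing the strict-descendant behavior exercised by the tests above:

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
)

func main() {
	// A mount destination expressed relative to a container root.
	rel, ok := fs.IsSubpath("/foo/bar/../bar", "/foo")
	fmt.Println(rel, ok) // "bar" true

	// Equal paths are not a strict subpath, so the root mount itself
	// is filtered out.
	rel, ok = fs.IsSubpath("/foo", "/foo")
	fmt.Println(rel, ok) // "" false
}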
The returned paths are relative to the root. func subtargets(root string, mnts []specs.Mount) []string { - r := filepath.Clean(root) - if len(r) > 0 && r[len(r)-1] != '/' { - r += "/" - } var targets []string for _, mnt := range mnts { - t := filepath.Clean(mnt.Destination) - if strings.HasPrefix(t, r) { - // Make the mnt path relative to the root path. If the - // result is empty, then mnt IS the root mount, not a - // submount. We don't want to include those. - if t := strings.TrimPrefix(t, r); t != "" { - targets = append(targets, t) - } + if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath { + targets = append(targets, relPath) } } return targets -- cgit v1.2.3 From e0b3d3323fbb4b27280f0087427bb04c3e71238b Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Wed, 13 Feb 2019 14:52:06 -0800 Subject: Add support for using PACKET_RX_RING to receive packets. PACKET_RX_RING allows the use of an mmapped buffer to receive packets from the kernel. This should cut down the number of host syscalls that need to be made to receive packets when the underlying fd is a socket of the AF_PACKET type. PiperOrigin-RevId: 233834998 Change-Id: I8060025c6ced206986e94cc46b8f382b81bfa47f --- pkg/tcpip/link/fdbased/BUILD | 7 +- pkg/tcpip/link/fdbased/endpoint.go | 100 +++++++--- pkg/tcpip/link/fdbased/mmap.go | 33 ++++ pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go | 210 +++++++++++++++++++++ pkg/tcpip/link/rawfile/blockingpoll_amd64.s | 6 +- .../link/rawfile/blockingpoll_amd64_unsafe.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe.go | 4 +- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 33 ++-- runsc/boot/network.go | 12 +- 9 files changed, 349 insertions(+), 58 deletions(-) create mode 100644 pkg/tcpip/link/fdbased/mmap.go create mode 100644 pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 0d78c9b15..bcf9c023e 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -4,7 +4,11 @@ package(licenses = ["notice"]) go_library( name = "fdbased", - srcs = ["endpoint.go"], + srcs = [ + "endpoint.go", + "mmap.go", + "mmap_amd64_unsafe.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased", visibility = [ "//visibility:public", @@ -15,6 +19,7 @@ go_library( "//pkg/tcpip/header", "//pkg/tcpip/link/rawfile", "//pkg/tcpip/stack", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 87c8ab1fc..20f379ab0 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -47,6 +47,30 @@ var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} // NetworkDispatcher. type linkDispatcher func() (bool, *tcpip.Error) +// PacketDispatchMode are the various supported methods of receiving and +// dispatching packets from the underlying FD. +type PacketDispatchMode int + +const ( + // Readv is the default dispatch mode and is the least performant of the + // dispatch options but the one that is supported by all underlying FD + // types. + Readv PacketDispatchMode = iota + // RecvMMsg enables use of recvmmsg() syscall instead of readv() to + // read inbound packets. This reduces # of syscalls needed to process + // packets. + // + // NOTE: recvmmsg() is only supported for sockets, so if the underlying + // FD is not a socket then the code will still fall back to the readv() + // path. 
+ RecvMMsg + // PacketMMap enables use of PACKET_RX_RING to receive packets from the + // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The + // primary use-case for this is runsc which uses an AF_PACKET FD to + // receive packets from the veth device. + PacketMMap +) + type endpoint struct { // fd is the file descriptor used to send and receive packets. fd int @@ -68,9 +92,11 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - views [][]buffer.View - iovecs [][]syscall.Iovec - msgHdrs []rawfile.MMsgHdr + views [][]buffer.View + iovecs [][]syscall.Iovec + // msgHdrs is only used by the RecvMMsg dispatcher. + msgHdrs []rawfile.MMsgHdr + inboundDispatcher linkDispatcher dispatcher stack.NetworkDispatcher @@ -79,28 +105,31 @@ type endpoint struct { // endpoint (false). handleLocal bool - // useRecvMMsg enables use of recvmmsg() syscall instead of readv() to - // read inbound packets. This reduces # of syscalls needed to process - // packets. - // - // NOTE: recvmmsg() is only supported for sockets, so if the underlying - // FD is not a socket then the code will still fall back to the readv() - // path. - useRecvMMsg bool + // packetDispatchMode controls the packet dispatcher used by this + // endpoint. + packetDispatchMode PacketDispatchMode + + // ringBuffer is only used when PacketMMap dispatcher is used and points + // to the start of the mmapped PACKET_RX_RING buffer. + ringBuffer []byte + + // ringOffset is the current offset into the ring buffer where the next + // inbound packet will be placed by the kernel. + ringOffset int } // Options specify the details about the fd-based endpoint to be created. type Options struct { - FD int - MTU uint32 - EthernetHeader bool - ChecksumOffload bool - ClosedFunc func(*tcpip.Error) - Address tcpip.LinkAddress - SaveRestore bool - DisconnectOk bool - HandleLocal bool - UseRecvMMsg bool + FD int + MTU uint32 + EthernetHeader bool + ChecksumOffload bool + ClosedFunc func(*tcpip.Error) + Address tcpip.LinkAddress + SaveRestore bool + DisconnectOk bool + HandleLocal bool + PacketDispatchMode PacketDispatchMode } // New creates a new fd-based endpoint. @@ -133,21 +162,31 @@ func New(opts *Options) tcpip.LinkEndpointID { } e := &endpoint{ - fd: opts.FD, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: hdrSize, - handleLocal: opts.HandleLocal, - useRecvMMsg: opts.UseRecvMMsg, + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + handleLocal: opts.HandleLocal, + packetDispatchMode: opts.PacketDispatchMode, + } + + if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap { + if err := e.setupPacketRXRing(); err != nil { + // TODO: replace panic with an error return. + panic(fmt.Sprintf("e.setupPacketRXRing failed: %v", err)) + } + e.inboundDispatcher = e.packetMMapDispatch + return stack.RegisterLinkEndpoint(e) } + // For non-socket FDs we read one packet a time (e.g. TAP devices) msgsPerRecv := 1 e.inboundDispatcher = e.dispatch // If the provided FD is a socket then we optimize packet reads by // using recvmmsg() instead of read() to read packets in a batch. 
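Callers choose a dispatcher through the new Options field. A sketch of the intended use for an AF_PACKET descriptor bound to a veth device (this mirrors the runsc/boot/network.go change later in this patch):

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
)

// newVethEndpoint selects the PACKET_RX_RING dispatcher; fd is assumed to
// be an AF_PACKET socket bound to a veth device.
func newVethEndpoint(fd int, mtu uint32, mac tcpip.LinkAddress) tcpip.LinkEndpointID {
	return fdbased.New(&fdbased.Options{
		FD:             fd,
		MTU:            mtu,
		EthernetHeader: true,
		HandleLocal:    true,
		Address:        mac,
		// PacketMMap requires an AF_PACKET socket; non-socket FDs fall
		// back to the plain readv() dispatcher.
		PacketDispatchMode: fdbased.PacketMMap,
	})
}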
- if isSocketFD(opts.FD) && e.useRecvMMsg { + if isSocketFD(opts.FD) && e.packetDispatchMode == RecvMMsg { e.inboundDispatcher = e.recvMMsgDispatch msgsPerRecv = MaxMsgsPerRecv } @@ -165,6 +204,7 @@ func New(opts *Options) tcpip.LinkEndpointID { e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0] e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig)) } + return stack.RegisterLinkEndpoint(e) } diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go new file mode 100644 index 000000000..f1e71c233 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -0,0 +1,33 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !linux !amd64 + +package fdbased + +import "gvisor.googlesource.com/gvisor/pkg/tcpip" + +// Stubbed out versions for non-linux/non-amd64 platforms. + +func (e *endpoint) setupPacketRXRing() error { + return nil +} + +func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { + return nil, nil +} + +func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { + return false, nil +} diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go new file mode 100644 index 000000000..d88c3f8a5 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go @@ -0,0 +1,210 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux,amd64 + +package fdbased + +import ( + "encoding/binary" + "fmt" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" +) + +const ( + tPacketAlignment = uintptr(16) + tpStatusKernel = 0 + tpStatusUser = 1 + tpStatusCopy = 2 + tpStatusLosing = 4 +) + +// We overallocate the frame size to accommodate space for the +// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding. +// +// NOTE: Frames need to be aligned at 16 byte boundaries. +const ( + tpFrameSize = 65536 + 128 + tpBlockSize = tpFrameSize * 128 + tpBlockNR = 10 + tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize +) + +// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct +// translation of the TPACKET_ALIGN macro in . +func tPacketAlign(v uintptr) uintptr { + return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1)) +} + +// tPacketHdrlen is the TPACKET_HDRLEN variable defined in . 
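As a worked example of the alignment rule, tPacketAlign rounds 50 up to 64 and leaves 64 unchanged, since frame fields must start on 16-byte boundaries. A tiny self-contained check (illustrative only):

package main

import "fmt"

// align mirrors the TPACKET_ALIGN macro used above: round v up to the next
// multiple of 16.
func align(v uintptr) uintptr {
	const alignment = uintptr(16)
	return (v + alignment - 1) &^ (alignment - 1)
}

func main() {
	fmt.Println(align(50)) // 64
	fmt.Println(align(64)) // 64
}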
+var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{})) + +// tPacketReq is the tpacket_req structure as described in +// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt +type tPacketReq struct { + tpBlockSize uint32 + tpBlockNR uint32 + tpFrameSize uint32 + tpFrameNR uint32 +} + +// tPacketHdr is tpacket_hdr structure as described in +type tPacketHdr []byte + +const ( + tpStatusOffset = 0 + tpLenOffset = 8 + tpSnapLenOffset = 12 + tpMacOffset = 16 + tpNetOffset = 18 + tpSecOffset = 20 + tpUSecOffset = 24 +) + +func (t tPacketHdr) tpStatus() uint32 { + return binary.LittleEndian.Uint32(t[tpStatusOffset:]) +} + +func (t tPacketHdr) setTPStatus(status uint32) { + binary.LittleEndian.PutUint32(t[tpStatusOffset:], status) +} + +func (t tPacketHdr) tpLen() uint32 { + return binary.LittleEndian.Uint32(t[tpLenOffset:]) +} + +func (t tPacketHdr) tpSnapLen() uint32 { + return binary.LittleEndian.Uint32(t[tpSnapLenOffset:]) +} + +func (t tPacketHdr) tpMac() uint16 { + return binary.LittleEndian.Uint16(t[tpMacOffset:]) +} + +func (t tPacketHdr) tpNet() uint16 { + return binary.LittleEndian.Uint16(t[tpNetOffset:]) +} + +func (t tPacketHdr) tpSec() uint32 { + return binary.LittleEndian.Uint32(t[tpSecOffset:]) +} + +func (t tPacketHdr) tpUSec() uint32 { + return binary.LittleEndian.Uint32(t[tpUSecOffset:]) +} + +func (t tPacketHdr) Payload() []byte { + return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()] +} + +func (e *endpoint) setupPacketRXRing() error { + tReq := tPacketReq{ + tpBlockSize: uint32(tpBlockSize), + tpBlockNR: uint32(tpBlockNR), + tpFrameSize: uint32(tpFrameSize), + tpFrameNR: uint32(tpFrameNR), + } + // Setup PACKET_RX_RING. + if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil { + return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err) + } + // Let's mmap the blocks. + sz := tpBlockSize * tpBlockNR + buf, err := syscall.Mmap(e.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err) + } + e.ringBuffer = buf + return nil +} + +func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { + hdr := (tPacketHdr)(e.ringBuffer[0+e.ringOffset*tpFrameSize:]) + for (hdr.tpStatus() & tpStatusUser) == 0 { + event := rawfile.PollEvent{ + FD: int32(e.fd), + Events: unix.POLLIN | unix.POLLERR, + } + _, errno := rawfile.BlockingPoll(&event, 1, -1) + if errno != 0 { + if errno == syscall.EINTR { + continue + } + return nil, rawfile.TranslateErrno(errno) + } + if hdr.tpStatus()&tpStatusCopy != 0 { + continue + } + if hdr.tpStatus()&tpStatusLosing != 0 { + continue + } + } + + // Copy out the packet from the mmapped frame to a locally owned buffer. + pkt := make([]byte, hdr.tpSnapLen()) + copy(pkt, hdr.Payload()) + // Release packet to kernel. + hdr.setTPStatus(tpStatusKernel) + e.ringOffset = (e.ringOffset + 1) % tpFrameNR + return pkt, nil +} + +// packetMMapDispatch reads packets from an mmaped ring buffer and dispatches +// them to the network stack. 
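With the constants above, each frame is 65536+128 = 65664 bytes, a block holds 128 frames (8,404,992 bytes), and 10 blocks give 1280 frames, roughly 80 MiB of mapped memory. A sketch of how a frame index maps into the ring that readMMappedPacket walks (illustrative only):

package example

// Ring geometry implied by the constants above.
const (
	frameSize = 65536 + 128                      // tpFrameSize
	blockSize = frameSize * 128                  // tpBlockSize
	numBlocks = 10                               // tpBlockNR
	numFrames = blockSize * numBlocks / frameSize // 1280
)

// frame returns the bytes backing frame i of the mmapped buffer; the
// dispatcher visits these slots in order and wraps at numFrames.
func frame(ring []byte, i int) []byte {
	start := i * frameSize
	return ring[start : start+frameSize]
}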
+func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { + pkt, err := e.readMMappedPacket() + if err != nil { + return false, err + } + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if e.hdrSize > 0 { + eth := header.Ethernet(pkt) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(pkt) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + pkt = pkt[e.hdrSize:] + e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)})) + return true, nil +} + +func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error { + if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 { + return error(errno) + } + + return nil +} diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index 8e22ba661..9dade5421 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -14,12 +14,12 @@ #include "textflag.h" -// blockingPoll makes the poll() syscall while calling the version of +// BlockingPoll makes the poll() syscall while calling the version of // entersyscall that relinquishes the P so that other Gs can run. This is meant // to be called in cases when the syscall is expected to block. // -// func blockingPoll(fds *pollEvent, nfds int, timeout int64) (n int, err syscall.Errno) -TEXT ·blockingPoll(SB),NOSPLIT,$0-40 +// func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (n int, err syscall.Errno) +TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 CALL ·callEntersyscallblock(SB) MOVQ fds+0(FP), DI MOVQ nfds+8(FP), SI diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go index 93479cd0d..3ba96a123 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go @@ -25,7 +25,7 @@ import ( ) //go:noescape -func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) +func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) // Use go:linkname to call into the runtime. As of Go 1.12 this has to // be done from Go code so that we make an ABIInternal call to an diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index 6a3e956ad..94ddad8ea 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -21,7 +21,9 @@ import ( "unsafe" ) -func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) { +// BlockingPoll is just a stub function that forwards to the poll() system call +// on non-amd64 platforms. 
+func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) { n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) return int(n), e } diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 5deea093a..5d36ebe57 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -94,10 +94,11 @@ func NonBlockingWrite2(fd int, b1, b2 []byte) *tcpip.Error { return nil } -type pollEvent struct { - fd int32 - events int16 - revents int16 +// PollEvent represents the pollfd structure passed to a poll() system call. +type PollEvent struct { + FD int32 + Events int16 + Revents int16 } // BlockingRead reads from a file descriptor that is set up as non-blocking. If @@ -110,12 +111,12 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - _, e = blockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, -1) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -132,12 +133,12 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - _, e = blockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, -1) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -162,12 +163,12 @@ func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - if _, e := blockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { + if _, e := BlockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 83d56f93a..0cadf48d6 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -135,12 +135,12 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct mac := tcpip.LinkAddress(generateRndMac()) linkEP := fdbased.New(&fdbased.Options{ - FD: newFD, - MTU: uint32(link.MTU), - EthernetHeader: true, - HandleLocal: true, - Address: mac, - UseRecvMMsg: true, + FD: newFD, + MTU: uint32(link.MTU), + EthernetHeader: true, + HandleLocal: true, + Address: mac, + PacketDispatchMode: fdbased.PacketMMap, }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) -- cgit v1.2.3 From 0a41ea72c1f70916bdbb68d9fdfa6c438e28b5b2 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 14 Feb 2019 15:46:25 -0800 Subject: Don't allow writing or reading to TTY unless process group is in foreground. If a background process tries to read from a TTY, linux sends it a SIGTTIN unless the signal is blocked or ignored, or the process group is an orphan, in which case the syscall returns EIO. See drivers/tty/n_tty.c:n_tty_read()=>job_control(). If a background process tries to write a TTY, set the termios, or set the foreground process group, linux then sends a SIGTTOU. If the signal is ignored or blocked, linux allows the write. If the process group is an orphan, the syscall returns EIO. See drivers/tty/tty_io.c:tty_check_change(). 
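The semantics described above can be observed from ordinary userspace. A small illustrative program (not part of this change) that, when run as a background job from an interactive shell with its stdin still attached to the terminal, ignores SIGTTIN and therefore gets EIO from the read instead of being stopped:

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	// With SIGTTIN ignored, a background read from the controlling TTY
	// cannot stop the process, so the kernel fails the read with EIO.
	signal.Ignore(syscall.SIGTTIN)

	buf := make([]byte, 1)
	_, err := os.Stdin.Read(buf)
	fmt.Fprintf(os.Stderr, "read from TTY: %v\n", err) // expect EIO when backgrounded
}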
PiperOrigin-RevId: 234044367 Change-Id: I009461352ac4f3f11c5d42c43ac36bb0caa580f9 --- pkg/sentry/control/proc.go | 14 ++- pkg/sentry/fs/host/tty.go | 183 ++++++++++++++++++++++++++++++----- pkg/sentry/kernel/kernel.go | 44 +++------ pkg/sentry/kernel/sessions.go | 29 ++++++ pkg/sentry/kernel/signal_handlers.go | 8 ++ runsc/boot/loader.go | 14 ++- 6 files changed, 234 insertions(+), 58 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 923399fb2..e848def14 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -222,10 +222,18 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI return nil, 0, nil, err } - if ttyFile == nil { - return tg, tid, nil, nil + var ttyFileOps *host.TTYFileOperations + if ttyFile != nil { + // Set the foreground process group on the TTY before starting + // the process. + ttyFileOps = ttyFile.FileOperations.(*host.TTYFileOperations) + ttyFileOps.InitForegroundProcessGroup(tg.ProcessGroup()) } - return tg, tid, ttyFile.FileOperations.(*host.TTYFileOperations), nil + + // Start the newly created process. + proc.Kernel.StartProcess(tg) + + return tg, tid, ttyFileOps, nil } // PsArgs is the set of arguments to ps. diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index ac6ad1b87..21db0086e 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -37,8 +37,11 @@ type TTYFileOperations struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` - // FGProcessGroup is the foreground process group this TTY. Will be - // nil if not set or if this file has been released. + // session is the session attached to this TTYFileOperations. + session *kernel.Session + + // fgProcessGroup is the foreground process group that is currently + // connected to this TTY. fgProcessGroup *kernel.ProcessGroup } @@ -49,15 +52,58 @@ func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops }) } -// ForegroundProcessGroup returns the foreground process for the TTY. This will -// be nil if the foreground process has not been set or if the file has been -// released. +// InitForegroundProcessGroup sets the foreground process group and session for +// the TTY. This should only be called once, after the foreground process group +// has been created, but before it has started running. +func (t *TTYFileOperations) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { + t.mu.Lock() + defer t.mu.Unlock() + if t.fgProcessGroup != nil { + panic("foreground process group is already set") + } + t.fgProcessGroup = pg + t.session = pg.Session() +} + +// ForegroundProcessGroup returns the foreground process for the TTY. func (t *TTYFileOperations) ForegroundProcessGroup() *kernel.ProcessGroup { t.mu.Lock() defer t.mu.Unlock() return t.fgProcessGroup } +// Read implements fs.FileOperations.Read. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +// +// See drivers/tty/n_tty.c:n_tty_read()=>job_control(). +func (t *TTYFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. 
+ return t.fileOperations.Read(ctx, file, dst, offset) +} + +// Write implements fs.FileOperations.Write. +func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the write? + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + return t.fileOperations.Write(ctx, file, src, offset) +} + // Release implements fs.FileOperations.Release. func (t *TTYFileOperations) Release() { t.mu.Lock() @@ -84,6 +130,13 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. return 0, err case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + t.mu.Lock() + defer t.mu.Unlock() + + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + var termios linux.Termios if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ AddressSpaceActive: true, @@ -99,20 +152,17 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. // Get the process group ID of the foreground process group on // this terminal. + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return 0, syserror.ENOTTY + } + t.mu.Lock() defer t.mu.Unlock() - if t.fgProcessGroup == nil { - // No process group has been set yet. Let's just lie - // and tell it the process group from the current task. - // The app is probably going to set it to something - // else very soon anyways. - t.fgProcessGroup = kernel.TaskFromContext(ctx).ThreadGroup().ProcessGroup() - } - // Map the ProcessGroup into a ProcessGroupID in the task's PID // namespace. - pgID := kernel.TaskFromContext(ctx).ThreadGroup().PIDNamespace().IDOfProcessGroup(t.fgProcessGroup) + pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -123,6 +173,30 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Check that we are allowed to set the process group. + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from + // tty_check_change() to -ENOTTY. + if err == syserror.EIO { + return 0, syserror.ENOTTY + } + return 0, err + } + + // Check that calling task's process group is in the TTY + // session. + if task.ThreadGroup().Session() != t.session { + return 0, syserror.ENOTTY + } + var pgID kernel.ProcessGroupID if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ AddressSpaceActive: true, @@ -136,24 +210,18 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. } // Process group with pgID must exist in this PID namespace. - task := kernel.TaskFromContext(ctx) pidns := task.PIDNamespace() pg := pidns.ProcessGroupWithID(pgID) if pg == nil { return 0, syserror.ESRCH } - // Process group must be in same session as calling task's - // process group. - curSession := task.ThreadGroup().ProcessGroup().Session() - curSessionID := pidns.IDOfSession(curSession) - if pidns.IDOfSession(pg.Session()) != curSessionID { + // Check that new process group is in the TTY session. 
+ if pg.Session() != t.session { return 0, syserror.EPERM } - t.mu.Lock() t.fgProcessGroup = pg - t.mu.Unlock() return 0, nil case linux.TIOCGWINSZ: @@ -171,6 +239,10 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. case linux.TIOCSWINSZ: // Args: const struct winsize *argp // Set window size. + + // Unlike setting the termios, any process group (even + // background ones) can set the winsize. + var winsize linux.Winsize if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ AddressSpaceActive: true, @@ -213,3 +285,70 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. return 0, syserror.ENOTTY } } + +// checkChange checks that the process group is allowed to read, write, or +// change the state of the TTY. +// +// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic +// is a bit convoluted, but documented inline. +// +// Preconditions: t.mu must be held. +func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + // No task? Linux does not have an analog for this case, but + // tty_check_change is more of a blacklist of cases than a + // whitelist, and is surprisingly permissive. Allowing the + // change seems most appropriate. + return nil + } + + tg := task.ThreadGroup() + pg := tg.ProcessGroup() + + // If the session for the task is different than the session for the + // controlling TTY, then the change is allowed. Seems like a bad idea, + // but that's exactly what linux does. + if tg.Session() != t.fgProcessGroup.Session() { + return nil + } + + // If we are the foreground process group, then the change is allowed. + if pg == t.fgProcessGroup { + return nil + } + + // We are not the foreground process group. + + // Is the provided signal blocked or ignored? + if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { + // If the signal is SIGTTIN, then we are attempting to read + // from the TTY. Don't send the signal and return EIO. + if sig == linux.SIGTTIN { + return syserror.EIO + } + + // Otherwise, we are writing or changing terminal state. This is allowed. + return nil + } + + // If the process group is an orphan, return EIO. + if pg.IsOrphan() { + return syserror.EIO + } + + // Otherwise, send the signal to the process group and return ERESTARTSYS. + // + // Note that Linux also unconditionally sets TIF_SIGPENDING on current, + // but this isn't necessary in gVisor because the rationale given in + // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't + // apply: the sentry will handle -ERESTARTSYS in + // kernel.runApp.execute() even if the kernel.Task isn't interrupted. + si := arch.SignalInfo{ + Code: arch.SignalInfoKernel, + Signo: int32(sig), + } + // Linux ignores the result of kill_pgrp(). + _ = pg.SendSignal(&si) + return kernel.ERESTARTSYS +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index e7e5ff777..c6afae2e6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -615,8 +615,11 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { // CreateProcess creates a new task in a new thread group with the given // options. The new task has no parent and is in the root PID namespace. // -// If k.Start() has already been called, the created task will begin running -// immediately. Otherwise, it will be started when k.Start() is called. 
+// If k.Start() has already been called, then the created process must be +// started by calling kernel.StartProcess(tg). +// +// If k.Start() has not yet been called, then the created task will begin +// running when k.Start() is called. // // CreateProcess has no analogue in Linux; it is used to create the initial // application task, as well as processes started by the control server. @@ -688,22 +691,25 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, AbstractSocketNamespace: args.AbstractSocketNamespace, ContainerID: args.ContainerID, } - t, err := k.tasks.NewTask(config) - if err != nil { + if _, err := k.tasks.NewTask(config); err != nil { return nil, 0, err } // Success. tgid := k.tasks.Root.IDOfThreadGroup(tg) - if k.started { - tid := k.tasks.Root.IDOfTask(t) - t.Start(tid) - } else if k.globalInit == nil { + if k.globalInit == nil { k.globalInit = tg } return tg, tgid, nil } +// StartProcess starts running a process that was created with CreateProcess. +func (k *Kernel) StartProcess(tg *ThreadGroup) { + t := tg.Leader() + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) +} + // Start starts execution of all tasks in k. // // Preconditions: Start may be called exactly once. @@ -866,28 +872,6 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { return lastErr } -// SendProcessGroupSignal sends a signal to all processes inside the process -// group. It is analagous to kernel/signal.c:kill_pgrp. -func (k *Kernel) SendProcessGroupSignal(pg *ProcessGroup, info *arch.SignalInfo) error { - k.extMu.Lock() - defer k.extMu.Unlock() - k.tasks.mu.RLock() - defer k.tasks.mu.RUnlock() - - var lastErr error - for t := range k.tasks.Root.tids { - if t == t.tg.leader && t.tg.ProcessGroup() == pg { - t.tg.signalHandlers.mu.Lock() - defer t.tg.signalHandlers.mu.Unlock() - infoCopy := *info - if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { - lastErr = err - } - } - } - return lastErr -} - // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 78a5b4063..6fd65f2b0 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -119,6 +120,13 @@ func (pg *ProcessGroup) Originator() *ThreadGroup { return pg.originator } +// IsOrphan returns true if this process group is an orphan. +func (pg *ProcessGroup) IsOrphan() bool { + pg.originator.TaskSet().mu.RLock() + defer pg.originator.TaskSet().mu.RUnlock() + return pg.ancestors == 0 +} + // incRefWithParent grabs a reference. // // This function is called when this ProcessGroup is being associated with some @@ -224,6 +232,27 @@ func (pg *ProcessGroup) Session() *Session { return pg.session } +// SendSignal sends a signal to all processes inside the process group. It is +// analagous to kernel/signal.c:kill_pgrp. 
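Callers now create and start a process in two steps, which leaves a window to configure the thread group before its first task runs. A sketch of the calling pattern, mirroring the control/proc.go and runsc/boot/loader.go changes in this commit:

package example

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
)

// startWithTTY creates the thread group, configures it while none of its
// tasks are running, then starts it.
func startWithTTY(k *kernel.Kernel, args kernel.CreateProcessArgs, tty *host.TTYFileOperations) (*kernel.ThreadGroup, error) {
	tg, _, err := k.CreateProcess(args)
	if err != nil {
		return nil, fmt.Errorf("creating process: %v", err)
	}
	if tty != nil {
		// Nothing is running yet, so it is safe to install the
		// foreground process group on the controlling TTY.
		tty.InitForegroundProcessGroup(tg.ProcessGroup())
	}
	k.StartProcess(tg)
	return tg, nil
}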
+func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { + tasks := pg.originator.TaskSet() + tasks.mu.RLock() + defer tasks.mu.RUnlock() + + var lastErr error + for t := range tasks.Root.tids { + if t == t.tg.leader && t.tg.ProcessGroup() == pg { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + infoCopy := *info + if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + } + } + return lastErr +} + // CreateSession creates a new Session, with the ThreadGroup as the leader. // // EPERM may be returned if either the given ThreadGroup is already a Session diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 3f1ac9898..60cbe85b8 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -69,6 +69,14 @@ func (sh *SignalHandlers) CopyForExec() *SignalHandlers { return sh2 } +// IsIgnored returns true if the signal is ignored. +func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool { + sh.mu.Lock() + defer sh.mu.Unlock() + sa, ok := sh.actions[sig] + return ok && sa.Handler == arch.SignalActIgnore +} + // dequeueActionLocked returns the SignalAct that should be used to handle sig. // // Preconditions: sh.mu must be locked. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 973578484..41f456af7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -477,9 +477,9 @@ func (l *Loader) run() error { return err } - // Create the root container init task. - _, _, err := l.k.CreateProcess(l.rootProcArgs) - if err != nil { + // Create the root container init task. It will begin running + // when the kernel is started. + if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("creating init process: %v", err) } @@ -492,6 +492,11 @@ func (l *Loader) run() error { ttyFile := l.rootProcArgs.FDMap.GetFile(0) defer ttyFile.DecRef() ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) + + // Set the foreground process group on the TTY to the global + // init process group, since that is what we are about to + // start running. + ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup()) } // Start signal forwarding only after an init process is created. @@ -595,10 +600,13 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) } + // Create and start the new process. tg, _, err := l.k.CreateProcess(procArgs) if err != nil { return fmt.Errorf("creating process: %v", err) } + l.k.StartProcess(tg) + // CreateProcess takes a reference on FDMap if successful. procArgs.FDMap.DecRef() -- cgit v1.2.3 From b75aa515044367094e762985a4b1a1f0580e3ad6 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 22 Feb 2019 13:33:49 -0800 Subject: Rename ping endpoints to icmp endpoints. 
PiperOrigin-RevId: 235248572 Change-Id: I5b0538b6feb365a98712c2a2d56d856fe80a8a09 --- pkg/tcpip/network/arp/BUILD | 2 +- pkg/tcpip/network/arp/arp_test.go | 4 +- pkg/tcpip/network/ipv6/BUILD | 2 +- pkg/tcpip/network/ipv6/icmp_test.go | 6 +- pkg/tcpip/transport/icmp/BUILD | 45 ++ pkg/tcpip/transport/icmp/endpoint.go | 694 +++++++++++++++++++++++++++++ pkg/tcpip/transport/icmp/endpoint_state.go | 90 ++++ pkg/tcpip/transport/icmp/protocol.go | 124 ++++++ pkg/tcpip/transport/ping/BUILD | 45 -- pkg/tcpip/transport/ping/endpoint.go | 694 ----------------------------- pkg/tcpip/transport/ping/endpoint_state.go | 90 ---- pkg/tcpip/transport/ping/protocol.go | 124 ------ runsc/boot/BUILD | 2 +- runsc/boot/loader.go | 4 +- 14 files changed, 963 insertions(+), 963 deletions(-) create mode 100644 pkg/tcpip/transport/icmp/BUILD create mode 100644 pkg/tcpip/transport/icmp/endpoint.go create mode 100644 pkg/tcpip/transport/icmp/endpoint_state.go create mode 100644 pkg/tcpip/transport/icmp/protocol.go delete mode 100644 pkg/tcpip/transport/ping/BUILD delete mode 100644 pkg/tcpip/transport/ping/endpoint.go delete mode 100644 pkg/tcpip/transport/ping/endpoint_state.go delete mode 100644 pkg/tcpip/transport/ping/protocol.go (limited to 'runsc') diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index ef18bb93d..2a355e689 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -30,6 +30,6 @@ go_test( "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/stack", - "//pkg/tcpip/transport/ping", + "//pkg/tcpip/transport/icmp", ], ) diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 5894f9114..14b9cb8b6 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -26,7 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp" ) const ( @@ -43,7 +43,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, []string{ping.ProtocolName4}, stack.Options{}) + s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, []string{icmp.ProtocolName4}, stack.Options{}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index 000e00dba..247e14e37 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -32,7 +32,7 @@ go_test( "//pkg/tcpip/link/channel", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/stack", - "//pkg/tcpip/transport/ping", + "//pkg/tcpip/transport/icmp", "//pkg/waiter", ], ) diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 12c818b48..797176243 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -27,7 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/link/channel" "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -68,8 +68,8 @@ func (e endpointWithResolutionCapability) Capabilities() stack.LinkEndpointCapab func newTestContext(t 
*testing.T) *testContext { c := &testContext{ t: t, - s0: stack.New([]string{ProtocolName}, []string{ping.ProtocolName6}, stack.Options{}), - s1: stack.New([]string{ProtocolName}, []string{ping.ProtocolName6}, stack.Options{}), + s0: stack.New([]string{ProtocolName}, []string{icmp.ProtocolName6}, stack.Options{}), + s1: stack.New([]string{ProtocolName}, []string{icmp.ProtocolName6}, stack.Options{}), icmpCh: make(chan icmpInfo, 10), } diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD new file mode 100644 index 000000000..74d9ff253 --- /dev/null +++ b/pkg/tcpip/transport/icmp/BUILD @@ -0,0 +1,45 @@ +package(licenses = ["notice"]) + +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_library") + +go_template_instance( + name = "icmp_packet_list", + out = "icmp_packet_list.go", + package = "icmp", + prefix = "icmpPacket", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*icmpPacket", + "Linker": "*icmpPacket", + }, +) + +go_library( + name = "icmp", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "icmp_packet_list.go", + "protocol.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/sleep", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + "//pkg/waiter", + ], +) + +filegroup( + name = "autogen", + srcs = [ + "icmp_packet_list.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go new file mode 100644 index 000000000..d87bfe048 --- /dev/null +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -0,0 +1,694 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package icmp + +import ( + "encoding/binary" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sleep" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// +stateify savable +type icmpPacket struct { + icmpPacketEntry + senderAddress tcpip.FullAddress + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + timestamp int64 + // views is used as buffer for data when its length is large + // enough to store a VectorisedView. + views [8]buffer.View `state:"nosave"` +} + +type endpointState int + +const ( + stateInitial endpointState = iota + stateBound + stateConnected + stateClosed +) + +// endpoint represents an ICMP (ping) endpoint. This struct serves as the +// interface between users of the endpoint and the protocol implementation; it +// is legal to have concurrent goroutines make calls into the endpoint, they +// are properly synchronized. 
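A note for readers of the new icmp package: callers never touch this struct directly, only the tcpip.Endpoint methods it implements further down (Connect, Bind, Write, Read, Close). Below is a minimal, hypothetical sketch of that lifecycle, assuming an endpoint already obtained from a stack with this protocol registered (see the registration sketch after the package comment below); the helper name pingOnce and the use of tcpip.SlicePayload are illustrative, not part of this patch.

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
)

// pingOnce drives an ICMP endpoint through its typical lifecycle:
// stateInitial -> stateConnected (via Connect) -> stateClosed (via Close).
func pingOnce(ep tcpip.Endpoint, dst tcpip.Address, payload []byte) error {
	// Connect picks an ephemeral ident and caches a route to dst.
	if err := ep.Connect(tcpip.FullAddress{Addr: dst}); err != nil {
		return fmt.Errorf("connect: %v", err)
	}
	defer ep.Close()

	// Write never blocks; payload must already look like an echo request
	// (see the note near sendPing4 below).
	if _, _, err := ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{}); err != nil {
		return fmt.Errorf("write: %v", err)
	}

	// Read is also non-blocking: ErrWouldBlock simply means no reply has
	// been queued by HandlePacket yet, and the caller should wait for
	// EventIn on the endpoint's waiter.Queue before retrying.
	var sender tcpip.FullAddress
	if _, _, err := ep.Read(&sender); err != nil && err != tcpip.ErrWouldBlock {
		return fmt.Errorf("read: %v", err)
	}
	return nil
}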
+type endpoint struct { + // The following fields are initialized at creation time and do not + // change throughout the lifetime of the endpoint. + stack *stack.Stack `state:"manual"` + netProto tcpip.NetworkProtocolNumber + transProto tcpip.TransportProtocolNumber + waiterQueue *waiter.Queue + + // The following fields are used to manage the receive queue, and are + // protected by rcvMu. + rcvMu sync.Mutex `state:"nosave"` + rcvReady bool + rcvList icmpPacketList + rcvBufSizeMax int `state:".(int)"` + rcvBufSize int + rcvClosed bool + + // The following fields are protected by the mu mutex. + mu sync.RWMutex `state:"nosave"` + sndBufSize int + // shutdownFlags represent the current shutdown state of the endpoint. + shutdownFlags tcpip.ShutdownFlags + id stack.TransportEndpointID + state endpointState + bindNICID tcpip.NICID + bindAddr tcpip.Address + regNICID tcpip.NICID + route stack.Route `state:"manual"` +} + +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) *endpoint { + return &endpoint{ + stack: stack, + netProto: netProto, + transProto: transProto, + waiterQueue: waiterQueue, + rcvBufSizeMax: 32 * 1024, + sndBufSize: 32 * 1024, + } +} + +// Close puts the endpoint in a closed state and frees all resources +// associated with it. +func (e *endpoint) Close() { + e.mu.Lock() + e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite + switch e.state { + case stateBound, stateConnected: + e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e) + } + + // Close the receive list and drain it. + e.rcvMu.Lock() + e.rcvClosed = true + e.rcvBufSize = 0 + for !e.rcvList.Empty() { + p := e.rcvList.Front() + e.rcvList.Remove(p) + } + e.rcvMu.Unlock() + + e.route.Release() + + // Update the state. + e.state = stateClosed + + e.mu.Unlock() + + e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) +} + +// Read reads data from the endpoint. This method does not block if +// there is no data pending. +func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + e.rcvMu.Lock() + + if e.rcvList.Empty() { + err := tcpip.ErrWouldBlock + if e.rcvClosed { + err = tcpip.ErrClosedForReceive + } + e.rcvMu.Unlock() + return buffer.View{}, tcpip.ControlMessages{}, err + } + + p := e.rcvList.Front() + e.rcvList.Remove(p) + e.rcvBufSize -= p.data.Size() + + e.rcvMu.Unlock() + + if addr != nil { + *addr = p.senderAddress + } + + return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil +} + +// prepareForWrite prepares the endpoint for sending data. In particular, it +// binds it if it's still in the initial state. To do so, it must first +// reacquire the mutex in exclusive mode. +// +// Returns true for retry if preparation should be retried. +func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) { + switch e.state { + case stateInitial: + case stateConnected: + return false, nil + + case stateBound: + if to == nil { + return false, tcpip.ErrDestinationRequired + } + return false, nil + default: + return false, tcpip.ErrInvalidEndpointState + } + + e.mu.RUnlock() + defer e.mu.RLock() + + e.mu.Lock() + defer e.mu.Unlock() + + // The state changed when we released the shared locked and re-acquired + // it in exclusive mode. Try again. 
+ if e.state != stateInitial { + return true, nil + } + + // The state is still 'initial', so try to bind the endpoint. + if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { + return false, err + } + + return true, nil +} + +// Write writes data to the endpoint's peer. This method does not block +// if the data cannot be written. +func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { + // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) + if opts.More { + return 0, nil, tcpip.ErrInvalidOptionValue + } + + to := opts.To + + e.mu.RLock() + defer e.mu.RUnlock() + + // If we've shutdown with SHUT_WR we are in an invalid state for sending. + if e.shutdownFlags&tcpip.ShutdownWrite != 0 { + return 0, nil, tcpip.ErrClosedForSend + } + + // Prepare for write. + for { + retry, err := e.prepareForWrite(to) + if err != nil { + return 0, nil, err + } + + if !retry { + break + } + } + + var route *stack.Route + if to == nil { + route = &e.route + + if route.IsResolutionRequired() { + // Promote lock to exclusive if using a shared route, given that it may + // need to change in Route.Resolve() call below. + e.mu.RUnlock() + defer e.mu.RLock() + + e.mu.Lock() + defer e.mu.Unlock() + + // Recheck state after lock was re-acquired. + if e.state != stateConnected { + return 0, nil, tcpip.ErrInvalidEndpointState + } + } + } else { + // Reject destination address if it goes through a different + // NIC than the endpoint was bound to. + nicid := to.NIC + if e.bindNICID != 0 { + if nicid != 0 && nicid != e.bindNICID { + return 0, nil, tcpip.ErrNoRoute + } + + nicid = e.bindNICID + } + + toCopy := *to + to = &toCopy + netProto, err := e.checkV4Mapped(to, true) + if err != nil { + return 0, nil, err + } + + // Find the enpoint. + r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto) + if err != nil { + return 0, nil, err + } + defer r.Release() + + route = &r + } + + if route.IsResolutionRequired() { + waker := &sleep.Waker{} + if ch, err := route.Resolve(waker); err != nil { + if err == tcpip.ErrWouldBlock { + // Link address needs to be resolved. Resolution was triggered the + // background. Better luck next time. + route.RemoveWaker(waker) + return 0, ch, tcpip.ErrNoLinkAddress + } + return 0, nil, err + } + } + + v, err := p.Get(p.Size()) + if err != nil { + return 0, nil, err + } + + switch e.netProto { + case header.IPv4ProtocolNumber: + err = sendPing4(route, e.id.LocalPort, v) + + case header.IPv6ProtocolNumber: + err = sendPing6(route, e.id.LocalPort, v) + } + + if err != nil { + return 0, nil, err + } + + return uintptr(len(v)), nil, nil +} + +// Peek only returns data from a single datagram, so do nothing here. +func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil +} + +// SetSockOpt sets a socket option. Currently not supported. +func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
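One part of the Write contract above is easy to miss: when the route still needs link-address resolution, Write fails fast with tcpip.ErrNoLinkAddress and hands back a channel that is closed once the background resolution finishes, rather than blocking. A hedged sketch of how a caller might drive that retry, with an invented helper name and retry bound:

import "gvisor.googlesource.com/gvisor/pkg/tcpip"

// writeWithResolve retries a non-blocking Write while the stack resolves
// the link address in the background.
func writeWithResolve(ep tcpip.Endpoint, payload tcpip.Payload, maxTries int) (uintptr, *tcpip.Error) {
	for i := 0; i < maxTries; i++ {
		n, ch, err := ep.Write(payload, tcpip.WriteOptions{})
		if err != tcpip.ErrNoLinkAddress {
			return n, err // success, or a failure that waiting will not fix
		}
		// Resolution was already triggered; the channel is closed when it
		// completes, so wait and try the write again.
		<-ch
	}
	return 0, tcpip.ErrNoLinkAddress
}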
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + return nil + + case *tcpip.SendBufferSizeOption: + e.mu.Lock() + *o = tcpip.SendBufferSizeOption(e.sndBufSize) + e.mu.Unlock() + return nil + + case *tcpip.ReceiveBufferSizeOption: + e.rcvMu.Lock() + *o = tcpip.ReceiveBufferSizeOption(e.rcvBufSizeMax) + e.rcvMu.Unlock() + return nil + + case *tcpip.ReceiveQueueSizeOption: + e.rcvMu.Lock() + if e.rcvList.Empty() { + *o = 0 + } else { + p := e.rcvList.Front() + *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) + } + e.rcvMu.Unlock() + return nil + + case *tcpip.KeepaliveEnabledOption: + *o = 0 + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } +} + +func sendPing4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { + if len(data) < header.ICMPv4EchoMinimumSize { + return tcpip.ErrInvalidEndpointState + } + + // Set the ident. Sequence number is provided by the user. + binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], ident) + + hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) + + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + copy(icmpv4, data) + data = data[header.ICMPv4EchoMinimumSize:] + + // Linux performs these basic checks. + if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { + return tcpip.ErrInvalidEndpointState + } + + icmpv4.SetChecksum(0) + icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) + + return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) +} + +func sendPing6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { + if len(data) < header.ICMPv6EchoMinimumSize { + return tcpip.ErrInvalidEndpointState + } + + // Set the ident. Sequence number is provided by the user. + binary.BigEndian.PutUint16(data[header.ICMPv6MinimumSize:], ident) + + hdr := buffer.NewPrependable(header.ICMPv6EchoMinimumSize + int(r.MaxHeaderLength())) + + icmpv6 := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize)) + copy(icmpv6, data) + data = data[header.ICMPv6EchoMinimumSize:] + + if icmpv6.Type() != header.ICMPv6EchoRequest || icmpv6.Code() != 0 { + return tcpip.ErrInvalidEndpointState + } + + icmpv6.SetChecksum(0) + icmpv6.SetChecksum(^header.Checksum(icmpv6, header.Checksum(data, 0))) + + return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv6ProtocolNumber, r.DefaultTTL()) +} + +func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { + netProto := e.netProto + if header.IsV4MappedAddress(addr.Addr) { + return 0, tcpip.ErrNoRoute + } + + // Fail if we're bound to an address length different from the one we're + // checking. + if l := len(e.id.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) { + return 0, tcpip.ErrInvalidEndpointState + } + + return netProto, nil +} + +// Connect connects the endpoint to its peer. Specifying a NIC is optional. 
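sendPing4 and sendPing6 above expect the caller to supply a mostly-formed echo request: the payload must be at least the echo minimum size and carry the Echo type with code 0, and the endpoint then overwrites the identifier (with its local port) and the checksum before transmitting; the caller's sequence number and data stay wherever the caller put them in the message. A hedged sketch of preparing such a payload; the helper name is invented, and only byte offsets 0 and 1 (type and code in the standard ICMP layout) are assumed beyond the constants already used in this file:

import (
	"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
)

// asEchoRequest copies the caller's raw ICMP echo message into a View and
// forces the type/code bytes that sendPing4 insists on. The ident and
// checksum are rewritten by the endpoint before transmission, so they can
// be left as zero here.
func asEchoRequest(msg []byte) buffer.View {
	size := len(msg)
	if size < header.ICMPv4EchoMinimumSize {
		size = header.ICMPv4EchoMinimumSize // short messages are zero-padded
	}
	v := buffer.NewView(size)
	copy(v, msg)
	v[0] = byte(header.ICMPv4Echo) // type: echo request, or Write rejects it
	v[1] = 0                       // code must be 0
	return v
}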
+func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + nicid := addr.NIC + localPort := uint16(0) + switch e.state { + case stateBound, stateConnected: + localPort = e.id.LocalPort + if e.bindNICID == 0 { + break + } + + if nicid != 0 && nicid != e.bindNICID { + return tcpip.ErrInvalidEndpointState + } + + nicid = e.bindNICID + default: + return tcpip.ErrInvalidEndpointState + } + + netProto, err := e.checkV4Mapped(&addr, false) + if err != nil { + return err + } + + // Find a route to the desired destination. + r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto) + if err != nil { + return err + } + defer r.Release() + + id := stack.TransportEndpointID{ + LocalAddress: r.LocalAddress, + LocalPort: localPort, + RemoteAddress: r.RemoteAddress, + } + + // Even if we're connected, this endpoint can still be used to send + // packets on a different network protocol, so we register both even if + // v6only is set to false and this is an ipv6 endpoint. + netProtos := []tcpip.NetworkProtocolNumber{netProto} + + id, err = e.registerWithStack(nicid, netProtos, id) + if err != nil { + return err + } + + e.id = id + e.route = r.Clone() + e.regNICID = nicid + + e.state = stateConnected + + e.rcvMu.Lock() + e.rcvReady = true + e.rcvMu.Unlock() + + return nil +} + +// ConnectEndpoint is not supported. +func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Shutdown closes the read and/or write end of the endpoint connection +// to its peer. +func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + e.shutdownFlags |= flags + + if e.state != stateConnected { + return tcpip.ErrNotConnected + } + + if flags&tcpip.ShutdownRead != 0 { + e.rcvMu.Lock() + wasClosed := e.rcvClosed + e.rcvClosed = true + e.rcvMu.Unlock() + + if !wasClosed { + e.waiterQueue.Notify(waiter.EventIn) + } + } + + return nil +} + +// Listen is not supported by UDP, it just fails. +func (*endpoint) Listen(int) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Accept is not supported by UDP, it just fails. +func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { + return nil, nil, tcpip.ErrNotSupported +} + +func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { + if id.LocalPort != 0 { + // The endpoint already has a local port, just attempt to + // register it. + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false) + return id, err + } + + // We need to find a port for the endpoint. + _, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { + id.LocalPort = p + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false) + switch err { + case nil: + return true, nil + case tcpip.ErrPortInUse: + return false, nil + default: + return false, err + } + }) + + return id, err +} + +func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + // Don't allow binding once endpoint is not in the initial state + // anymore. 
+ if e.state != stateInitial { + return tcpip.ErrInvalidEndpointState + } + + netProto, err := e.checkV4Mapped(&addr, false) + if err != nil { + return err + } + + // Expand netProtos to include v4 and v6 if the caller is binding to a + // wildcard (empty) address, and this is an IPv6 endpoint with v6only + // set to false. + netProtos := []tcpip.NetworkProtocolNumber{netProto} + + if len(addr.Addr) != 0 { + // A local address was specified, verify that it's valid. + if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { + return tcpip.ErrBadLocalAddress + } + } + + id := stack.TransportEndpointID{ + LocalPort: addr.Port, + LocalAddress: addr.Addr, + } + id, err = e.registerWithStack(addr.NIC, netProtos, id) + if err != nil { + return err + } + if commit != nil { + if err := commit(); err != nil { + // Unregister, the commit failed. + e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, e.transProto, id, e) + return err + } + } + + e.id = id + e.regNICID = addr.NIC + + // Mark endpoint as bound. + e.state = stateBound + + e.rcvMu.Lock() + e.rcvReady = true + e.rcvMu.Unlock() + + return nil +} + +// Bind binds the endpoint to a specific local address and port. +// Specifying a NIC is optional. +func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + err := e.bindLocked(addr, commit) + if err != nil { + return err + } + + e.bindNICID = addr.NIC + e.bindAddr = addr.Addr + + return nil +} + +// GetLocalAddress returns the address to which the endpoint is bound. +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + return tcpip.FullAddress{ + NIC: e.regNICID, + Addr: e.id.LocalAddress, + Port: e.id.LocalPort, + }, nil +} + +// GetRemoteAddress returns the address to which the endpoint is connected. +func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + if e.state != stateConnected { + return tcpip.FullAddress{}, tcpip.ErrNotConnected + } + + return tcpip.FullAddress{ + NIC: e.regNICID, + Addr: e.id.RemoteAddress, + Port: e.id.RemotePort, + }, nil +} + +// Readiness returns the current readiness of the endpoint. For example, if +// waiter.EventIn is set, the endpoint is immediately readable. +func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + // The endpoint is always writable. + result := waiter.EventOut & mask + + // Determine if the endpoint is readable if requested. + if (mask & waiter.EventIn) != 0 { + e.rcvMu.Lock() + if !e.rcvList.Empty() || e.rcvClosed { + result |= waiter.EventIn + } + e.rcvMu.Unlock() + } + + return result +} + +// HandlePacket is called by the stack when new packets arrive to this transport +// endpoint. +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { + e.rcvMu.Lock() + + // Drop the packet if our buffer is currently full. + if !e.rcvReady || e.rcvClosed || e.rcvBufSize >= e.rcvBufSizeMax { + e.rcvMu.Unlock() + return + } + + wasEmpty := e.rcvBufSize == 0 + + // Push new packet into receive list and increment the buffer size. + pkt := &icmpPacket{ + senderAddress: tcpip.FullAddress{ + NIC: r.NICID(), + Addr: id.RemoteAddress, + }, + } + pkt.data = vv.Clone(pkt.views[:]) + e.rcvList.PushBack(pkt) + e.rcvBufSize += vv.Size() + + pkt.timestamp = e.stack.NowNanoseconds() + + e.rcvMu.Unlock() + + // Notify any waiters that there's data to be read now. 
+ if wasEmpty { + e.waiterQueue.Notify(waiter.EventIn) + } +} + +// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. +func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { +} diff --git a/pkg/tcpip/transport/icmp/endpoint_state.go b/pkg/tcpip/transport/icmp/endpoint_state.go new file mode 100644 index 000000000..21008d089 --- /dev/null +++ b/pkg/tcpip/transport/icmp/endpoint_state.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package icmp + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// saveData saves icmpPacket.data field. +func (p *icmpPacket) saveData() buffer.VectorisedView { + // We cannot save p.data directly as p.data.views may alias to p.views, + // which is not allowed by state framework (in-struct pointer). + return p.data.Clone(nil) +} + +// loadData loads icmpPacket.data field. +func (p *icmpPacket) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing p.views for data.views. + p.data = data +} + +// beforeSave is invoked by stateify. +func (e *endpoint) beforeSave() { + // Stop incoming packets from being handled (and mutate endpoint state). + // The lock will be released after savercvBufSizeMax(), which would have + // saved e.rcvBufSizeMax and set it to 0 to continue blocking incoming + // packets. + e.rcvMu.Lock() +} + +// saveRcvBufSizeMax is invoked by stateify. +func (e *endpoint) saveRcvBufSizeMax() int { + max := e.rcvBufSizeMax + // Make sure no new packets will be handled regardless of the lock. + e.rcvBufSizeMax = 0 + // Release the lock acquired in beforeSave() so regular endpoint closing + // logic can proceed after save. + e.rcvMu.Unlock() + return max +} + +// loadRcvBufSizeMax is invoked by stateify. +func (e *endpoint) loadRcvBufSizeMax(max int) { + e.rcvBufSizeMax = max +} + +// afterLoad is invoked by stateify. 
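The whole of endpoint_state.go above exists to satisfy the stateify save/restore framework: a field tagged state:".(T)" is serialized through companion save<Field>/load<Field> methods, state:"nosave" fields are skipped, and state:"manual" fields (the stack and the route here) are rebuilt by hand in afterLoad. A hedged, self-contained illustration of that naming convention with a made-up type; the generated glue code is not shown, and the details are inferred from how this file uses the tags:

import "sync"

// counter is a hypothetical type showing the same hook pattern as
// icmpPacket.data and endpoint.rcvBufSizeMax above.
//
// +stateify savable
type counter struct {
	hits int
	buf  []byte     `state:".([]byte)"` // saved/loaded via the hooks below
	mu   sync.Mutex `state:"nosave"`    // never serialized
}

// saveBuf is what stateify calls instead of serializing buf directly.
func (c *counter) saveBuf() []byte {
	return append([]byte(nil), c.buf...) // detach from any aliasing memory
}

// loadBuf is what stateify calls to restore buf.
func (c *counter) loadBuf(b []byte) {
	c.buf = b
}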
+func (e *endpoint) afterLoad() { + e.stack = stack.StackFromEnv + + if e.state != stateBound && e.state != stateConnected { + return + } + + var err *tcpip.Error + if e.state == stateConnected { + e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto) + if err != nil { + panic(*err) + } + + e.id.LocalAddress = e.route.LocalAddress + } else if len(e.id.LocalAddress) != 0 { // stateBound + if e.stack.CheckLocalAddress(e.regNICID, e.netProto, e.id.LocalAddress) == 0 { + panic(tcpip.ErrBadLocalAddress) + } + } + + e.id, err = e.registerWithStack(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.id) + if err != nil { + panic(*err) + } +} diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go new file mode 100644 index 000000000..9f0a2bf71 --- /dev/null +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -0,0 +1,124 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package icmp contains the implementation of the ICMP and IPv6-ICMP transport +// protocols for use in ping. To use it in the networking stack, this package +// must be added to the project, and +// activated on the stack by passing icmp.ProtocolName (or "icmp") and/or +// icmp.ProtocolName6 (or "icmp6") as one of the transport protocols when +// calling stack.New(). Then endpoints can be created by passing +// icmp.ProtocolNumber or icmp.ProtocolNumber6 as the transport protocol number +// when calling Stack.NewEndpoint(). +package icmp + +import ( + "encoding/binary" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // ProtocolName4 is the string representation of the icmp protocol name. + ProtocolName4 = "icmp4" + + // ProtocolNumber4 is the ICMP protocol number. + ProtocolNumber4 = header.ICMPv4ProtocolNumber + + // ProtocolName6 is the string representation of the icmp protocol name. + ProtocolName6 = "icmp6" + + // ProtocolNumber6 is the IPv6-ICMP protocol number. + ProtocolNumber6 = header.ICMPv6ProtocolNumber +) + +type protocol struct { + number tcpip.TransportProtocolNumber +} + +// Number returns the ICMP protocol number. +func (p *protocol) Number() tcpip.TransportProtocolNumber { + return p.number +} + +func (p *protocol) netProto() tcpip.NetworkProtocolNumber { + switch p.number { + case ProtocolNumber4: + return header.IPv4ProtocolNumber + case ProtocolNumber6: + return header.IPv6ProtocolNumber + } + panic(fmt.Sprint("unknown protocol number: ", p.number)) +} + +// NewEndpoint creates a new icmp endpoint. 
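The package comment above is the user-visible half of this rename: stacks now activate the transport with icmp.ProtocolName4/6 instead of ping.ProtocolName4/6, and endpoints are opened with icmp.ProtocolNumber4/6, which is exactly the substitution made in the arp and ipv6 tests earlier in this commit. A hedged sketch of that wiring; the helper name is invented and Stack.NewEndpoint's signature is assumed from how the rest of the tree uses it:

import (
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// newPingStack builds a stack with the renamed ICMP transport registered
// and opens a single ICMPv4 endpoint on it.
func newPingStack() (*stack.Stack, tcpip.Endpoint, *tcpip.Error) {
	s := stack.New(
		[]string{ipv4.ProtocolName},  // network protocols
		[]string{icmp.ProtocolName4}, // was ping.ProtocolName4 before this commit
		stack.Options{})

	var wq waiter.Queue
	ep, err := s.NewEndpoint(icmp.ProtocolNumber4, ipv4.ProtocolNumber, &wq)
	return s, ep, err
}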
+func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if netProto != p.netProto() { + return nil, tcpip.ErrUnknownProtocol + } + return newEndpoint(stack, netProto, p.number, waiterQueue), nil +} + +// MinimumPacketSize returns the minimum valid icmp packet size. +func (p *protocol) MinimumPacketSize() int { + switch p.number { + case ProtocolNumber4: + return header.ICMPv4EchoMinimumSize + case ProtocolNumber6: + return header.ICMPv6EchoMinimumSize + } + panic(fmt.Sprint("unknown protocol number: ", p.number)) +} + +// ParsePorts returns the source and destination ports stored in the given icmp +// packet. +func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { + switch p.number { + case ProtocolNumber4: + return 0, binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize:]), nil + case ProtocolNumber6: + return 0, binary.BigEndian.Uint16(v[header.ICMPv6MinimumSize:]), nil + } + panic(fmt.Sprint("unknown protocol number: ", p.number)) +} + +// HandleUnknownDestinationPacket handles packets targeted at this protocol but +// that don't match any existing endpoint. +func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.VectorisedView) bool { + return true +} + +// SetOption implements TransportProtocol.SetOption. +func (p *protocol) SetOption(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// Option implements TransportProtocol.Option. +func (p *protocol) Option(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +func init() { + stack.RegisterTransportProtocolFactory(ProtocolName4, func() stack.TransportProtocol { + return &protocol{ProtocolNumber4} + }) + + stack.RegisterTransportProtocolFactory(ProtocolName6, func() stack.TransportProtocol { + return &protocol{ProtocolNumber6} + }) +} diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD deleted file mode 100644 index 4d4241d4b..000000000 --- a/pkg/tcpip/transport/ping/BUILD +++ /dev/null @@ -1,45 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") - -go_template_instance( - name = "ping_packet_list", - out = "ping_packet_list.go", - package = "ping", - prefix = "pingPacket", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*pingPacket", - "Linker": "*pingPacket", - }, -) - -go_library( - name = "ping", - srcs = [ - "endpoint.go", - "endpoint_state.go", - "ping_packet_list.go", - "protocol.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], - visibility = ["//visibility:public"], - deps = [ - "//pkg/sleep", - "//pkg/tcpip", - "//pkg/tcpip/buffer", - "//pkg/tcpip/header", - "//pkg/tcpip/stack", - "//pkg/waiter", - ], -) - -filegroup( - name = "autogen", - srcs = [ - "ping_packet_list.go", - ], - visibility = ["//:sandbox"], -) diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go deleted file mode 100644 index c8263a512..000000000 --- a/pkg/tcpip/transport/ping/endpoint.go +++ /dev/null @@ -1,694 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ping - -import ( - "encoding/binary" - "sync" - - "gvisor.googlesource.com/gvisor/pkg/sleep" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/header" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// +stateify savable -type pingPacket struct { - pingPacketEntry - senderAddress tcpip.FullAddress - data buffer.VectorisedView `state:".(buffer.VectorisedView)"` - timestamp int64 - // views is used as buffer for data when its length is large - // enough to store a VectorisedView. - views [8]buffer.View `state:"nosave"` -} - -type endpointState int - -const ( - stateInitial endpointState = iota - stateBound - stateConnected - stateClosed -) - -// endpoint represents a ping endpoint. This struct serves as the interface -// between users of the endpoint and the protocol implementation; it is legal to -// have concurrent goroutines make calls into the endpoint, they are properly -// synchronized. -type endpoint struct { - // The following fields are initialized at creation time and do not - // change throughout the lifetime of the endpoint. - stack *stack.Stack `state:"manual"` - netProto tcpip.NetworkProtocolNumber - transProto tcpip.TransportProtocolNumber - waiterQueue *waiter.Queue - - // The following fields are used to manage the receive queue, and are - // protected by rcvMu. - rcvMu sync.Mutex `state:"nosave"` - rcvReady bool - rcvList pingPacketList - rcvBufSizeMax int `state:".(int)"` - rcvBufSize int - rcvClosed bool - - // The following fields are protected by the mu mutex. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - // shutdownFlags represent the current shutdown state of the endpoint. - shutdownFlags tcpip.ShutdownFlags - id stack.TransportEndpointID - state endpointState - bindNICID tcpip.NICID - bindAddr tcpip.Address - regNICID tcpip.NICID - route stack.Route `state:"manual"` -} - -func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) *endpoint { - return &endpoint{ - stack: stack, - netProto: netProto, - transProto: transProto, - waiterQueue: waiterQueue, - rcvBufSizeMax: 32 * 1024, - sndBufSize: 32 * 1024, - } -} - -// Close puts the endpoint in a closed state and frees all resources -// associated with it. -func (e *endpoint) Close() { - e.mu.Lock() - e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite - switch e.state { - case stateBound, stateConnected: - e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e) - } - - // Close the receive list and drain it. - e.rcvMu.Lock() - e.rcvClosed = true - e.rcvBufSize = 0 - for !e.rcvList.Empty() { - p := e.rcvList.Front() - e.rcvList.Remove(p) - } - e.rcvMu.Unlock() - - e.route.Release() - - // Update the state. - e.state = stateClosed - - e.mu.Unlock() - - e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) -} - -// Read reads data from the endpoint. 
This method does not block if -// there is no data pending. -func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { - e.rcvMu.Lock() - - if e.rcvList.Empty() { - err := tcpip.ErrWouldBlock - if e.rcvClosed { - err = tcpip.ErrClosedForReceive - } - e.rcvMu.Unlock() - return buffer.View{}, tcpip.ControlMessages{}, err - } - - p := e.rcvList.Front() - e.rcvList.Remove(p) - e.rcvBufSize -= p.data.Size() - - e.rcvMu.Unlock() - - if addr != nil { - *addr = p.senderAddress - } - - return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil -} - -// prepareForWrite prepares the endpoint for sending data. In particular, it -// binds it if it's still in the initial state. To do so, it must first -// reacquire the mutex in exclusive mode. -// -// Returns true for retry if preparation should be retried. -func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) { - switch e.state { - case stateInitial: - case stateConnected: - return false, nil - - case stateBound: - if to == nil { - return false, tcpip.ErrDestinationRequired - } - return false, nil - default: - return false, tcpip.ErrInvalidEndpointState - } - - e.mu.RUnlock() - defer e.mu.RLock() - - e.mu.Lock() - defer e.mu.Unlock() - - // The state changed when we released the shared locked and re-acquired - // it in exclusive mode. Try again. - if e.state != stateInitial { - return true, nil - } - - // The state is still 'initial', so try to bind the endpoint. - if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { - return false, err - } - - return true, nil -} - -// Write writes data to the endpoint's peer. This method does not block -// if the data cannot be written. -func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { - // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) - if opts.More { - return 0, nil, tcpip.ErrInvalidOptionValue - } - - to := opts.To - - e.mu.RLock() - defer e.mu.RUnlock() - - // If we've shutdown with SHUT_WR we are in an invalid state for sending. - if e.shutdownFlags&tcpip.ShutdownWrite != 0 { - return 0, nil, tcpip.ErrClosedForSend - } - - // Prepare for write. - for { - retry, err := e.prepareForWrite(to) - if err != nil { - return 0, nil, err - } - - if !retry { - break - } - } - - var route *stack.Route - if to == nil { - route = &e.route - - if route.IsResolutionRequired() { - // Promote lock to exclusive if using a shared route, given that it may - // need to change in Route.Resolve() call below. - e.mu.RUnlock() - defer e.mu.RLock() - - e.mu.Lock() - defer e.mu.Unlock() - - // Recheck state after lock was re-acquired. - if e.state != stateConnected { - return 0, nil, tcpip.ErrInvalidEndpointState - } - } - } else { - // Reject destination address if it goes through a different - // NIC than the endpoint was bound to. - nicid := to.NIC - if e.bindNICID != 0 { - if nicid != 0 && nicid != e.bindNICID { - return 0, nil, tcpip.ErrNoRoute - } - - nicid = e.bindNICID - } - - toCopy := *to - to = &toCopy - netProto, err := e.checkV4Mapped(to, true) - if err != nil { - return 0, nil, err - } - - // Find the enpoint. 
- r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto) - if err != nil { - return 0, nil, err - } - defer r.Release() - - route = &r - } - - if route.IsResolutionRequired() { - waker := &sleep.Waker{} - if ch, err := route.Resolve(waker); err != nil { - if err == tcpip.ErrWouldBlock { - // Link address needs to be resolved. Resolution was triggered the - // background. Better luck next time. - route.RemoveWaker(waker) - return 0, ch, tcpip.ErrNoLinkAddress - } - return 0, nil, err - } - } - - v, err := p.Get(p.Size()) - if err != nil { - return 0, nil, err - } - - switch e.netProto { - case header.IPv4ProtocolNumber: - err = sendPing4(route, e.id.LocalPort, v) - - case header.IPv6ProtocolNumber: - err = sendPing6(route, e.id.LocalPort, v) - } - - if err != nil { - return 0, nil, err - } - - return uintptr(len(v)), nil, nil -} - -// Peek only returns data from a single datagram, so do nothing here. -func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil -} - -// SetSockOpt sets a socket option. Currently not supported. -func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - return nil -} - -// GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch o := opt.(type) { - case tcpip.ErrorOption: - return nil - - case *tcpip.SendBufferSizeOption: - e.mu.Lock() - *o = tcpip.SendBufferSizeOption(e.sndBufSize) - e.mu.Unlock() - return nil - - case *tcpip.ReceiveBufferSizeOption: - e.rcvMu.Lock() - *o = tcpip.ReceiveBufferSizeOption(e.rcvBufSizeMax) - e.rcvMu.Unlock() - return nil - - case *tcpip.ReceiveQueueSizeOption: - e.rcvMu.Lock() - if e.rcvList.Empty() { - *o = 0 - } else { - p := e.rcvList.Front() - *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) - } - e.rcvMu.Unlock() - return nil - - case *tcpip.KeepaliveEnabledOption: - *o = 0 - return nil - - default: - return tcpip.ErrUnknownProtocolOption - } -} - -func sendPing4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { - if len(data) < header.ICMPv4EchoMinimumSize { - return tcpip.ErrInvalidEndpointState - } - - // Set the ident. Sequence number is provided by the user. - binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], ident) - - hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) - - icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) - copy(icmpv4, data) - data = data[header.ICMPv4EchoMinimumSize:] - - // Linux performs these basic checks. - if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { - return tcpip.ErrInvalidEndpointState - } - - icmpv4.SetChecksum(0) - icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) - - return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) -} - -func sendPing6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { - if len(data) < header.ICMPv6EchoMinimumSize { - return tcpip.ErrInvalidEndpointState - } - - // Set the ident. Sequence number is provided by the user. 
- binary.BigEndian.PutUint16(data[header.ICMPv6MinimumSize:], ident) - - hdr := buffer.NewPrependable(header.ICMPv6EchoMinimumSize + int(r.MaxHeaderLength())) - - icmpv6 := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize)) - copy(icmpv6, data) - data = data[header.ICMPv6EchoMinimumSize:] - - if icmpv6.Type() != header.ICMPv6EchoRequest || icmpv6.Code() != 0 { - return tcpip.ErrInvalidEndpointState - } - - icmpv6.SetChecksum(0) - icmpv6.SetChecksum(^header.Checksum(icmpv6, header.Checksum(data, 0))) - - return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv6ProtocolNumber, r.DefaultTTL()) -} - -func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { - netProto := e.netProto - if header.IsV4MappedAddress(addr.Addr) { - return 0, tcpip.ErrNoRoute - } - - // Fail if we're bound to an address length different from the one we're - // checking. - if l := len(e.id.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) { - return 0, tcpip.ErrInvalidEndpointState - } - - return netProto, nil -} - -// Connect connects the endpoint to its peer. Specifying a NIC is optional. -func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() - - nicid := addr.NIC - localPort := uint16(0) - switch e.state { - case stateBound, stateConnected: - localPort = e.id.LocalPort - if e.bindNICID == 0 { - break - } - - if nicid != 0 && nicid != e.bindNICID { - return tcpip.ErrInvalidEndpointState - } - - nicid = e.bindNICID - default: - return tcpip.ErrInvalidEndpointState - } - - netProto, err := e.checkV4Mapped(&addr, false) - if err != nil { - return err - } - - // Find a route to the desired destination. - r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto) - if err != nil { - return err - } - defer r.Release() - - id := stack.TransportEndpointID{ - LocalAddress: r.LocalAddress, - LocalPort: localPort, - RemoteAddress: r.RemoteAddress, - } - - // Even if we're connected, this endpoint can still be used to send - // packets on a different network protocol, so we register both even if - // v6only is set to false and this is an ipv6 endpoint. - netProtos := []tcpip.NetworkProtocolNumber{netProto} - - id, err = e.registerWithStack(nicid, netProtos, id) - if err != nil { - return err - } - - e.id = id - e.route = r.Clone() - e.regNICID = nicid - - e.state = stateConnected - - e.rcvMu.Lock() - e.rcvReady = true - e.rcvMu.Unlock() - - return nil -} - -// ConnectEndpoint is not supported. -func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error { - return tcpip.ErrInvalidEndpointState -} - -// Shutdown closes the read and/or write end of the endpoint connection -// to its peer. -func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() - e.shutdownFlags |= flags - - if e.state != stateConnected { - return tcpip.ErrNotConnected - } - - if flags&tcpip.ShutdownRead != 0 { - e.rcvMu.Lock() - wasClosed := e.rcvClosed - e.rcvClosed = true - e.rcvMu.Unlock() - - if !wasClosed { - e.waiterQueue.Notify(waiter.EventIn) - } - } - - return nil -} - -// Listen is not supported by UDP, it just fails. -func (*endpoint) Listen(int) *tcpip.Error { - return tcpip.ErrNotSupported -} - -// Accept is not supported by UDP, it just fails. 
-func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { - return nil, nil, tcpip.ErrNotSupported -} - -func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { - if id.LocalPort != 0 { - // The endpoint already has a local port, just attempt to - // register it. - err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false) - return id, err - } - - // We need to find a port for the endpoint. - _, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { - id.LocalPort = p - err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false) - switch err { - case nil: - return true, nil - case tcpip.ErrPortInUse: - return false, nil - default: - return false, err - } - }) - - return id, err -} - -func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { - // Don't allow binding once endpoint is not in the initial state - // anymore. - if e.state != stateInitial { - return tcpip.ErrInvalidEndpointState - } - - netProto, err := e.checkV4Mapped(&addr, false) - if err != nil { - return err - } - - // Expand netProtos to include v4 and v6 if the caller is binding to a - // wildcard (empty) address, and this is an IPv6 endpoint with v6only - // set to false. - netProtos := []tcpip.NetworkProtocolNumber{netProto} - - if len(addr.Addr) != 0 { - // A local address was specified, verify that it's valid. - if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { - return tcpip.ErrBadLocalAddress - } - } - - id := stack.TransportEndpointID{ - LocalPort: addr.Port, - LocalAddress: addr.Addr, - } - id, err = e.registerWithStack(addr.NIC, netProtos, id) - if err != nil { - return err - } - if commit != nil { - if err := commit(); err != nil { - // Unregister, the commit failed. - e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, e.transProto, id, e) - return err - } - } - - e.id = id - e.regNICID = addr.NIC - - // Mark endpoint as bound. - e.state = stateBound - - e.rcvMu.Lock() - e.rcvReady = true - e.rcvMu.Unlock() - - return nil -} - -// Bind binds the endpoint to a specific local address and port. -// Specifying a NIC is optional. -func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() - - err := e.bindLocked(addr, commit) - if err != nil { - return err - } - - e.bindNICID = addr.NIC - e.bindAddr = addr.Addr - - return nil -} - -// GetLocalAddress returns the address to which the endpoint is bound. -func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() - - return tcpip.FullAddress{ - NIC: e.regNICID, - Addr: e.id.LocalAddress, - Port: e.id.LocalPort, - }, nil -} - -// GetRemoteAddress returns the address to which the endpoint is connected. -func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() - - if e.state != stateConnected { - return tcpip.FullAddress{}, tcpip.ErrNotConnected - } - - return tcpip.FullAddress{ - NIC: e.regNICID, - Addr: e.id.RemoteAddress, - Port: e.id.RemotePort, - }, nil -} - -// Readiness returns the current readiness of the endpoint. For example, if -// waiter.EventIn is set, the endpoint is immediately readable. -func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { - // The endpoint is always writable. 
- result := waiter.EventOut & mask - - // Determine if the endpoint is readable if requested. - if (mask & waiter.EventIn) != 0 { - e.rcvMu.Lock() - if !e.rcvList.Empty() || e.rcvClosed { - result |= waiter.EventIn - } - e.rcvMu.Unlock() - } - - return result -} - -// HandlePacket is called by the stack when new packets arrive to this transport -// endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { - e.rcvMu.Lock() - - // Drop the packet if our buffer is currently full. - if !e.rcvReady || e.rcvClosed || e.rcvBufSize >= e.rcvBufSizeMax { - e.rcvMu.Unlock() - return - } - - wasEmpty := e.rcvBufSize == 0 - - // Push new packet into receive list and increment the buffer size. - pkt := &pingPacket{ - senderAddress: tcpip.FullAddress{ - NIC: r.NICID(), - Addr: id.RemoteAddress, - }, - } - pkt.data = vv.Clone(pkt.views[:]) - e.rcvList.PushBack(pkt) - e.rcvBufSize += vv.Size() - - pkt.timestamp = e.stack.NowNanoseconds() - - e.rcvMu.Unlock() - - // Notify any waiters that there's data to be read now. - if wasEmpty { - e.waiterQueue.Notify(waiter.EventIn) - } -} - -// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. -func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { -} diff --git a/pkg/tcpip/transport/ping/endpoint_state.go b/pkg/tcpip/transport/ping/endpoint_state.go deleted file mode 100644 index 80721d227..000000000 --- a/pkg/tcpip/transport/ping/endpoint_state.go +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ping - -import ( - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" -) - -// saveData saves pingPacket.data field. -func (p *pingPacket) saveData() buffer.VectorisedView { - // We cannot save p.data directly as p.data.views may alias to p.views, - // which is not allowed by state framework (in-struct pointer). - return p.data.Clone(nil) -} - -// loadData loads pingPacket.data field. -func (p *pingPacket) loadData(data buffer.VectorisedView) { - // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization - // here because data.views is not guaranteed to be loaded by now. Plus, - // data.views will be allocated anyway so there really is little point - // of utilizing p.views for data.views. - p.data = data -} - -// beforeSave is invoked by stateify. -func (e *endpoint) beforeSave() { - // Stop incoming packets from being handled (and mutate endpoint state). - // The lock will be released after savercvBufSizeMax(), which would have - // saved e.rcvBufSizeMax and set it to 0 to continue blocking incoming - // packets. - e.rcvMu.Lock() -} - -// saveRcvBufSizeMax is invoked by stateify. -func (e *endpoint) saveRcvBufSizeMax() int { - max := e.rcvBufSizeMax - // Make sure no new packets will be handled regardless of the lock. 
- e.rcvBufSizeMax = 0 - // Release the lock acquired in beforeSave() so regular endpoint closing - // logic can proceed after save. - e.rcvMu.Unlock() - return max -} - -// loadRcvBufSizeMax is invoked by stateify. -func (e *endpoint) loadRcvBufSizeMax(max int) { - e.rcvBufSizeMax = max -} - -// afterLoad is invoked by stateify. -func (e *endpoint) afterLoad() { - e.stack = stack.StackFromEnv - - if e.state != stateBound && e.state != stateConnected { - return - } - - var err *tcpip.Error - if e.state == stateConnected { - e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto) - if err != nil { - panic(*err) - } - - e.id.LocalAddress = e.route.LocalAddress - } else if len(e.id.LocalAddress) != 0 { // stateBound - if e.stack.CheckLocalAddress(e.regNICID, e.netProto, e.id.LocalAddress) == 0 { - panic(tcpip.ErrBadLocalAddress) - } - } - - e.id, err = e.registerWithStack(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.id) - if err != nil { - panic(*err) - } -} diff --git a/pkg/tcpip/transport/ping/protocol.go b/pkg/tcpip/transport/ping/protocol.go deleted file mode 100644 index 1d504773b..000000000 --- a/pkg/tcpip/transport/ping/protocol.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ping contains the implementation of the ICMP and IPv6-ICMP transport -// protocols for use in ping. To use it in the networking stack, this package -// must be added to the project, and -// activated on the stack by passing ping.ProtocolName (or "ping") and/or -// ping.ProtocolName6 (or "ping6") as one of the transport protocols when -// calling stack.New(). Then endpoints can be created by passing -// ping.ProtocolNumber or ping.ProtocolNumber6 as the transport protocol number -// when calling Stack.NewEndpoint(). -package ping - -import ( - "encoding/binary" - "fmt" - - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/header" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -const ( - // ProtocolName4 is the string representation of the ping protocol name. - ProtocolName4 = "ping4" - - // ProtocolNumber4 is the ICMP protocol number. - ProtocolNumber4 = header.ICMPv4ProtocolNumber - - // ProtocolName6 is the string representation of the ping protocol name. - ProtocolName6 = "ping6" - - // ProtocolNumber6 is the IPv6-ICMP protocol number. - ProtocolNumber6 = header.ICMPv6ProtocolNumber -) - -type protocol struct { - number tcpip.TransportProtocolNumber -} - -// Number returns the ICMP protocol number. 
-func (p *protocol) Number() tcpip.TransportProtocolNumber { - return p.number -} - -func (p *protocol) netProto() tcpip.NetworkProtocolNumber { - switch p.number { - case ProtocolNumber4: - return header.IPv4ProtocolNumber - case ProtocolNumber6: - return header.IPv6ProtocolNumber - } - panic(fmt.Sprint("unknown protocol number: ", p.number)) -} - -// NewEndpoint creates a new ping endpoint. -func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - if netProto != p.netProto() { - return nil, tcpip.ErrUnknownProtocol - } - return newEndpoint(stack, netProto, p.number, waiterQueue), nil -} - -// MinimumPacketSize returns the minimum valid ping packet size. -func (p *protocol) MinimumPacketSize() int { - switch p.number { - case ProtocolNumber4: - return header.ICMPv4EchoMinimumSize - case ProtocolNumber6: - return header.ICMPv6EchoMinimumSize - } - panic(fmt.Sprint("unknown protocol number: ", p.number)) -} - -// ParsePorts returns the source and destination ports stored in the given ping -// packet. -func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { - switch p.number { - case ProtocolNumber4: - return 0, binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize:]), nil - case ProtocolNumber6: - return 0, binary.BigEndian.Uint16(v[header.ICMPv6MinimumSize:]), nil - } - panic(fmt.Sprint("unknown protocol number: ", p.number)) -} - -// HandleUnknownDestinationPacket handles packets targeted at this protocol but -// that don't match any existing endpoint. -func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.VectorisedView) bool { - return true -} - -// SetOption implements TransportProtocol.SetOption. -func (p *protocol) SetOption(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// Option implements TransportProtocol.Option. -func (p *protocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -func init() { - stack.RegisterTransportProtocolFactory(ProtocolName4, func() stack.TransportProtocol { - return &protocol{ProtocolNumber4} - }) - - stack.RegisterTransportProtocolFactory(ProtocolName6, func() stack.TransportProtocol { - return &protocol{ProtocolNumber6} - }) -} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 540e99151..4984231d7 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -75,7 +75,7 @@ go_library( "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", - "//pkg/tcpip/transport/ping", + "//pkg/tcpip/transport/icmp", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 41f456af7..42fe6f312 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -51,7 +51,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp" "gvisor.googlesource.com/gvisor/runsc/boot/filter" @@ -766,7 +766,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { case NetworkNone, NetworkSandbox: // NetworkNone sets up loopback using netstack. 
netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} - protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} + protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4} s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{ Clock: clock, Stats: epsocket.Metrics, -- cgit v1.2.3 From 52a2abfca43cffdb9cafb91a4266dacf51525470 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 25 Feb 2019 19:20:52 -0800 Subject: Fix cgroup when path is relative This can happen when 'docker run --cgroup-parent=' flag is set. PiperOrigin-RevId: 235645559 Change-Id: Ieea3ae66939abadab621053551bf7d62d412e7ee --- runsc/cgroup/cgroup.go | 86 ++++++++++++++++++++++++++++------------- runsc/container/container.go | 5 ++- runsc/test/root/BUILD | 1 + runsc/test/root/cgroup_test.go | 77 +++++++++++++++++++++++++++++++++--- runsc/test/testutil/docker.go | 8 +--- runsc/test/testutil/testutil.go | 5 +++ 6 files changed, 141 insertions(+), 41 deletions(-) (limited to 'runsc') diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 87f051e79..2b338b6c6 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -161,20 +161,59 @@ func countCpuset(cpuset string) (int, error) { return count, nil } +// LoadPaths loads cgroup paths for given 'pid', may be set to 'self'. +func LoadPaths(pid string) (map[string]string, error) { + f, err := os.Open(filepath.Join("/proc", pid, "cgroup")) + if err != nil { + return nil, err + } + defer f.Close() + + paths := make(map[string]string) + scanner := bufio.NewScanner(f) + for scanner.Scan() { + // Format: ID:controller1,controller2:path + // Example: 2:cpu,cpuacct:/user.slice + tokens := strings.Split(scanner.Text(), ":") + if len(tokens) != 3 { + return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text()) + } + for _, ctrlr := range strings.Split(tokens[1], ",") { + paths[ctrlr] = tokens[2] + } + } + if err := scanner.Err(); err != nil { + return nil, err + } + return paths, nil +} + // Cgroup represents a group inside all controllers. For example: Name='/foo/bar' // maps to /sys/fs/cgroup//foo/bar on all controllers. type Cgroup struct { - Name string `json:"name"` - Own bool `json:"own"` + Name string `json:"name"` + Parents map[string]string `json:"parents"` + Own bool `json:"own"` } // New creates a new Cgroup instance if the spec includes a cgroup path. // Returns nil otherwise. -func New(spec *specs.Spec) *Cgroup { +func New(spec *specs.Spec) (*Cgroup, error) { if spec.Linux == nil || spec.Linux.CgroupsPath == "" { - return nil + return nil, nil + } + var parents map[string]string + if !filepath.IsAbs(spec.Linux.CgroupsPath) { + var err error + parents, err = LoadPaths("self") + if err != nil { + return nil, fmt.Errorf("finding current cgroups: %v", err) + } } - return &Cgroup{Name: spec.Linux.CgroupsPath} + return &Cgroup{ + Name: spec.Linux.CgroupsPath, + Parents: parents, + }, nil } // Install creates and configures cgroups according to 'res'. If cgroup path @@ -188,9 +227,11 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { return nil } - // Mark that cgroup resources are owned by me. log.Debugf("Creating cgroup %q", c.Name) + + // Mark that cgroup resources are owned by me. c.Own = true + // The Cleanup object cleans up partially created cgroups when an error occurs. // Errors occuring during cleanup itself are ignored. 
clean := specutils.MakeCleanup(func() { _ = c.Uninstall() }) @@ -247,32 +288,19 @@ func (c *Cgroup) Uninstall() error { func (c *Cgroup) Join() (func(), error) { // First save the current state so it can be restored. undo := func() {} - f, err := os.Open("/proc/self/cgroup") + paths, err := LoadPaths("self") if err != nil { return undo, err } - defer f.Close() - var undoPaths []string - scanner := bufio.NewScanner(f) - for scanner.Scan() { - // Format: ID:controller1,controller2:path - // Example: 2:cpu,cpuacct:/user.slice - tokens := strings.Split(scanner.Text(), ":") - if len(tokens) != 3 { - return undo, fmt.Errorf("formatting cgroups file, line: %q", scanner.Text()) - } - for _, ctrlr := range strings.Split(tokens[1], ",") { - // Skip controllers we don't handle. - if _, ok := controllers[ctrlr]; ok { - undoPaths = append(undoPaths, filepath.Join(cgroupRoot, ctrlr, tokens[2])) - break - } + for ctrlr, path := range paths { + // Skip controllers we don't handle. + if _, ok := controllers[ctrlr]; ok { + fullPath := filepath.Join(cgroupRoot, ctrlr, path) + undoPaths = append(undoPaths, fullPath) + break } } - if err := scanner.Err(); err != nil { - return undo, err - } // Replace empty undo with the real thing before changes are made to cgroups. undo = func() { @@ -316,7 +344,11 @@ func (c *Cgroup) MemoryLimit() (uint64, error) { } func (c *Cgroup) makePath(controllerName string) string { - return filepath.Join(cgroupRoot, controllerName, c.Name) + path := c.Name + if parent, ok := c.Parents[controllerName]; ok { + path = filepath.Join(parent, c.Name) + } + return filepath.Join(cgroupRoot, controllerName, path) } type controller interface { diff --git a/runsc/container/container.go b/runsc/container/container.go index 08a3725f5..6f092a5ce 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -295,7 +295,10 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all tneir children processes). - cg := cgroup.New(spec) + cg, err := cgroup.New(spec) + if err != nil { + return nil, err + } if cg != nil { // If there is cgroup config, install it before creating sandbox process. 
if err := cg.Install(spec.Linux.Resources); err != nil { diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD index 75826a521..7ded78baa 100644 --- a/runsc/test/root/BUILD +++ b/runsc/test/root/BUILD @@ -24,6 +24,7 @@ go_test( "local", ], deps = [ + "//runsc/cgroup", "//runsc/specutils", "//runsc/test/root/testdata", "//runsc/test/testutil", diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go index 0eabf9561..91839048c 100644 --- a/runsc/test/root/cgroup_test.go +++ b/runsc/test/root/cgroup_test.go @@ -15,16 +15,45 @@ package root import ( + "bufio" + "fmt" "io/ioutil" "os" + "os/exec" "path/filepath" "strconv" "strings" "testing" + "gvisor.googlesource.com/gvisor/runsc/cgroup" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) +func verifyPid(pid int, path string) error { + f, err := os.Open(path) + if err != nil { + return err + } + defer f.Close() + + var gots []int + scanner := bufio.NewScanner(f) + for scanner.Scan() { + got, err := strconv.Atoi(scanner.Text()) + if err != nil { + return err + } + if got == pid { + return nil + } + gots = append(gots, got) + } + if scanner.Err() != nil { + return scanner.Err() + } + return fmt.Errorf("got: %s, want: %d", gots, pid) +} + // TestCgroup sets cgroup options and checks that cgroup was properly configured. func TestCgroup(t *testing.T) { if err := testutil.Pull("alpine"); err != nil { @@ -161,12 +190,48 @@ func TestCgroup(t *testing.T) { } for _, ctrl := range controllers { path := filepath.Join("/sys/fs/cgroup", ctrl, "docker", gid, "cgroup.procs") - out, err := ioutil.ReadFile(path) - if err != nil { - t.Fatalf("failed to read %q: %v", path, err) - } - if got := string(out); !strings.Contains(got, strconv.Itoa(pid)) { - t.Errorf("cgroup control %s processes, got: %q, want: %q", ctrl, got, pid) + if err := verifyPid(pid, path); err != nil { + t.Errorf("cgroup control %q processes: %v", ctrl, err) } } } + +func TestCgroupParent(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + + parent := testutil.RandomName("runsc") + if err := d.Run("--cgroup-parent", parent, "alpine", "sleep", "10000"); err != nil { + t.Fatal("docker create failed:", err) + } + defer d.CleanUp() + gid, err := d.ID() + if err != nil { + t.Fatalf("Docker.ID() failed: %v", err) + } + t.Logf("cgroup ID: %s", gid) + + // Check that sandbox is inside cgroup. + pid, err := d.SandboxPid() + if err != nil { + t.Fatalf("SandboxPid: %v", err) + } + + // Finds cgroup for the sandbox's parent process to check that cgroup is + // created in the right location relative to the parent. 
+ cmd := fmt.Sprintf("grep PPid: /proc/%d/status | sed 's/PPid:\\s//'", pid) + ppid, err := exec.Command("bash", "-c", cmd).CombinedOutput() + if err != nil { + t.Fatalf("Executing %q: %v", cmd, err) + } + cgroups, err := cgroup.LoadPaths(strings.TrimSpace(string(ppid))) + if err != nil { + t.Fatalf("cgroup.LoadPath(%s): %v", ppid, err) + } + path := filepath.Join("/sys/fs/cgroup/memory", cgroups["memory"], parent, gid, "cgroup.procs") + if err := verifyPid(pid, path); err != nil { + t.Errorf("cgroup control %q processes: %v", "memory", err) + } +} diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 5a92a5835..bce609061 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -18,7 +18,6 @@ import ( "fmt" "io/ioutil" "log" - "math/rand" "os" "os/exec" "path" @@ -31,10 +30,6 @@ import ( "github.com/kr/pty" ) -func init() { - rand.Seed(time.Now().UnixNano()) -} - func getRuntime() string { r := os.Getenv("RUNSC_RUNTIME") if r == "" { @@ -162,8 +157,7 @@ type Docker struct { // MakeDocker sets up the struct for a Docker container. // Names of containers will be unique. func MakeDocker(namePrefix string) Docker { - suffix := fmt.Sprintf("-%06d", rand.Int())[:7] - return Docker{Name: namePrefix + suffix, Runtime: getRuntime()} + return Docker{Name: RandomName(namePrefix), Runtime: getRuntime()} } // Create calls 'docker create' with the arguments provided. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index a84530287..79f0a8b6b 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -461,3 +461,8 @@ func WriteTmpFile(pattern, text string) (string, error) { } return file.Name(), nil } + +// RandomName create a name with a 6 digit random number appended to it. +func RandomName(prefix string) string { + return fmt.Sprintf("%s-%06d", prefix, rand.Int31n(1000000)) +} -- cgit v1.2.3 From 6df212b831dcc3350b7677423ec7835ed40b3f22 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 27 Feb 2019 10:05:46 -0800 Subject: Don't log twice to debug log when --log isn't set PiperOrigin-RevId: 235940853 Change-Id: I9c5b4cf18b199fb74044a5edb131bfff59dec945 --- runsc/main.go | 8 +++++++- test/syscalls/syscall_test_runner.go | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/main.go b/runsc/main.go index 472839bf0..4f89312b3 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -192,7 +192,13 @@ func main() { cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) } - e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} + if logFile == os.Stderr { + // Suppress logging to stderr when debug log is enabled. Otherwise all + // messages will be duplicated in the debug log (see Dup2() call above). + e = newEmitter(*debugLogFormat, f) + } else { + e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)} + } } else if *debugLog != "" { f, err := specutils.DebugLogFile(*debugLog, subcommand) if err != nil { diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go index 9fe801b26..c4af28103 100644 --- a/test/syscalls/syscall_test_runner.go +++ b/test/syscalls/syscall_test_runner.go @@ -202,6 +202,11 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { debugLogDir += "/" log.Infof("runsc logs: %s", debugLogDir) args = append(args, "-debug-log", debugLogDir) + + // Default -log sends messages to stderr which makes reading the test log + // difficult. 
Instead, drop them when debug log is enabled given it's a + // better place for these messages. + args = append(args, "-log=/dev/null") } // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run -- cgit v1.2.3 From 3dbd4a16f8ae4da967f69fd93870462d1b3554f5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 1 Mar 2019 10:55:22 -0800 Subject: Add semctl(GETPID) syscall Also added unimplemented notification for semctl(2) commands. PiperOrigin-RevId: 236340672 Change-Id: I0795e3bd2e6d41d7936fabb731884df426a42478 --- pkg/abi/linux/sem.go | 5 ++-- pkg/sentry/kernel/semaphore/semaphore.go | 34 +++++++++++++++++---- pkg/sentry/kernel/semaphore/semaphore_test.go | 6 ++-- pkg/sentry/syscalls/linux/sys_sem.go | 43 +++++++++++++++++++++++++-- runsc/boot/compat.go | 4 +++ test/syscalls/linux/semaphore.cc | 29 ++++++++++++++++++ 6 files changed, 107 insertions(+), 14 deletions(-) (limited to 'runsc') diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index d1a0bdb32..b80c93daf 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -27,8 +27,9 @@ const ( // ipcs ctl cmds. Source: include/uapi/linux/sem.h const ( - SEM_STAT = 18 - SEM_INFO = 19 + SEM_STAT = 18 + SEM_INFO = 19 + SEM_STAT_ANY = 20 ) const SEM_UNDO = 0x1000 diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index c134931cd..29a2eb804 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -92,6 +92,7 @@ type Set struct { type sem struct { value int16 waiters waiterList `state:"zerovalue"` + pid int32 } // waiter represents a caller that is waiting for the semaphore value to @@ -283,7 +284,7 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File } // SetVal overrides a semaphore value, waking up waiters as needed. -func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials) error { +func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { return syserror.ERANGE } @@ -303,15 +304,17 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred // TODO: Clear undo entries in all processes sem.value = val + sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) sem.wakeWaiters() return nil } -// SetValAll overrides all semaphores values, waking up waiters as needed. +// SetValAll overrides all semaphores values, waking up waiters as needed. It also +// sets semaphore's PID which was fixed in Linux 4.6. // // 'len(vals)' must be equal to 's.Size()'. -func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials) error { +func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error { if len(vals) != s.Size() { panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size())) } @@ -335,6 +338,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti // TODO: Clear undo entries in all processes sem.value = int16(val) + sem.pid = pid sem.wakeWaiters() } s.changeTime = ktime.NowFromContext(ctx) @@ -375,12 +379,29 @@ func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { return vals, nil } +// GetPID returns the PID set when performing operations in the semaphore. 
+func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.pid, nil +} + // ExecuteOps attempts to execute a list of operations to the set. It only // succeeds when all operations can be applied. No changes are made if it fails. // // On failure, it may return an error (retries are hopeless) or it may return // a channel that can be waited on before attempting again. -func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials) (chan struct{}, int32, error) { +func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) { s.mu.Lock() defer s.mu.Unlock() @@ -404,14 +425,14 @@ func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Cr return nil, 0, syserror.EACCES } - ch, num, err := s.executeOps(ctx, ops) + ch, num, err := s.executeOps(ctx, ops, pid) if err != nil { return nil, 0, err } return ch, num, nil } -func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{}, int32, error) { +func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) { // Changes to semaphores go to this slice temporarily until they all succeed. tmpVals := make([]int16, len(s.sems)) for i := range s.sems { @@ -464,6 +485,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{} for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() + s.sems[i].pid = pid } s.opTime = ktime.NowFromContext(ctx) return nil, 0, nil diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 5f886bf31..2e51e6ee5 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -25,7 +25,7 @@ import ( ) func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { - ch, _, err := set.executeOps(ctx, ops) + ch, _, err := set.executeOps(ctx, ops, 123) if err != nil { t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops) } @@ -123,13 +123,13 @@ func TestNoWait(t *testing.T) { ops[0].SemOp = -2 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) } ops[0].SemOp = 0 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) } } diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index 6775725ca..86f850ef1 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -71,8 +71,9 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } creds := auth.CredentialsFromContext(t) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) for { - ch, num, err := set.ExecuteOps(t, ops, creds) + ch, 
num, err := set.ExecuteOps(t, ops, creds, int32(pid)) if ch == nil || err != nil { // We're done (either on success or a failure). return 0, nil, err @@ -123,6 +124,21 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777)) return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms) + case linux.GETPID: + v, err := getPID(t, id, num) + return uintptr(v), nil, err + + case linux.IPC_INFO, + linux.SEM_INFO, + linux.IPC_STAT, + linux.SEM_STAT, + linux.SEM_STAT_ANY, + linux.GETNCNT, + linux.GETZCNT: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough + default: return 0, nil, syserror.EINVAL } @@ -161,7 +177,8 @@ func setVal(t *kernel.Task, id int32, num int32, val int16) error { return syserror.EINVAL } creds := auth.CredentialsFromContext(t) - return set.SetVal(t, num, val, creds) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) + return set.SetVal(t, num, val, creds, int32(pid)) } func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { @@ -175,7 +192,8 @@ func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { return err } creds := auth.CredentialsFromContext(t) - return set.SetValAll(t, vals, creds) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) + return set.SetValAll(t, vals, creds, int32(pid)) } func getVal(t *kernel.Task, id int32, num int32) (int16, error) { @@ -202,3 +220,22 @@ func getValAll(t *kernel.Task, id int32, array usermem.Addr) error { _, err = t.CopyOut(array, vals) return err } + +func getPID(t *kernel.Task, id int32, num int32) (int32, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + gpid, err := set.GetPID(num, creds) + if err != nil { + return 0, err + } + // Convert pid from init namespace to the caller's namespace. + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(gpid)) + if tg == nil { + return 0, nil + } + return int32(tg.ID()), nil +} diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index c2a77ebf5..572b5b472 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -100,6 +100,10 @@ func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { // args: fd, level, name, ... tr = newArgsTracker(1, 2) + case syscall.SYS_SEMCTL: + // args: semid, semnum, cmd, ... 
+ tr = newArgsTracker(2) + default: tr = &onceTracker{} } diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc index da3d2c6fe..1c47b6851 100644 --- a/test/syscalls/linux/semaphore.cc +++ b/test/syscalls/linux/semaphore.cc @@ -431,6 +431,35 @@ TEST(SemaphoreTest, SemCtlValAll) { SyscallFailsWithErrno(EFAULT)); } +TEST(SemaphoreTest, SemCtlGetPid) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 1), SyscallSucceeds()); + EXPECT_THAT(semctl(sem.get(), 0, GETPID), SyscallSucceedsWithValue(getpid())); +} + +TEST(SemaphoreTest, SemCtlGetPidFork) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + const pid_t child_pid = fork(); + if (child_pid == 0) { + ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 1), SyscallSucceeds()); + ASSERT_THAT(semctl(sem.get(), 0, GETPID), + SyscallSucceedsWithValue(getpid())); + + _exit(0); + } + ASSERT_THAT(child_pid, SyscallSucceeds()); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + TEST(SemaphoreTest, SemIpcSet) { // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions. ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false)); -- cgit v1.2.3 From fcba4e8f040ab4b40e04b9315d718d7e5aa44635 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 5 Mar 2019 22:19:23 -0800 Subject: Add uncaught signal message to the user log This help troubleshoot cases where the container is killed and the app logs don't show the reason. PiperOrigin-RevId: 236982883 Change-Id: I361892856a146cea5b04abaa3aedbf805e123724 --- runsc/boot/BUILD | 1 + runsc/boot/compat.go | 25 +++++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 4984231d7..daa197437 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -46,6 +46,7 @@ go_library( "//pkg/sentry/fs/tty", "//pkg/sentry/inet", "//pkg/sentry/kernel", + "//pkg/sentry/kernel:uncaught_signal_go_proto", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 572b5b472..37d0c31fd 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -26,6 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" + ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/strace" spb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" ) @@ -73,12 +74,18 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) { } // Emit implements eventchannel.Emitter. -func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { - // Only interested in UnimplementedSyscall, skip the rest. 
- us, ok := msg.(*spb.UnimplementedSyscall) - if !ok { - return false, nil +func (c *compatEmitter) Emit(msg proto.Message) (bool, error) { + switch m := msg.(type) { + case *spb.UnimplementedSyscall: + c.emitUnimplementedSyscall(m) + case *ucspb.UncaughtSignal: + c.emitUncaughtSignal(m) } + + return false, nil +} + +func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 c.mu.Lock() @@ -113,7 +120,13 @@ func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) tr.onReported(regs) } - return false, nil +} + +func (c *compatEmitter) emitUncaughtSignal(msg *ucspb.UncaughtSignal) { + sig := syscall.Signal(msg.SignalNumber) + c.sink.Infof( + "Uncaught signal: %q (%d), PID: %d, TID: %d, fault addr: %#x", + sig, msg.SignalNumber, msg.Pid, msg.Tid, msg.FaultAddr) } // Close implements eventchannel.Emitter. -- cgit v1.2.3 From 0b76887147820a809beaa497ede8dc4f7b7b120a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 5 Mar 2019 23:39:14 -0800 Subject: Priority-inheritance futex implementation It is Implemented without the priority inheritance part given that gVisor defers scheduling decisions to Go runtime and doesn't have control over it. PiperOrigin-RevId: 236989545 Change-Id: I714c8ca0798743ecf3167b14ffeb5cd834302560 --- pkg/abi/linux/futex.go | 6 + pkg/sentry/kernel/futex/BUILD | 2 + pkg/sentry/kernel/futex/futex.go | 215 +++++++++++++++++++++--- pkg/sentry/kernel/futex/futex_test.go | 4 + pkg/sentry/kernel/task_futex.go | 7 + pkg/sentry/mm/io.go | 45 +++++ pkg/sentry/platform/platform.go | 10 ++ pkg/sentry/platform/safecopy/BUILD | 4 +- pkg/sentry/platform/safecopy/atomic_amd64.s | 28 +++ pkg/sentry/platform/safecopy/atomic_arm64.s | 28 +++ pkg/sentry/platform/safecopy/safecopy.go | 4 + pkg/sentry/platform/safecopy/safecopy_unsafe.go | 20 +++ pkg/sentry/platform/safecopy/sighandler_amd64.s | 9 + pkg/sentry/platform/safecopy/sighandler_arm64.s | 11 ++ pkg/sentry/safemem/block_unsafe.go | 10 ++ pkg/sentry/syscalls/linux/sys_futex.go | 68 +++++++- pkg/sentry/usermem/bytes_io_unsafe.go | 8 + pkg/sentry/usermem/usermem.go | 7 + pkg/syserror/syserror.go | 1 + runsc/boot/compat.go | 4 +- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/futex.cc | 115 ++++++++++++- 22 files changed, 578 insertions(+), 29 deletions(-) (limited to 'runsc') diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index 5dff01fba..afdf4123b 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -54,3 +54,9 @@ const ( // FUTEX_TID_MASK is the TID portion of a PI futex word. const FUTEX_TID_MASK = 0x3fffffff + +// Constants used for priority-inheritance futexes. +const ( + FUTEX_WAITERS = 0x80000000 + FUTEX_OWNER_DIED = 0x40000000 +) diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 91feeb5ed..b6af5b20b 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -37,6 +37,8 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", "//pkg/sentry/memmap", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index b3e628fd4..cd7d51621 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -95,12 +95,15 @@ func (k *Key) matches(k2 *Key) bool { // Target abstracts memory accesses and keys. 
type Target interface { - // SwapUint32 gives access to usermem.SwapUint32. + // SwapUint32 gives access to usermem.IO.SwapUint32. SwapUint32(addr usermem.Addr, new uint32) (uint32, error) - // CompareAndSwap gives access to usermem.CompareAndSwapUint32. + // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32. CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + // LoadUint32 gives access to usermem.IO.LoadUint32. + LoadUint32(addr usermem.Addr) (uint32, error) + // GetSharedKey returns a Key with kind KindSharedPrivate or // KindSharedMappable corresponding to the memory mapped at address addr. // @@ -112,11 +115,11 @@ type Target interface { // check performs a basic equality check on the given address. func check(t Target, addr usermem.Addr, val uint32) error { - prev, err := t.CompareAndSwapUint32(addr, val, val) + cur, err := t.LoadUint32(addr) if err != nil { return err } - if prev != val { + if cur != val { return syserror.EAGAIN } return nil @@ -140,11 +143,14 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { ) if opType == linux.FUTEX_OP_SET { oldVal, err = t.SwapUint32(addr, opArg) + if err != nil { + return false, err + } } else { for { - oldVal, err = t.CompareAndSwapUint32(addr, 0, 0) + oldVal, err = t.LoadUint32(addr) if err != nil { - break + return false, err } var newVal uint32 switch opType { @@ -161,7 +167,7 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { } prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) if err != nil { - break + return false, err } if prev == oldVal { break // Success. @@ -222,6 +228,9 @@ type Waiter struct { // The bitmask we're waiting on. // This is used the case of a FUTEX_WAKE_BITSET. bitmask uint32 + + // tid is the thread ID for the waiter in case this is a PI mutex. + tid uint32 } // NewWaiter returns a new unqueued Waiter. @@ -262,23 +271,28 @@ func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { // Remove from the bucket and wake the waiter. woke := w w = w.Next() // Next iteration. - b.waiters.Remove(woke) - woke.C <- struct{}{} - - // NOTE: The above channel write establishes a write barrier according - // to the memory model, so nothing may be ordered around it. Since - // we've dequeued woke and will never touch it again, we can safely - // store nil to woke.bucket here and allow the WaitComplete() to - // short-circuit grabbing the bucket lock. If they somehow miss the - // store, we are still holding the lock, so we can know that they won't - // dequeue woke, assume it's free and have the below operation - // afterwards. - woke.bucket.Store(nil) + b.wakeWaiterLocked(woke) done++ } return done } +func (b *bucket) wakeWaiterLocked(w *Waiter) { + // Remove from the bucket and wake the waiter. + b.waiters.Remove(w) + w.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier according + // to the memory model, so nothing may be ordered around it. Since + // we've dequeued w and will never touch it again, we can safely + // store nil to w.bucket here and allow the WaitComplete() to + // short-circuit grabbing the bucket lock. If they somehow miss the + // store, we are still holding the lock, so we can know that they won't + // dequeue w, assume it's free and have the below operation + // afterwards. + w.bucket.Store(nil) +} + // requeueLocked takes n waiters from the bucket and moves them to naddr on the // bucket "to". 
// @@ -596,7 +610,7 @@ func (m *Manager) WaitComplete(w *Waiter) { continue } - // Remove w from b. + // Remove waiter from bucket. b.waiters.Remove(w) w.bucket.Store(nil) b.mu.Unlock() @@ -606,3 +620,164 @@ func (m *Manager) WaitComplete(w *Waiter) { // Release references held by the waiter. w.key.release() } + +// LockPI attempts to lock the futex following the Priority-inheritance futex +// rules. The lock is acquired only when 'addr' points to 0. The TID of the +// calling task is set to 'addr' to indicate the futex is owned. It returns true +// if the futex was successfully acquired. +// +// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see +// exit_robust_list()). Given we don't support robust lists, although handled +// below, it's never set. +func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) { + k, err := getKey(t, addr, private) + if err != nil { + return false, err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. + select { + case <-w.C: + default: + } + w.key = k + w.tid = tid + + b := m.lockBucket(&k) + // Hot function: avoid defers. + + success, err := m.lockPILocked(w, t, addr, tid, b, try) + if err != nil { + w.key.release() + b.mu.Unlock() + return false, err + } + if success || try { + // Release waiter if it's not going to be a wait. + w.key.release() + } + b.mu.Unlock() + return success, nil +} + +func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) { + for { + cur, err := t.LoadUint32(addr) + if err != nil { + return false, err + } + if (cur & linux.FUTEX_TID_MASK) == tid { + return false, syserror.EDEADLK + } + + if (cur & linux.FUTEX_TID_MASK) == 0 { + // No owner and no waiters, try to acquire the futex. + + // Set TID and preserve owner died status. + val := tid + val |= cur & linux.FUTEX_OWNER_DIED + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + // Linux reacquires the bucket lock on retries, which will re-lookup the + // mapping at the futex address. However, retrying while holding the + // lock is more efficient and reduces the chance of another conflict. + continue + } + // Futex acquired. + return true, nil + } + + // Futex is already owned, prepare to wait. + + if try { + // Caller doesn't want to wait. + return false, nil + } + + // Set waiters bit if not set yet. + if cur&linux.FUTEX_WAITERS == 0 { + prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + continue + } + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + return false, nil + } +} + +// UnlockPI unlock the futex following the Priority-inheritance futex +// rules. The address provided must contain the caller's TID. If there are +// waiters, TID of the next waiter (FIFO) is set to the given address, and the +// waiter woken up. If there are no waiters, 0 is set to the address. 
+func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + b := m.lockBucket(&k) + + err = m.unlockPILocked(t, addr, tid, b) + + k.release() + b.mu.Unlock() + return err +} + +func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + + if (cur & linux.FUTEX_TID_MASK) != tid { + return syserror.EPERM + } + + if b.waiters.Empty() { + // It's safe to set 0 because there are no waiters, no new owner, and the + // executing task is the current owner (no owner died bit). + prev, err := t.CompareAndSwapUint32(addr, cur, 0) + if err != nil { + return err + } + if prev != cur { + // Let user mode handle CAS races. This is different than lock, which + // retries when CAS fails. + return syserror.EAGAIN + } + return nil + } + + next := b.waiters.Front() + + // Set next owner's TID, waiters if there are any. Resets owner died bit, if + // set, because the executing task takes over as the owner. + val := next.tid + if next.Next() != nil { + val |= linux.FUTEX_WAITERS + } + + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return err + } + if prev != cur { + return syserror.EINVAL + } + + b.wakeWaiterLocked(next) + return nil +} diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index a7ab9f229..9d44ee8e5 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -49,6 +49,10 @@ func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil } +func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil +} + func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { return Key{ Kind: KindSharedMappable, diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 921f7bdbc..351cf47d7 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -41,6 +41,13 @@ func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, }) } +// LoadUint32 implemets futex.Target.LoadUint32. +func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { + return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + // GetSharedKey implements futex.Target.GetSharedKey. func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 6600ddd78..e0cebef84 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -346,6 +346,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new if err != nil { return 0, translateIOError(ctx, err) } + // Return the number of bytes read. return 4, nil }) return old, err @@ -388,11 +389,55 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem. if err != nil { return 0, translateIOError(ctx, err) } + // Return the number of bytes read. return 4, nil }) return prev, err } +// LoadUint32 implements usermem.IO.LoadUint32. 
+func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.CheckIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + val, err := mm.as.LoadUint32(addr) + if err == nil { + return val, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var val uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + val, err = safemem.LoadUint32(im) + if err != nil { + return 0, translateIOError(ctx, err) + } + // Return the number of bytes read. + return 4, nil + }) + return val, err +} + // handleASIOFault handles a page fault at address addr for an AddressSpaceIO // operation spanning ioar. // diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index f16588e6e..a9e76bd45 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -254,6 +254,11 @@ type AddressSpaceIO interface { // // Preconditions: addr must be aligned to a 4-byte boundary. CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + + // LoadUint32 atomically loads the uint32 value at addr and returns it. + // + // Preconditions: addr must be aligned to a 4-byte boundary. + LoadUint32(addr usermem.Addr) (uint32, error) } // NoAddressSpaceIO implements AddressSpaceIO methods by panicing. @@ -284,6 +289,11 @@ func (NoAddressSpaceIO) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) panic("This platform does not support AddressSpaceIO") } +// LoadUint32 implements AddressSpaceIO.LoadUint32. +func (NoAddressSpaceIO) LoadUint32(addr usermem.Addr) (uint32, error) { + panic("This platform does not support AddressSpaceIO") +} + // SegmentationFault is an error returned by AddressSpaceIO methods when IO // fails due to access of an unmapped page, or a mapped page with insufficient // permissions. diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 05a6a61ae..d97a40297 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -18,9 +18,7 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy", visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/syserror", - ], + deps = ["//pkg/syserror"], ) go_test( diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s index 873ffa046..f90b4bfd1 100644 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ b/pkg/sentry/platform/safecopy/atomic_amd64.s @@ -106,3 +106,31 @@ TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 CMPXCHGL DX, 0(DI) MOVL AX, prev+16(FP) RET + +// handleLoadUint32Fault returns the value stored in DI. Control is transferred +// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. 
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVL DI, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVL $0, sig+12(FP) + + MOVQ addr+0(FP), AX + MOVL (AX), BX + MOVL BX, val+8(FP) + RET diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s index 554a5c1e1..d58ed71f7 100644 --- a/pkg/sentry/platform/safecopy/atomic_arm64.s +++ b/pkg/sentry/platform/safecopy/atomic_arm64.s @@ -96,3 +96,31 @@ again: done: MOVW R3, prev+16(FP) RET + +// handleLoadUint32Fault returns the value stored in DI. Control is transferred +// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVW R1, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVW $0, sig+12(FP) + + MOVD addr+0(FP), R0 + LDARW (R0), R1 + MOVW R1, val+8(FP) + RET diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go index c60f73103..69c66a3b7 100644 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ b/pkg/sentry/platform/safecopy/safecopy.go @@ -75,6 +75,8 @@ var ( swapUint64End uintptr compareAndSwapUint32Begin uintptr compareAndSwapUint32End uintptr + loadUint32Begin uintptr + loadUint32End uintptr // savedSigSegVHandler is a pointer to the SIGSEGV handler that was // configured before we replaced it with our own. We still call into it @@ -119,6 +121,8 @@ func initializeAddresses() { swapUint64End = FindEndAddress(swapUint64Begin) compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer() compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) + loadUint32Begin = reflect.ValueOf(loadUint32).Pointer() + loadUint32End = FindEndAddress(loadUint32Begin) } func init() { diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go index e78a6714e..f84527484 100644 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -79,6 +79,14 @@ func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) //go:noescape func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. 
It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) + // CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes // copied and an error if SIGSEGV or SIGBUS is received while reading from src. func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { @@ -260,6 +268,18 @@ func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { return prev, errorFromFaultSignal(ptr, sig) } +// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +func LoadUint32(ptr unsafe.Pointer) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + val, sig := loadUint32(ptr) + return val, errorFromFaultSignal(ptr, sig) +} + func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { switch sig { case 0: diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s index 06614f1b4..db7701a29 100644 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -101,6 +101,15 @@ not_swapuint64: JMP handle_fault not_casuint32: + CMPQ CX, ·loadUint32Begin(SB) + JB not_loaduint32 + CMPQ CX, ·loadUint32End(SB) + JAE not_loaduint32 + + LEAQ handleLoadUint32Fault(SB), CX + JMP handle_fault + +not_loaduint32: original_handler: // Jump to the previous signal handler, which is likely the golang one. XORQ CX, CX diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s index 5e8e193e7..cdfca8207 100644 --- a/pkg/sentry/platform/safecopy/sighandler_arm64.s +++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s @@ -110,6 +110,17 @@ not_swapuint64: B handle_fault not_casuint32: + MOVD ·loadUint32Begin(SB), R8 + CMP R8, R7 + BLO not_loaduint32 + MOVD ·loadUint32End(SB), R8 + CMP R8, R7 + BHS not_loaduint32 + + MOVD $handleLoadUint32Fault(SB), R7 + B handle_fault + +not_loaduint32: original_handler: // Jump to the previous signal handler, which is likely the golang one. MOVD ·savedSigBusHandler(SB), R7 diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go index e91ff66ae..c3a9780d2 100644 --- a/pkg/sentry/safemem/block_unsafe.go +++ b/pkg/sentry/safemem/block_unsafe.go @@ -267,3 +267,13 @@ func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { } return safecopy.CompareAndSwapUint32(b.start, old, new) } + +// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. 
+func LoadUint32(b Block) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.LoadUint32(b.start) +} diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 7a1d396ec..f0c89cba4 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -124,6 +124,46 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add return 0, kernel.ERESTART_RESTARTBLOCK } +func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) + if err != nil { + return err + } + if locked { + // Futex acquired, we're done! + return nil + } + + if forever { + err = t.Block(w.C) + } else { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) + timer.Swap(ktime.Setting{ + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + err = t.BlockWithTimer(w.C, tchan) + timer.Destroy() + } + + t.Futex().WaitComplete(w) + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) + if err != nil { + return err + } + if !locked { + return syserror.EWOULDBLOCK + } + return nil +} + // Futex implements linux syscall futex(2). // It provides a method for a program to wait for a value at a given address to // change, and a method to wake up anyone waiting on a particular address. @@ -144,7 +184,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: // WAIT{_BITSET} wait forever if the timeout isn't passed. - forever := timeout == 0 + forever := (timeout == 0) var timespec linux.Timespec if !forever { @@ -205,8 +245,30 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) return uintptr(n), nil, err - case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: - // We don't support any priority inversion futexes. + case linux.FUTEX_LOCK_PI: + forever := (timeout == 0) + + var timespec linux.Timespec + if !forever { + var err error + timespec, err = copyTimespecIn(t, timeout) + if err != nil { + return 0, nil, err + } + } + err := futexLockPI(t, timespec, forever, addr, private) + return 0, nil, err + + case linux.FUTEX_TRYLOCK_PI: + err := tryLockPI(t, addr, private) + return 0, nil, err + + case linux.FUTEX_UNLOCK_PI: + err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) + return 0, nil, err + + case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS default: diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go index 8bdf3a508..7add8bc82 100644 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ b/pkg/sentry/usermem/bytes_io_unsafe.go @@ -37,3 +37,11 @@ func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new } return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil } + +// LoadUint32 implements IO.LoadUint32. 
+func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) { + if _, err := b.rangeCheck(addr, 4); err != nil { + return 0, err + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil +} diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 75ac4d22d..c3c9c153b 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -103,6 +103,13 @@ type IO interface { // any following locks in the lock order. addr must be aligned to a 4-byte // boundary. CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) + + // LoadUint32 atomically loads the uint32 value at addr and returns it. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) } // IOOpts contains options applicable to all IO methods. diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 4228707f4..5558cccff 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -33,6 +33,7 @@ var ( ECHILD = error(syscall.ECHILD) ECONNREFUSED = error(syscall.ECONNREFUSED) ECONNRESET = error(syscall.ECONNRESET) + EDEADLK = error(syscall.EDEADLK) EEXIST = error(syscall.EEXIST) EFAULT = error(syscall.EFAULT) EFBIG = error(syscall.EFBIG) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 37d0c31fd..b3499bcde 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -99,8 +99,8 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { // args: cmd, ... tr = newArgsTracker(0) - case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL: - // args: fd, cmd, ... + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX: + // args: fd/addr, cmd, ... 
tr = newArgsTracker(1) case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT: diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 4c818238b..2c214925e 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -808,6 +808,7 @@ cc_binary( "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:memory_util", + "//test/util:save_util", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc index 6fa284013..35933b660 100644 --- a/test/syscalls/linux/futex.cc +++ b/test/syscalls/linux/futex.cc @@ -32,6 +32,7 @@ #include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/memory_util.h" +#include "test/util/save_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -118,6 +119,30 @@ int futex_wake_op(bool priv, std::atomic* uaddr1, std::atomic* uaddr2, return syscall(SYS_futex, uaddr1, op, nwake1, nwake2, uaddr2, sub_op); } +int futex_lock_pi(bool priv, std::atomic* uaddr) { + int op = FUTEX_LOCK_PI; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr); +} + +int futex_trylock_pi(bool priv, std::atomic* uaddr) { + int op = FUTEX_TRYLOCK_PI; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr); +} + +int futex_unlock_pi(bool priv, std::atomic* uaddr) { + int op = FUTEX_UNLOCK_PI; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr); +} + // Fixture for futex tests parameterized by whether to use private or shared // futexes. class PrivateAndSharedFutexTest : public ::testing::TestWithParam { @@ -589,7 +614,95 @@ TEST(SharedFutexTest, WakeInterprocessFile_NoRandomSave) { << " status " << status; } -} // namespace +TEST_P(PrivateAndSharedFutexTest, PIBasic) { + std::atomic a = ATOMIC_VAR_INIT(0); + + ASSERT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), gettid()); + EXPECT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EDEADLK)); + ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), 0); + EXPECT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EPERM)); +} + +TEST_P(PrivateAndSharedFutexTest, PIConcurrency_NoRandomSave) { + DisableSave ds; // Too many syscalls. + + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + std::unique_ptr threads[100]; + for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) { + threads[i] = absl::make_unique([is_priv, &a] { + for (size_t j = 0; j < 10; ++j) { + ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds()); + EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid()); + SleepSafe(absl::Milliseconds(5)); + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); + } + }); + } +} + +TEST_P(PrivateAndSharedFutexTest, PIWaiters) { + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), gettid()); + + ScopedThread th([is_priv, &a] { + ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds()); + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); + }); + + // Wait until the thread blocks on the futex, setting the waiters bit. 
+ auto start = absl::Now(); + while (a.load() != (FUTEX_WAITERS | gettid())) { + ASSERT_LT(absl::Now() - start, absl::Seconds(5)); + absl::SleepFor(absl::Milliseconds(100)); + } + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); +} + +TEST_P(PrivateAndSharedFutexTest, PITryLock) { + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + ASSERT_THAT(futex_trylock_pi(IsPrivate(), &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), gettid()); + + EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EDEADLK)); + ScopedThread th([is_priv, &a] { + EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EAGAIN)); + }); + th.Join(); + + ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds()); +} + +TEST_P(PrivateAndSharedFutexTest, PITryLockConcurrency_NoRandomSave) { + DisableSave ds; // Too many syscalls. + + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + std::unique_ptr threads[100]; + for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) { + threads[i] = absl::make_unique([is_priv, &a] { + for (size_t j = 0; j < 10;) { + if (futex_trylock_pi(is_priv, &a) >= 0) { + ++j; + EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid()); + SleepSafe(absl::Milliseconds(5)); + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); + } + } + }); + } +} + +} // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 56a61282953b46c8f8b707d5948a2d3958dced0c Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 8 Mar 2019 15:48:16 -0800 Subject: Implement IP_MULTICAST_LOOP. IP_MULTICAST_LOOP controls whether or not multicast packets sent on the default route are looped back. In order to implement this switch, support for sending and looping back multicast packets on the default route had to be implemented. For now we only support IPv4 multicast. 
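[Editorial note, not part of this patch: a minimal sketch of how an application would exercise the IP_MULTICAST_LOOP switch described above from userspace. It uses only standard Linux constants and calls from Go's syscall package (nothing from this change), and abbreviates error handling.]

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// IP_MULTICAST_LOOP applies to multicast sends on datagram sockets.
	fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0)
	if err != nil {
		panic(err)
	}
	defer syscall.Close(fd)

	// Loopback of sent multicast packets is enabled by default; turn it off
	// so local listeners do not receive copies of this socket's sends.
	if err := syscall.SetsockoptInt(fd, syscall.IPPROTO_IP, syscall.IP_MULTICAST_LOOP, 0); err != nil {
		panic(err)
	}

	// Read the option back; expect 0 (disabled).
	v, err := syscall.GetsockoptInt(fd, syscall.IPPROTO_IP, syscall.IP_MULTICAST_LOOP)
	if err != nil {
		panic(err)
	}
	fmt.Println("IP_MULTICAST_LOOP =", v)
}
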
PiperOrigin-RevId: 237534603 Change-Id: I490ac7ff8e8ebef417c7eb049a919c29d156ac1c --- pkg/sentry/socket/epsocket/epsocket.go | 37 +- pkg/syserr/netstack.go | 2 + pkg/tcpip/network/arp/arp.go | 2 +- pkg/tcpip/network/ip_test.go | 8 +- pkg/tcpip/network/ipv4/ipv4.go | 15 +- pkg/tcpip/network/ipv6/icmp_test.go | 2 +- pkg/tcpip/network/ipv6/ipv6.go | 15 +- pkg/tcpip/stack/nic.go | 18 +- pkg/tcpip/stack/registration.go | 14 +- pkg/tcpip/stack/route.go | 12 +- pkg/tcpip/stack/stack.go | 24 +- pkg/tcpip/stack/stack_test.go | 40 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 5 + pkg/tcpip/transport/icmp/endpoint.go | 4 +- pkg/tcpip/transport/icmp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 1 + pkg/tcpip/transport/udp/BUILD | 1 + pkg/tcpip/transport/udp/endpoint.go | 43 +- pkg/tcpip/transport/udp/endpoint_state.go | 2 +- runsc/boot/network.go | 16 +- test/syscalls/linux/socket_ipv4_udp_unbound.cc | 451 +++++++++++++++++++-- .../socket_ipv4_udp_unbound_external_networking.cc | 332 +++++++++++++++ 24 files changed, 946 insertions(+), 104 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 4e547ea33..f7636e056 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -911,6 +911,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac } return rv.InetMulticastRequest, nil + case linux.IP_MULTICAST_LOOP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.MulticastLoopOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if v { + return int32(1), nil + } + return int32(0), nil + default: emitUnimplementedEventIP(t, name) } @@ -1178,6 +1193,15 @@ func copyInMulticastRequest(optVal []byte) (linux.InetMulticastRequestWithNIC, * return req, nil } +// reduceToByte ORs all of the bytes in the input. +func reduceToByte(buf []byte) byte { + var out byte + for _, b := range buf { + out |= b + } + return out +} + // setSockOptIP implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { @@ -1235,6 +1259,18 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]), })) + case linux.IP_MULTICAST_LOOP: + if len(optVal) < 1 { + return syserr.ErrInvalidArgument + } + if len(optVal) > sizeOfInt32 { + optVal = optVal[:sizeOfInt32] + } + + return syserr.TranslateNetstackError(ep.SetSockOpt( + tcpip.MulticastLoopOption(reduceToByte(optVal) != 0), + )) + case linux.MCAST_JOIN_GROUP: // FIXME: Implement MCAST_JOIN_GROUP. 
t.Kernel().EmitUnimplementedEvent(t) @@ -1252,7 +1288,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s linux.IP_MSFILTER, linux.IP_MTU_DISCOVER, linux.IP_MULTICAST_ALL, - linux.IP_MULTICAST_LOOP, linux.IP_NODEFRAG, linux.IP_OPTIONS, linux.IP_PASSSEC, diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 05ca475d1..c5a628c7d 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -23,6 +23,7 @@ import ( var ( ErrUnknownProtocol = New(tcpip.ErrUnknownProtocol.String(), linux.EINVAL) ErrUnknownNICID = New(tcpip.ErrUnknownNICID.String(), linux.EINVAL) + ErrUnknownDevice = New(tcpip.ErrUnknownDevice.String(), linux.ENODEV) ErrUnknownProtocolOption = New(tcpip.ErrUnknownProtocolOption.String(), linux.ENOPROTOOPT) ErrDuplicateNICID = New(tcpip.ErrDuplicateNICID.String(), linux.EEXIST) ErrDuplicateAddress = New(tcpip.ErrDuplicateAddress.String(), linux.EEXIST) @@ -49,6 +50,7 @@ var ( var netstackErrorTranslations = map[*tcpip.Error]*Error{ tcpip.ErrUnknownProtocol: ErrUnknownProtocol, tcpip.ErrUnknownNICID: ErrUnknownNICID, + tcpip.ErrUnknownDevice: ErrUnknownDevice, tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption, tcpip.ErrDuplicateNICID: ErrDuplicateNICID, tcpip.ErrDuplicateAddress: ErrDuplicateAddress, diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index ed39640c1..5ab542f2c 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -79,7 +79,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { func (e *endpoint) Close() {} -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { +func (e *endpoint) WritePacket(*stack.Route, buffer.Prependable, buffer.VectorisedView, tcpip.TransportProtocolNumber, uint8, stack.PacketLooping) *tcpip.Error { return tcpip.ErrNotSupported } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 97a43aece..7eb0e697d 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -177,7 +177,7 @@ func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { NIC: 1, }}) - return s.FindRoute(1, local, remote, ipv4.ProtocolNumber) + return s.FindRoute(1, local, remote, ipv4.ProtocolNumber, false /* multicastLoop */) } func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { @@ -191,7 +191,7 @@ func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { NIC: 1, }}) - return s.FindRoute(1, local, remote, ipv6.ProtocolNumber) + return s.FindRoute(1, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */) } func TestIPv4Send(t *testing.T) { @@ -221,7 +221,7 @@ func TestIPv4Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123); err != nil { + if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123, stack.PacketOut); err != nil { t.Fatalf("WritePacket failed: %v", err) } } @@ -450,7 +450,7 @@ func TestIPv6Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123); err != nil { + if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123, stack.PacketOut); err != nil { t.Fatalf("WritePacket failed: %v", err) } } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index bfc3c08fa..545684032 100644 --- 
a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -104,7 +104,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { } // WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop stack.PacketLooping) *tcpip.Error { ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) length := uint16(hdr.UsedLength() + payload.Size()) id := uint32(0) @@ -123,8 +123,19 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b DstAddr: r.RemoteAddress, }) ip.SetChecksum(^ip.CalculateChecksum()) - r.Stats().IP.PacketsSent.Increment() + if loop&stack.PacketLoop != 0 { + views := make([]buffer.View, 1, 1+len(payload.Views())) + views[0] = hdr.View() + views = append(views, payload.Views()...) + vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) + e.HandlePacket(r, vv) + } + if loop&stack.PacketOut == 0 { + return nil + } + + r.Stats().IP.PacketsSent.Increment() return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) } diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 797176243..15574bab1 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -161,7 +161,7 @@ func (c *testContext) cleanup() { func TestLinkResolution(t *testing.T) { c := newTestContext(t) defer c.cleanup() - r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber) + r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) if err != nil { t.Fatal(err) } diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 5f68ef7d5..df3b64c98 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -84,7 +84,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { } // WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop stack.PacketLooping) *tcpip.Error { length := uint16(hdr.UsedLength() + payload.Size()) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ @@ -94,8 +94,19 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b SrcAddr: r.LocalAddress, DstAddr: r.RemoteAddress, }) - r.Stats().IP.PacketsSent.Increment() + if loop&stack.PacketLoop != 0 { + views := make([]buffer.View, 1, 1+len(payload.Views())) + views[0] = hdr.View() + views = append(views, payload.Views()...) 
+ vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) + e.HandlePacket(r, vv) + } + if loop&stack.PacketOut == 0 { + return nil + } + + r.Stats().IP.PacketsSent.Increment() return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 79f845225..14267bb48 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -28,10 +28,11 @@ import ( // NIC represents a "network interface card" to which the networking stack is // attached. type NIC struct { - stack *Stack - id tcpip.NICID - name string - linkEP LinkEndpoint + stack *Stack + id tcpip.NICID + name string + linkEP LinkEndpoint + loopback bool demux *transportDemuxer @@ -62,12 +63,13 @@ const ( NeverPrimaryEndpoint ) -func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC { +func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC { return &NIC{ stack: stack, id: id, name: name, linkEP: ep, + loopback: loopback, demux: newTransportDemuxer(stack), primary: make(map[tcpip.NetworkProtocolNumber]*ilist.List), endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint), @@ -407,7 +409,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr n.mu.RLock() for _, ref := range n.endpoints { if ref.protocol == header.IPv4ProtocolNumber && ref.tryIncRef() { - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* multicastLoop */) r.RemoteLinkAddress = remote ref.ep.HandlePacket(&r, vv) ref.decRef() @@ -418,7 +420,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr } if ref := n.getRef(protocol, dst); ref != nil { - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* multicastLoop */) r.RemoteLinkAddress = remote ref.ep.HandlePacket(&r, vv) ref.decRef() @@ -430,7 +432,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr // // TODO: Should we be forwarding the packet even if promiscuous? if n.stack.Forwarding() { - r, err := n.stack.FindRoute(0, "", dst, protocol) + r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */) if err != nil { n.stack.stats.IP.InvalidAddressesReceived.Increment() return diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 62acd5919..cf4d52fe9 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -125,6 +125,18 @@ type TransportDispatcher interface { DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) } +// PacketLooping specifies where an outbound packet should be sent. +type PacketLooping byte + +const ( + // PacketOut indicates that the packet should be passed to the link + // endpoint. + PacketOut PacketLooping = 1 << iota + + // PacketLoop indicates that the packet should be handled locally. + PacketLoop +) + // NetworkEndpoint is the interface that needs to be implemented by endpoints // of network layer protocols (e.g., ipv4, ipv6). type NetworkEndpoint interface { @@ -149,7 +161,7 @@ type NetworkEndpoint interface { // WritePacket writes a packet to the given destination address and // protocol. 
- WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error + WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop PacketLooping) *tcpip.Error // ID returns the network protocol endpoint ID. ID() *NetworkEndpointID diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 2b4185014..c9603ad5e 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -46,17 +46,20 @@ type Route struct { // ref a reference to the network endpoint through which the route // starts. ref *referencedNetworkEndpoint + + multicastLoop bool } // makeRoute initializes a new route. It takes ownership of the provided // reference to a network endpoint. -func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint) Route { +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, multicastLoop bool) Route { return Route{ NetProto: netProto, LocalAddress: localAddr, LocalLinkAddress: localLinkAddr, RemoteAddress: remoteAddr, ref: ref, + multicastLoop: multicastLoop, } } @@ -134,7 +137,12 @@ func (r *Route) IsResolutionRequired() bool { // WritePacket writes the packet through the given route. func (r *Route) WritePacket(hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl) + loop := PacketOut + if r.multicastLoop && (header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress)) { + loop |= PacketLoop + } + + err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl, loop) if err == tcpip.ErrNoRoute { r.Stats().IP.OutgoingPacketErrors.Increment() } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index cfda7ec3c..047b704e0 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -513,7 +513,7 @@ func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network // createNIC creates a NIC with the provided id and link-layer endpoint, and // optionally enable it. -func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enabled bool) *tcpip.Error { +func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enabled, loopback bool) *tcpip.Error { ep := FindLinkEndpoint(linkEP) if ep == nil { return tcpip.ErrBadLinkEndpoint @@ -527,7 +527,7 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpoint return tcpip.ErrDuplicateNICID } - n := newNIC(s, id, name, ep) + n := newNIC(s, id, name, ep, loopback) s.nics[id] = n if enabled { @@ -539,26 +539,32 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpoint // CreateNIC creates a NIC with the provided id and link-layer endpoint. func (s *Stack) CreateNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, "", linkEP, true) + return s.createNIC(id, "", linkEP, true, false) } // CreateNamedNIC creates a NIC with the provided id and link-layer endpoint, // and a human-readable name. 
func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, name, linkEP, true) + return s.createNIC(id, name, linkEP, true, false) +} + +// CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer +// endpoint, and a human-readable name. +func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { + return s.createNIC(id, name, linkEP, true, true) } // CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint, // but leave it disable. Stack.EnableNIC must be called before the link-layer // endpoint starts delivering packets to it. func (s *Stack) CreateDisabledNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, "", linkEP, false) + return s.createNIC(id, "", linkEP, false, false) } // CreateDisabledNamedNIC is a combination of CreateNamedNIC and // CreateDisabledNIC. func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, name, linkEP, false) + return s.createNIC(id, name, linkEP, false, false) } // EnableNIC enables the given NIC so that the link-layer endpoint can start @@ -748,7 +754,7 @@ func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.Netwo // FindRoute creates a route to the given destination address, leaving through // the given nic and local address (if provided). -func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (Route, *tcpip.Error) { +func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() @@ -758,7 +764,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n if id != 0 && !needRoute { if nic, ok := s.nics[id]; ok { if ref := s.getRefEP(nic, localAddr, netProto); ref != nil { - return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref), nil + return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, multicastLoop && !nic.loopback), nil } } } else { @@ -774,7 +780,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n remoteAddr = ref.ep.ID().LocalAddress } - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref) + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, multicastLoop && !nic.loopback) if needRoute { r.NextHop = route.Gateway } diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index aba1e984c..b366de21d 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -112,7 +112,7 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities { return f.linkEP.Capabilities() } -func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, _ uint8) *tcpip.Error { +func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, _ uint8, loop stack.PacketLooping) *tcpip.Error { // Increment the sent packet count in the protocol descriptor. 
f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++ @@ -122,6 +122,18 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable b[0] = r.RemoteAddress[0] b[1] = f.id.LocalAddress[0] b[2] = byte(protocol) + + if loop&stack.PacketLoop != 0 { + views := make([]buffer.View, 1, 1+len(payload.Views())) + views[0] = hdr.View() + views = append(views, payload.Views()...) + vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) + f.HandlePacket(r, vv) + } + if loop&stack.PacketOut == 0 { + return nil + } + return f.linkEP.WritePacket(r, hdr, payload, fakeNetNumber) } @@ -262,7 +274,7 @@ func TestNetworkReceive(t *testing.T) { } func sendTo(t *testing.T, s *stack.Stack, addr tcpip.Address) { - r, err := s.FindRoute(0, "", addr, fakeNetNumber) + r, err := s.FindRoute(0, "", addr, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -354,7 +366,7 @@ func TestNetworkSendMultiRoute(t *testing.T) { } func testRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr, expectedSrcAddr tcpip.Address) { - r, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber) + r, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -371,7 +383,7 @@ func testRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr, } func testNoRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr tcpip.Address) { - _, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber) + _, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err != tcpip.ErrNoRoute { t.Fatalf("FindRoute returned unexpected error, expected tcpip.ErrNoRoute, got %v", err) } @@ -514,7 +526,7 @@ func TestDelayedRemovalDueToRoute(t *testing.T) { } // Get a route, check that packet is still deliverable. - r, err := s.FindRoute(0, "", "\x02", fakeNetNumber) + r, err := s.FindRoute(0, "", "\x02", fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -584,7 +596,7 @@ func TestPromiscuousMode(t *testing.T) { } // Check that we can't get a route as there is no local address. - _, err := s.FindRoute(0, "", "\x02", fakeNetNumber) + _, err := s.FindRoute(0, "", "\x02", fakeNetNumber, false /* multicastLoop */) if err != tcpip.ErrNoRoute { t.Fatalf("FindRoute returned unexpected status: expected %v, got %v", tcpip.ErrNoRoute, err) } @@ -622,7 +634,7 @@ func TestAddressSpoofing(t *testing.T) { // With address spoofing disabled, FindRoute does not permit an address // that was not added to the NIC to be used as the source. - r, err := s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber) + r, err := s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err == nil { t.Errorf("FindRoute succeeded with route %+v when it should have failed", r) } @@ -632,7 +644,7 @@ func TestAddressSpoofing(t *testing.T) { if err := s.SetSpoofing(1, true); err != nil { t.Fatalf("SetSpoofing failed: %v", err) } - r, err = s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber) + r, err = s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -654,14 +666,14 @@ func TestBroadcastNeedsNoRoute(t *testing.T) { s.SetRouteTable([]tcpip.Route{}) // If there is no endpoint, it won't work. 
- if _, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber); err != tcpip.ErrNetworkUnreachable { + if _, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable { t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable) } if err := s.AddAddress(1, fakeNetNumber, header.IPv4Any); err != nil { t.Fatalf("AddAddress(%v, %v) failed: %v", fakeNetNumber, header.IPv4Any, err) } - r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber) + r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute(1, %v, %v, %v) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) } @@ -675,7 +687,7 @@ func TestBroadcastNeedsNoRoute(t *testing.T) { } // If the NIC doesn't exist, it won't work. - if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber); err != tcpip.ErrNetworkUnreachable { + if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable { t.Fatalf("got FindRoute(2, %v, %v, %v) = %v want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable) } } @@ -738,7 +750,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { } // If there is no endpoint, it won't work. - if _, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber); err != want { + if _, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); err != want { t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", anyAddr, tc.address, fakeNetNumber, err, want) } @@ -746,7 +758,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { t.Fatalf("AddAddress(%v, %v) failed: %v", fakeNetNumber, anyAddr, err) } - if r, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber); tc.routeNeeded { + if r, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); tc.routeNeeded { // Route table is empty but we need a route, this should cause an error. if err != tcpip.ErrNoRoute { t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", anyAddr, tc.address, fakeNetNumber, err, tcpip.ErrNoRoute) @@ -763,7 +775,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { } } // If the NIC doesn't exist, it won't work. - if _, err := s.FindRoute(2, anyAddr, tc.address, fakeNetNumber); err != want { + if _, err := s.FindRoute(2, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); err != want { t.Fatalf("got FindRoute(2, %v, %v, %v) = %v want = %v", anyAddr, tc.address, fakeNetNumber, err, want) } }) diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index a9e844e3d..279ab3c56 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -103,7 +103,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { f.peerAddr = addr.Addr // Find the route. 
- r, err := f.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber) + r, err := f.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber, false /* multicastLoop */) if err != nil { return tcpip.ErrNoRoute } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 7010d1b68..825854148 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -68,6 +68,7 @@ func (e *Error) IgnoreStats() bool { var ( ErrUnknownProtocol = &Error{msg: "unknown protocol"} ErrUnknownNICID = &Error{msg: "unknown nic id"} + ErrUnknownDevice = &Error{msg: "unknown device"} ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} ErrDuplicateNICID = &Error{msg: "duplicate nic id"} ErrDuplicateAddress = &Error{msg: "duplicate address"} @@ -477,6 +478,10 @@ type MulticastInterfaceOption struct { InterfaceAddr Address } +// MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether +// multicast packets sent over a non-loopback interface will be looped back. +type MulticastLoopOption bool + // MembershipOption is used by SetSockOpt/GetSockOpt as an argument to // AddMembershipOption and RemoveMembershipOption. type MembershipOption struct { diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 05c4b532a..d876005fe 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -277,7 +277,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c } // Find the enpoint. - r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto) + r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto, false /* multicastLoop */) if err != nil { return 0, nil, err } @@ -471,7 +471,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { } // Find a route to the desired destination. - r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto) + r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto, false /* multicastLoop */) if err != nil { return err } diff --git a/pkg/tcpip/transport/icmp/endpoint_state.go b/pkg/tcpip/transport/icmp/endpoint_state.go index 21008d089..8a7909246 100644 --- a/pkg/tcpip/transport/icmp/endpoint_state.go +++ b/pkg/tcpip/transport/icmp/endpoint_state.go @@ -71,7 +71,7 @@ func (e *endpoint) afterLoad() { var err *tcpip.Error if e.state == stateConnected { - e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto) + e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto, false /* multicastLoop */) if err != nil { panic(*err) } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index ae99f0f8e..fc4f82402 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1091,7 +1091,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er } // Find a route to the desired destination. 
- r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, addr.Addr, netProto) + r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) if err != nil { return err } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 87e988afa..a42e09b8c 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -307,6 +307,7 @@ func loadError(s string) *tcpip.Error { var errors = []*tcpip.Error{ tcpip.ErrUnknownProtocol, tcpip.ErrUnknownNICID, + tcpip.ErrUnknownDevice, tcpip.ErrUnknownProtocolOption, tcpip.ErrDuplicateNICID, tcpip.ErrDuplicateAddress, diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 8ccb79c48..d271490c1 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -27,6 +27,7 @@ go_library( imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ + "//pkg/log", "//pkg/sleep", "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 4108cb09c..3693abae5 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -81,6 +81,7 @@ type endpoint struct { multicastTTL uint8 multicastAddr tcpip.Address multicastNICID tcpip.NICID + multicastLoop bool reusePort bool broadcast bool @@ -124,6 +125,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite // // Linux defaults to TTL=1. multicastTTL: 1, + multicastLoop: true, rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, } @@ -274,7 +276,7 @@ func (e *endpoint) connectRoute(nicid tcpip.NICID, addr tcpip.FullAddress) (stac } // Find a route to the desired destination. - r, err := e.stack.FindRoute(nicid, localAddr, addr.Addr, netProto) + r, err := e.stack.FindRoute(nicid, localAddr, addr.Addr, netProto, e.multicastLoop) if err != nil { return stack.Route{}, 0, 0, err } @@ -458,13 +460,19 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { case tcpip.AddMembershipOption: nicID := v.NIC - if v.InterfaceAddr != header.IPv4Any { + if v.InterfaceAddr == header.IPv4Any { + if nicID == 0 { + r, err := e.stack.FindRoute(0, "", v.MulticastAddr, header.IPv4ProtocolNumber, false /* multicastLoop */) + if err == nil { + nicID = r.NICID() + r.Release() + } + } + } else { nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) } if nicID == 0 { - // TODO: Allow adding memberships without - // specifing an interface. - return tcpip.ErrNoRoute + return tcpip.ErrUnknownDevice } // TODO: check that v.MulticastAddr is a multicast address. @@ -479,11 +487,19 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { case tcpip.RemoveMembershipOption: nicID := v.NIC - if v.InterfaceAddr != header.IPv4Any { + if v.InterfaceAddr == header.IPv4Any { + if nicID == 0 { + r, err := e.stack.FindRoute(0, "", v.MulticastAddr, header.IPv4ProtocolNumber, false /* multicastLoop */) + if err == nil { + nicID = r.NICID() + r.Release() + } + } + } else { nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) } if nicID == 0 { - return tcpip.ErrNoRoute + return tcpip.ErrUnknownDevice } // TODO: check that v.MulticastAddr is a multicast address. 
@@ -503,6 +519,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } } + case tcpip.MulticastLoopOption: + e.mu.Lock() + e.multicastLoop = bool(v) + e.mu.Unlock() + case tcpip.ReusePortOption: e.mu.Lock() e.reusePort = v != 0 @@ -578,6 +599,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case *tcpip.MulticastLoopOption: + e.mu.RLock() + v := e.multicastLoop + e.mu.RUnlock() + + *o = tcpip.MulticastLoopOption(v) + return nil + case *tcpip.ReusePortOption: e.mu.RLock() v := e.reusePort diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 4d8210294..b2daaf751 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -82,7 +82,7 @@ func (e *endpoint) afterLoad() { var err *tcpip.Error if e.state == stateConnected { - e.route, err = e.stack.FindRoute(e.regNICID, e.id.LocalAddress, e.id.RemoteAddress, netProto) + e.route, err = e.stack.FindRoute(e.regNICID, e.id.LocalAddress, e.id.RemoteAddress, netProto, e.multicastLoop) if err != nil { panic(*err) } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0cadf48d6..40bc147ca 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -112,7 +112,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP := loopback.New() log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) - if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil { return err } @@ -144,7 +144,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) - if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { return err } @@ -169,9 +169,15 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. -func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP) error { - if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil { - return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error { + if loopback { + if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(linkEP)); err != nil { + return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + } + } else { + if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil { + return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + } } // Always start with an arp address for the NIC. 
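Before the test changes below, note how the pieces above fit together: FindRoute records the endpoint's multicastLoop preference on the Route, Route.WritePacket turns that into a PacketLooping bitmask, and the IPv4/IPv6 WritePacket implementations re-inject looped packets locally before (optionally) handing them to the link endpoint. A small sketch of the routing-side decision using only the exported pieces introduced above (the packetLooping helper itself is illustrative, not part of the change):

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
)

// packetLooping reproduces the decision Route.WritePacket now makes: packets
// always go out via the link endpoint (PacketOut), and multicast destinations
// are additionally looped back (PacketLoop) when the route was created with
// multicastLoop set.
func packetLooping(remote tcpip.Address, multicastLoop bool) stack.PacketLooping {
	loop := stack.PacketOut
	if multicastLoop &&
		(header.IsV4MulticastAddress(remote) || header.IsV6MulticastAddress(remote)) {
		loop |= stack.PacketLoop
	}
	return loop
}

FindRoute clears the flag on loopback NICs (multicastLoop && !nic.loopback), since packets sent on a loopback link are already delivered locally and looping them again would produce duplicates.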
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index 2d702179e..38bc85ce9 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -61,7 +61,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNoGroup) { receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -99,7 +99,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) { receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -134,12 +134,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) { // Bind the second FD to the v4 any address to ensure that we can receive any // unicast packet. auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -174,7 +174,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { // Bind the first FD to the loopback. This is an alternative to // IP_MULTICAST_IF for setting the default send interface. auto sender_addr = V4Loopback(); - EXPECT_THAT( + ASSERT_THAT( bind(sockets->first_fd(), reinterpret_cast(&sender_addr.addr), sender_addr.addr_len), SyscallSucceeds()); @@ -182,12 +182,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. 
auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -197,7 +197,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { ip_mreq group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -207,7 +207,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -222,7 +222,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // bind and the group membership is configured by NIC ID. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -230,7 +230,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { // Bind the first FD to the loopback. This is an alternative to // IP_MULTICAST_IF for setting the default send interface. auto sender_addr = V4Loopback(); - EXPECT_THAT( + ASSERT_THAT( bind(sockets->first_fd(), reinterpret_cast(&sender_addr.addr), sender_addr.addr_len), SyscallSucceeds()); @@ -238,12 +238,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. 
auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -253,7 +253,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -263,7 +263,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -278,7 +278,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in sendto, and the group // membership is configured by address. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { @@ -287,19 +287,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { // Set the default send interface. ip_mreq iface = {}; iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. 
auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -309,7 +309,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { ip_mreq group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -319,7 +319,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -334,7 +334,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in sendto, and the group // membership is configured by NIC ID. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { @@ -343,19 +343,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { // Set the default send interface. ip_mreqn iface = {}; iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. 
auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -365,7 +365,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -375,7 +375,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -390,7 +390,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in connect, and the group // membership is configured by address. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { @@ -399,19 +399,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { // Set the default send interface. ip_mreq iface = {}; iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. 
auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -421,7 +421,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { ip_mreq group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -429,7 +429,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { auto connect_addr = V4Multicast(); reinterpret_cast(&connect_addr.addr)->sin_port = reinterpret_cast(&receiver_addr.addr)->sin_port; - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(connect)(sockets->first_fd(), reinterpret_cast(&connect_addr.addr), connect_addr.addr_len), @@ -437,7 +437,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), SyscallSucceedsWithValue(sizeof(send_buf))); @@ -450,7 +450,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in connect, and the group // membership is configured by NIC ID. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { @@ -459,19 +459,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { // Set the default send interface. ip_mreqn iface = {}; iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. 
auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -481,7 +481,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -489,7 +489,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { auto connect_addr = V4Multicast(); reinterpret_cast(&connect_addr.addr)->sin_port = reinterpret_cast(&receiver_addr.addr)->sin_port; - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(connect)(sockets->first_fd(), reinterpret_cast(&connect_addr.addr), connect_addr.addr_len), @@ -497,7 +497,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), SyscallSucceedsWithValue(sizeof(send_buf))); @@ -510,6 +510,354 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. 
+ char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in connect, and the group +// membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. 
+ ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto connect_addr = V4Multicast(); + reinterpret_cast(&connect_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + EXPECT_THAT( + RetryEINTR(connect)(sockets->first_fd(), + reinterpret_cast(&connect_addr.addr), + connect_addr.addr_len), + SyscallSucceeds()); + + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in connect, and the group +// membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto connect_addr = V4Multicast(); + reinterpret_cast(&connect_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + ASSERT_THAT( + RetryEINTR(connect)(sockets->first_fd(), + reinterpret_cast(&connect_addr.addr), + connect_addr.addr_len), + SyscallSucceeds()); + + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by address. 
+TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. 
+ ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + // Check that dropping a group membership that does not exist fails. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastInvalidDrop) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -668,5 +1016,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidAddr) { SyscallFailsWithErrno(EADDRNOTAVAIL)); } +TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupNoIf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallFailsWithErrno(ENODEV)); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupInvalidIf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn group = {}; + group.imr_address.s_addr = inet_addr("255.255.255"); + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallFailsWithErrno(ENODEV)); +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc index 7d561b991..8b4fc57b6 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -14,6 +14,7 @@ #include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h" +#include #include #include #include @@ -24,6 +25,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/test_util.h" @@ -227,5 +229,335 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendUnicastOnUnbound) { SyscallSucceedsWithValue(sizeof(kTestMsg))); } +constexpr char kMulticastAddress[] = "224.0.2.1"; + +TestAddress V4Multicast() { + TestAddress t("V4Multicast"); + t.addr.ss_family = AF_INET; + t.addr_len = sizeof(sockaddr_in); + reinterpret_cast(&t.addr)->sin_addr.s_addr = + inet_addr(kMulticastAddress); + return t; +} + +// Check that multicast packets won't be delivered to the sending socket with no +// set interface or group membership. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastSelfNoGroup) { + // FIXME: A group membership is not required for external + // multicast on gVisor. 
+ SKIP_IF(IsRunningOnGvisor()); + + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + auto bind_addr = V4Any(); + ASSERT_THAT(bind(socket->get(), reinterpret_cast(&bind_addr.addr), + bind_addr.addr_len), + SyscallSucceeds()); + socklen_t bind_addr_len = bind_addr.addr_len; + ASSERT_THAT( + getsockname(socket->get(), reinterpret_cast(&bind_addr.addr), + &bind_addr_len), + SyscallSucceeds()); + EXPECT_EQ(bind_addr_len, bind_addr.addr_len); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&bind_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets will be delivered to the sending socket without +// setting an interface. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelf) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + auto bind_addr = V4Any(); + ASSERT_THAT(bind(socket->get(), reinterpret_cast(&bind_addr.addr), + bind_addr.addr_len), + SyscallSucceeds()); + socklen_t bind_addr_len = bind_addr.addr_len; + ASSERT_THAT( + getsockname(socket->get(), reinterpret_cast(&bind_addr.addr), + &bind_addr_len), + SyscallSucceeds()); + EXPECT_EQ(bind_addr_len, bind_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + ASSERT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&bind_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast packets won't be delivered to the sending socket with no +// set interface and IP_MULTICAST_LOOP disabled. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastSelfLoopOff) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + auto bind_addr = V4Any(); + ASSERT_THAT(bind(socket->get(), reinterpret_cast(&bind_addr.addr), + bind_addr.addr_len), + SyscallSucceeds()); + socklen_t bind_addr_len = bind_addr.addr_len; + ASSERT_THAT( + getsockname(socket->get(), reinterpret_cast(&bind_addr.addr), + &bind_addr_len), + SyscallSucceeds()); + EXPECT_EQ(bind_addr_len, bind_addr.addr_len); + + // Disable multicast looping. + EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Register to receive multicast packets. 
+ ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&bind_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT( + RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets won't be delivered to another socket with no +// set interface or group membership. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastNoGroup) { + // FIXME: A group membership is not required for external + // multicast on gVisor. + SKIP_IF(IsRunningOnGvisor()); + + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets will be delivered to another socket without +// setting an interface. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticast) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. 
+ auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast packets won't be delivered to another socket with no +// set interface and IP_MULTICAST_LOOP disabled on the sending socket. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastSenderNoLoop) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Disable multicast looping on the sender. + EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets will be delivered to the sending socket without +// setting an interface and IP_MULTICAST_LOOP disabled on the receiving socket. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastReceiverNoLoop) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Disable multicast looping on the receiver. 
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From bc9b979b9412ad5852872c1a9bee462f73d2455e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 11 Mar 2019 11:46:18 -0700 Subject: Add profiling commands to runsc Example: runsc debug --root= \ --profile-heap=/tmp/heap.prof \ --profile-cpu=/tmp/cpu.prod --profile-delay=30 \ PiperOrigin-RevId: 237848456 Change-Id: Icff3f20c1b157a84d0922599eaea327320dad773 --- pkg/seccomp/seccomp.go | 2 +- pkg/sentry/control/BUILD | 2 + pkg/sentry/control/pprof.go | 124 ++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/config.go | 4 ++ runsc/boot/controller.go | 8 +++ runsc/boot/filter/config.go | 13 +++++ runsc/boot/filter/filter.go | 11 ++-- runsc/boot/loader.go | 7 +-- runsc/cmd/debug.go | 44 ++++++++++++++-- runsc/main.go | 2 + runsc/sandbox/sandbox.go | 55 ++++++++++++++++++++ 11 files changed, 262 insertions(+), 10 deletions(-) create mode 100644 pkg/sentry/control/pprof.go (limited to 'runsc') diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index ba2955752..e113f3574 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -55,7 +55,7 @@ func Install(rules SyscallRules) error { } // Uncomment to get stack trace when there is a violation. - // defaultAction = uint32(linux.SECCOMP_RET_TRAP) + // defaultAction = linux.BPFAction(linux.SECCOMP_RET_TRAP) log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction) diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index f54e01ee8..5052bcc0d 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -6,6 +6,7 @@ go_library( name = "control", srcs = [ "control.go", + "pprof.go", "proc.go", "state.go", ], @@ -15,6 +16,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/fd", "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/fs/host", diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go new file mode 100644 index 000000000..1af092af3 --- /dev/null +++ b/pkg/sentry/control/pprof.go @@ -0,0 +1,124 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "errors" + "runtime" + "runtime/pprof" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +var errNoOutput = errors.New("no output writer provided") + +// ProfileOpts contains options for the StartCPUProfile/Goroutine RPC call. +type ProfileOpts struct { + // File is the filesystem path for the profile. + File string `json:"path"` + + // FilePayload is the destination for the profiling output. + urpc.FilePayload +} + +// Profile includes profile-related RPC stubs. It provides a way to +// control the built-in pprof facility in sentry via sentryctl. +// +// The following options to sentryctl are added: +// +// - collect CPU profile on-demand. +// sentryctl -pid pprof-cpu-start +// sentryctl -pid pprof-cpu-stop +// +// - dump out the stack trace of current go routines. +// sentryctl -pid pprof-goroutine +type Profile struct { + // mu protects the fields below. + mu sync.Mutex + + // cpuFile is the current CPU profile output file. + cpuFile *fd.FD +} + +// StartCPUProfile is an RPC stub which starts recording the CPU profile in a +// file. +func (p *Profile) StartCPUProfile(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + + output, err := fd.NewFromFile(o.FilePayload.Files[0]) + if err != nil { + return err + } + + p.mu.Lock() + defer p.mu.Unlock() + + // Returns an error if profiling is already started. + if err := pprof.StartCPUProfile(output); err != nil { + output.Close() + return err + } + + p.cpuFile = output + return nil +} + +// StopCPUProfile is an RPC stub which stops the CPU profiling and flush out the +// profile data. It takes no argument. +func (p *Profile) StopCPUProfile(_, _ *struct{}) error { + p.mu.Lock() + defer p.mu.Unlock() + + if p.cpuFile == nil { + return errors.New("CPU profiling not started") + } + + pprof.StopCPUProfile() + p.cpuFile.Close() + p.cpuFile = nil + return nil +} + +// HeapProfile generates a heap profile for the sentry. +func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + output := o.FilePayload.Files[0] + defer output.Close() + runtime.GC() // Get up-to-date statistics. + if err := pprof.WriteHeapProfile(output); err != nil { + return err + } + return nil +} + +// Goroutine is an RPC stub which dumps out the stack trace for all running +// goroutines. +func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + output := o.FilePayload.Files[0] + defer output.Close() + if err := pprof.Lookup("goroutine").WriteTo(output, 2); err != nil { + return err + } + return nil +} diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 400203c99..626fcabdd 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -202,6 +202,9 @@ type Config struct { // SIGUSR2(12) to troubleshoot hangs. -1 disables it. PanicSignal int + // ProfileEnable is set to prepare the sandbox to be profiled. 
+ ProfileEnable bool + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be @@ -228,6 +231,7 @@ func (c *Config) ToFlags() []string { "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), "--watchdog-action=" + c.WatchdogAction.String(), "--panic-signal=" + strconv.Itoa(c.PanicSignal), + "--profile=" + strconv.FormatBool(c.ProfileEnable), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 23d476f7f..a864be720 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -95,6 +95,11 @@ const ( // SandboxStacks collects sandbox stacks for debugging. SandboxStacks = "debug.Stacks" + + // Profiling related commands (see pprof.go for more details). + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" ) // ControlSocketAddr generates an abstract unix socket name for the given ID. @@ -135,6 +140,9 @@ func newController(fd int, l *Loader) (*controller, error) { } srv.Register(&debug{}) + if l.conf.ProfileEnable { + srv.Register(&control.Profile{}) + } return &controller{ srv: srv, diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index bde749861..1ba5b7257 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -470,3 +470,16 @@ func controlServerFilters(fd int) seccomp.SyscallRules { }, } } + +// profileFilters returns extra syscalls made by runtime/pprof package. +func profileFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_OPENAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + }, + }, + } +} diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index d69a6a2cc..fb197f9b1 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -29,9 +29,10 @@ import ( // Options are seccomp filter related options. type Options struct { - Platform platform.Platform - HostNetwork bool - ControllerFD int + Platform platform.Platform + HostNetwork bool + ProfileEnable bool + ControllerFD int } // Install installs seccomp filters for based on the given platform. @@ -47,6 +48,10 @@ func Install(opt Options) error { Report("host networking enabled: syscall filters less restrictive!") s.Merge(hostInetFilters()) } + if opt.ProfileEnable { + Report("profile enabled: syscall filters less restrictive!") + s.Merge(profileFilters()) + } switch p := opt.Platform.(type) { case *ptrace.PTrace: diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 42fe6f312..4c7e6abfc 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -445,9 +445,10 @@ func (l *Loader) run() error { filter.Report("syscall filter is DISABLED. 
Running in less secure mode.") } else { opts := filter.Options{ - Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ControllerFD: l.ctrl.srv.FD(), + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), } if err := filter.Install(opts); err != nil { return fmt.Errorf("installing seccomp filters: %v", err) diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index e10326754..3ee9a9b49 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -16,7 +16,9 @@ package cmd import ( "context" + "os" "syscall" + "time" "flag" "github.com/google/subcommands" @@ -27,9 +29,12 @@ import ( // Debug implements subcommands.Command for the "debug" command. type Debug struct { - pid int - stacks bool - signal int + pid int + stacks bool + signal int + profileHeap string + profileCPU string + profileDelay int } // Name implements subcommands.Command. @@ -51,6 +56,9 @@ func (*Debug) Usage() string { func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set") f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") + f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") + f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") + f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") } @@ -114,5 +122,35 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof(" *** Stack dump ***\n%s", stacks) } + if d.profileCPU != "" { + f, err := os.Create(d.profileCPU) + if err != nil { + Fatalf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.StartCPUProfile(f); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) + time.Sleep(time.Duration(d.profileDelay) * time.Second) + + if err := c.Sandbox.StopCPUProfile(); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile written to %q", d.profileCPU) + } + if d.profileHeap != "" { + f, err := os.Create(d.profileHeap) + if err != nil { + Fatalf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.HeapProfile(f); err != nil { + Fatalf(err.Error()) + } + log.Infof("Heap profile written to %q", d.profileHeap) + } return subcommands.ExitSuccess } diff --git a/runsc/main.go b/runsc/main.go index 4f89312b3..82c37ec11 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -63,6 +63,7 @@ var ( overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! 
This skips many security measures that isolate the host from the sandbox.") ) @@ -146,6 +147,7 @@ func main() { StraceLogSize: *straceLogSize, WatchdogAction: wa, PanicSignal: *panicSignal, + ProfileEnable: *profile, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } if len(*straceSyscalls) != 0 { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ce8c21681..2698e3f86 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -825,6 +825,61 @@ func (s *Sandbox) Stacks() (string, error) { return stacks, nil } +// HeapProfile writes a heap profile to the given file. +func (s *Sandbox) HeapProfile(f *os.File) error { + log.Debugf("Heap profile %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.HeapProfile, &opts, nil); err != nil { + return fmt.Errorf("getting sandbox %q heap profile: %v", s.ID, err) + } + return nil +} + +// StartCPUProfile start CPU profile writing to the given file. +func (s *Sandbox) StartCPUProfile(f *os.File) error { + log.Debugf("CPU profile start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.StartCPUProfile, &opts, nil); err != nil { + return fmt.Errorf("starting sandbox %q CPU profile: %v", s.ID, err) + } + return nil +} + +// StopCPUProfile stops a previously started CPU profile. +func (s *Sandbox) StopCPUProfile() error { + log.Debugf("CPU profile stop %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.StopCPUProfile, nil, nil); err != nil { + return fmt.Errorf("stopping sandbox %q CPU profile: %v", s.ID, err) + } + return nil +} + // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { -- cgit v1.2.3 From a16f6e50c5a6465b94f367d62c7a46b34ef15f66 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 12 Mar 2019 14:36:58 -0700 Subject: Make HandleLocal apply to all non-loopback interfaces. HandleLocal is very similar conceptually to MULTICAST_LOOP, so we can unify the implementations. This has the benefit of making HandleLocal apply even when the fdbased link endpoint isn't in use. In addition, move looping logic to route creation so that it doesn't need to be run for each packet. This should improve performance. 
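A minimal standalone Go sketch of that idea — compute the looping decision once when the route is built, so the per-packet send path only reads a stored value. The PacketLooping type and decideLooping function below are simplified stand-ins for illustration only, not the netstack's actual tcpip/stack types:

```go
package main

import (
	"fmt"
	"net"
)

// PacketLooping is a small bitmask telling the send path where a packet
// should go: out the link endpoint, back into the local stack, or both.
type PacketLooping uint8

const (
	PacketOut  PacketLooping = 1 << iota // deliver to the link endpoint
	PacketLoop                           // deliver back to the local stack
)

// decideLooping computes the looping behavior once, at route-creation time,
// so that it does not need to be re-evaluated for every packet sent.
func decideLooping(localAddr, remoteAddr net.IP, handleLocal, multicastLoop bool) PacketLooping {
	loop := PacketOut
	if handleLocal && localAddr != nil && remoteAddr.Equal(localAddr) {
		// Packets addressed to ourselves never leave the stack.
		loop = PacketLoop
	} else if multicastLoop && remoteAddr.IsMulticast() {
		// Multicast is sent out and also looped back to local receivers.
		loop |= PacketLoop
	}
	return loop
}

func main() {
	local := net.ParseIP("10.0.0.1")
	fmt.Println(decideLooping(local, local, true, true))                       // loop only (2)
	fmt.Println(decideLooping(local, net.ParseIP("224.0.2.1"), true, true))    // out and loop (3)
	fmt.Println(decideLooping(local, net.ParseIP("8.8.8.8"), true, true))      // out only (1)
}
```

The route.go hunk below applies the same logic inside makeRoute and stores the result in r.loop, which WritePacket then passes straight to the network endpoint.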
PiperOrigin-RevId: 238099480 Change-Id: I72839f16f25310471453bc9d3fb8544815b25c23 --- pkg/tcpip/link/fdbased/endpoint.go | 15 --------------- pkg/tcpip/stack/nic.go | 4 ++-- pkg/tcpip/stack/route.go | 21 ++++++++++++--------- pkg/tcpip/stack/stack.go | 13 +++++++++++-- runsc/boot/loader.go | 5 +++-- runsc/boot/network.go | 1 - 6 files changed, 28 insertions(+), 31 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index fa980716d..d726551b0 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -100,11 +100,6 @@ type endpoint struct { inboundDispatcher linkDispatcher dispatcher stack.NetworkDispatcher - // handleLocal indicates whether packets destined to itself should be - // handled by the netstack internally (true) or be forwarded to the FD - // endpoint (false). - handleLocal bool - // packetDispatchMode controls the packet dispatcher used by this // endpoint. packetDispatchMode PacketDispatchMode @@ -128,7 +123,6 @@ type Options struct { Address tcpip.LinkAddress SaveRestore bool DisconnectOk bool - HandleLocal bool PacketDispatchMode PacketDispatchMode } @@ -168,7 +162,6 @@ func New(opts *Options) tcpip.LinkEndpointID { closed: opts.ClosedFunc, addr: opts.Address, hdrSize: hdrSize, - handleLocal: opts.HandleLocal, packetDispatchMode: opts.PacketDispatchMode, } @@ -256,14 +249,6 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress { // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { - if e.handleLocal && r.LocalAddress != "" && r.LocalAddress == r.RemoteAddress { - views := make([]buffer.View, 1, 1+len(payload.Views())) - views[0] = hdr.View() - views = append(views, payload.Views()...) - vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) - e.dispatcher.DeliverNetworkPacket(e, r.RemoteLinkAddress, r.LocalLinkAddress, protocol, vv) - return nil - } if e.hdrSize > 0 { // Add ethernet header if needed. eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 14267bb48..defa8102a 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -409,7 +409,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr n.mu.RLock() for _, ref := range n.endpoints { if ref.protocol == header.IPv4ProtocolNumber && ref.tryIncRef() { - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* multicastLoop */) + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* handleLocal */, false /* multicastLoop */) r.RemoteLinkAddress = remote ref.ep.HandlePacket(&r, vv) ref.decRef() @@ -420,7 +420,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr } if ref := n.getRef(protocol, dst); ref != nil { - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* multicastLoop */) + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* handleLocal */, false /* multicastLoop */) r.RemoteLinkAddress = remote ref.ep.HandlePacket(&r, vv) ref.decRef() diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index c9603ad5e..86fb728b2 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -47,19 +47,27 @@ type Route struct { // starts. 
ref *referencedNetworkEndpoint - multicastLoop bool + // loop controls where WritePacket should send packets. + loop PacketLooping } // makeRoute initializes a new route. It takes ownership of the provided // reference to a network endpoint. -func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, multicastLoop bool) Route { +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, handleLocal, multicastLoop bool) Route { + loop := PacketOut + if handleLocal && localAddr != "" && remoteAddr == localAddr { + loop = PacketLoop + } else if multicastLoop && (header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)) { + loop |= PacketLoop + } + return Route{ NetProto: netProto, LocalAddress: localAddr, LocalLinkAddress: localLinkAddr, RemoteAddress: remoteAddr, ref: ref, - multicastLoop: multicastLoop, + loop: loop, } } @@ -137,12 +145,7 @@ func (r *Route) IsResolutionRequired() bool { // WritePacket writes the packet through the given route. func (r *Route) WritePacket(hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - loop := PacketOut - if r.multicastLoop && (header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress)) { - loop |= PacketLoop - } - - err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl, loop) + err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl, r.loop) if err == tcpip.ErrNoRoute { r.Stats().IP.OutgoingPacketErrors.Increment() } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 047b704e0..cbfe5c3c7 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -308,6 +308,9 @@ type Stack struct { // clock is used to generate user-visible times. clock tcpip.Clock + + // handleLocal allows non-loopback interfaces to loop packets. + handleLocal bool } // Options contains optional Stack configuration. @@ -319,6 +322,11 @@ type Options struct { // Stats are optional statistic counters. Stats tcpip.Stats + + // HandleLocal indicates whether packets destined to their source + // should be handled by the stack internally (true) or outside the + // stack (false). + HandleLocal bool } // New allocates a new networking stack with only the requested networking and @@ -343,6 +351,7 @@ func New(network []string, transport []string, opts Options) *Stack { PortManager: ports.NewPortManager(), clock: clock, stats: opts.Stats.FillIn(), + handleLocal: opts.HandleLocal, } // Add specified network protocols. 
@@ -764,7 +773,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n if id != 0 && !needRoute { if nic, ok := s.nics[id]; ok { if ref := s.getRefEP(nic, localAddr, netProto); ref != nil { - return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, multicastLoop && !nic.loopback), nil + return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback), nil } } } else { @@ -780,7 +789,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n remoteAddr = ref.ep.ID().LocalAddress } - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, multicastLoop && !nic.loopback) + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback) if needRoute { r.NextHop = route.Gateway } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 4c7e6abfc..9ebe64dce 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -769,8 +769,9 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4} s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{ - Clock: clock, - Stats: epsocket.Metrics, + Clock: clock, + Stats: epsocket.Metrics, + HandleLocal: true, })} if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { return nil, fmt.Errorf("failed to enable SACK: %v", err) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 40bc147ca..f025a42f1 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -138,7 +138,6 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct FD: newFD, MTU: uint32(link.MTU), EthernetHeader: true, - HandleLocal: true, Address: mac, PacketDispatchMode: fdbased.PacketMMap, }) -- cgit v1.2.3 From 2512cc561778b096459182b531eae4e0797e4ec5 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 13 Mar 2019 19:23:02 -0700 Subject: Allow filesystem.Mount to take an optional interface argument. PiperOrigin-RevId: 238360231 Change-Id: I5eaf8d26f8892f77d71c7fbd6c5225ef471cedf1 --- pkg/sentry/fs/copy_up_test.go | 4 ++-- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 2 +- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/restore.go | 7 +++++-- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- runsc/boot/fs.go | 18 +++++++++--------- runsc/boot/loader_test.go | 28 ++++++++++++++-------------- 16 files changed, 42 insertions(+), 39 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 2b2f4bb8f..98a0b7638 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -98,7 +98,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { // Create a lower tmpfs mount. 
fsys, _ := fs.FindFilesystem("tmpfs") - lower, err := fsys.Mount(contexttest.Context(t), "", fs.MountSourceFlags{}, "") + lower, err := fsys.Mount(contexttest.Context(t), "", fs.MountSourceFlags{}, "", nil) if err != nil { t.Fatalf("failed to mount tmpfs: %v", err) } @@ -147,7 +147,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { } // Create an empty upper tmpfs mount which we will copy up into. - upper, err := fsys.Mount(ctx, "", fs.MountSourceFlags{}, "") + upper, err := fsys.Mount(ctx, "", fs.MountSourceFlags{}, "", nil) if err != nil { t.Fatalf("failed to mount tmpfs: %v", err) } diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index abfe689f0..cf4e7d00f 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -66,7 +66,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns a devtmpfs root that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // devtmpfs backed by ramfs ignores bad options. See fs/ramfs/inode.c:ramfs_parse_options. // -> we should consider parsing the mode and backing devtmpfs by this. diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index ba8be85e4..aa664b973 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -50,7 +50,7 @@ type Filesystem interface { // data options. // // Mount may return arbitrary errors. They do not need syserr translations. - Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) + Mount(ctx context.Context, device string, flags MountSourceFlags, data string, dataObj interface{}) (*Inode, error) // AllowUserMount determines whether mount(2) is allowed to mount a // file system of this type. diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 2dc000c6f..adff0abac 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -120,7 +120,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns an attached 9p client that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // Parse and validate the mount options. o, err := options(data) if err != nil { diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index d9fd7a221..0ad5d63b5 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -57,7 +57,7 @@ func (s *session) afterLoad() { } // Validate the mount flags and options. - opts, err := options(args.Data) + opts, err := options(args.DataString) if err != nil { panic("failed to parse mount options: " + err.Error()) } diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index d2ba38449..800649211 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -82,7 +82,7 @@ func (*Filesystem) Flags() fs.FilesystemFlags { // Mount returns an fs.Inode exposing the host file system. It is intended to be locked // down in PreExec below. 
-func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // Parse generic comma-separated key=value options. options := fs.GenericMountSourceOptions(data) diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index 44db61ecd..c83b29a16 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -321,7 +321,7 @@ func TestRootPath(t *testing.T) { hostFS := &Filesystem{} ctx := contexttest.Context(t) data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name()) - inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data) + inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data, nil) if err != nil { t.Fatalf("Mount failed: %v", err) } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index fb91635bc..4c89673b5 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -122,6 +122,6 @@ func (*overlayFilesystem) AllowUserList() bool { } // Mount implements Filesystem.Mount. -func (ofs *overlayFilesystem) Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) { +func (ofs *overlayFilesystem) Mount(ctx context.Context, device string, flags MountSourceFlags, data string, _ interface{}) (*Inode, error) { panic("overlayFilesystem.Mount should not be called!") } diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 63f737ff4..666a2d054 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -57,7 +57,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns the root of a procfs that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // Parse generic comma-separated key=value options, this file system expects them. diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index da2df7e1d..a6645b41e 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -41,8 +41,11 @@ type MountArgs struct { // Flags corresponds to the flags argument of Mount. Flags MountSourceFlags - // Data corresponds to the data argument of Mount. - Data string + // DataString corresponds to the data argument of Mount. + DataString string + + // DataObj corresponds to the data interface argument of Mount. + DataObj interface{} } // restoreEnv holds the fs package global RestoreEnvironment. diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 301fef038..44ae43754 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -57,7 +57,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns a sysfs root which can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // sysfs ignores data, see fs/sysfs/mount.c:sysfs_mount. 
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index d495430e9..d0c93028f 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -82,7 +82,7 @@ func (*Filesystem) Flags() fs.FilesystemFlags { } // Mount returns a tmpfs root that can be positioned in the vfs. -func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // Parse generic comma-separated key=value options, this file system expects them. diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 0c412eb21..43e0e2a04 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -59,7 +59,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // MountSource returns a devpts root that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // No options are supported. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index bf0df7302..6b8d75d24 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -101,7 +101,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall superFlags.ReadOnly = true } - rootInode, err := rsys.Mount(t, sourcePath, superFlags, data) + rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil) if err != nil { return 0, nil, syserror.EINVAL } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index ada292c9e..25e23c09b 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -181,7 +181,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f log.Infof("Mounting root over 9P, ioFD: %d", fd) p9FS := mustFindFilesystem("9p") opts := p9MountOptions(fd, conf.FileAccess) - rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) + rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating root mount point: %v", err) } @@ -220,7 +220,7 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, } // Create overlay on top of mount dir. 
- upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") + upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil) if err != nil { return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } @@ -309,7 +309,7 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, ro mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ",")) + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) if err != nil { return fmt.Errorf("creating mount with source %q: %v", m.Source, err) } @@ -415,9 +415,9 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f } newMount := fs.MountArgs{ - Dev: mountDevice(m), - Flags: mountFlags(m.Options), - Data: strings.Join(opts, ","), + Dev: mountDevice(m), + Flags: mountFlags(m.Options), + DataString: strings.Join(opts, ","), } renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) log.Infof("Added mount at %q: %+v", fsName, newMount) @@ -441,9 +441,9 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) } rootMount := fs.MountArgs{ - Dev: rootDevice, - Flags: mf, - Data: strings.Join(opts, ","), + Dev: rootDevice, + Flags: mf, + DataString: strings.Join(opts, ","), } renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 4fcc0faea..01578cfc5 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -456,9 +456,9 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "9pfs-/", - Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -510,13 +510,13 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "9pfs-/", - Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, { - Dev: "9pfs-/dev/fd-foo", - Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/dev/fd-foo", + DataString: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -568,16 +568,16 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "9pfs-/", - Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { { - Dev: "none", - Flags: fs.MountSourceFlags{NoAtime: true}, - Data: "uid=1022", + Dev: "none", + Flags: fs.MountSourceFlags{NoAtime: true}, + DataString: "uid=1022", }, { Dev: "none", -- cgit v1.2.3 From 8f4634997bd97810a85a70b71f000378d9db2e55 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 14 Mar 2019 08:11:36 -0700 Subject: Decouple filemem from platform and move it to pgalloc.MemoryFile. 
This is in preparation for improved page cache reclaim, which requires greater integration between the page cache and page allocator. PiperOrigin-RevId: 238444706 Change-Id: Id24141b3678d96c7d7dc24baddd9be555bffafe4 --- pkg/sentry/context/contexttest/BUILD | 2 + pkg/sentry/context/contexttest/contexttest.go | 25 + pkg/sentry/fs/ashmem/BUILD | 1 - pkg/sentry/fs/binder/BUILD | 1 + pkg/sentry/fs/binder/binder.go | 21 +- pkg/sentry/fs/dev/BUILD | 2 +- pkg/sentry/fs/dev/null.go | 4 +- pkg/sentry/fs/fsutil/BUILD | 1 + pkg/sentry/fs/fsutil/README.md | 11 +- pkg/sentry/fs/fsutil/file_range_set.go | 23 +- pkg/sentry/fs/fsutil/inode_cached.go | 42 +- pkg/sentry/fs/proc/meminfo.go | 6 +- pkg/sentry/fs/tmpfs/inode_file.go | 24 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/kernel/BUILD | 3 +- pkg/sentry/kernel/contexttest/BUILD | 1 + pkg/sentry/kernel/contexttest/contexttest.go | 2 + pkg/sentry/kernel/kernel.go | 57 +- pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/shm/BUILD | 1 + pkg/sentry/kernel/shm/shm.go | 19 +- pkg/sentry/kernel/task.go | 5 + pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/timekeeper.go | 5 +- pkg/sentry/kernel/timekeeper_test.go | 8 +- pkg/sentry/kernel/vdso.go | 17 +- pkg/sentry/loader/BUILD | 2 +- pkg/sentry/loader/vdso.go | 21 +- pkg/sentry/memutil/memutil_unsafe.go | 14 +- pkg/sentry/mm/BUILD | 2 + pkg/sentry/mm/README.md | 4 +- pkg/sentry/mm/aio_context.go | 17 +- pkg/sentry/mm/lifecycle.go | 5 +- pkg/sentry/mm/mm.go | 20 +- pkg/sentry/mm/mm_test.go | 4 +- pkg/sentry/mm/pma.go | 20 +- pkg/sentry/mm/save_restore.go | 10 +- pkg/sentry/mm/special_mappable.go | 36 +- pkg/sentry/mm/syscalls.go | 8 +- pkg/sentry/pgalloc/BUILD | 57 ++ pkg/sentry/pgalloc/context.go | 48 ++ pkg/sentry/pgalloc/pgalloc.go | 922 ++++++++++++++++++++++++++ pkg/sentry/pgalloc/pgalloc_test.go | 168 +++++ pkg/sentry/pgalloc/pgalloc_unsafe.go | 40 ++ pkg/sentry/pgalloc/save_restore.go | 205 ++++++ pkg/sentry/platform/filemem/BUILD | 56 -- pkg/sentry/platform/filemem/filemem.go | 879 ------------------------ pkg/sentry/platform/filemem/filemem_state.go | 194 ------ pkg/sentry/platform/filemem/filemem_test.go | 168 ----- pkg/sentry/platform/filemem/filemem_unsafe.go | 40 -- pkg/sentry/platform/kvm/BUILD | 1 - pkg/sentry/platform/kvm/address_space.go | 4 - pkg/sentry/platform/kvm/kvm.go | 17 - pkg/sentry/platform/kvm/kvm_test.go | 1 - pkg/sentry/platform/platform.go | 90 +-- pkg/sentry/platform/ptrace/BUILD | 1 - pkg/sentry/platform/ptrace/ptrace.go | 14 +- pkg/sentry/state/BUILD | 1 - pkg/sentry/state/state.go | 5 +- pkg/sentry/syscalls/linux/sys_sysinfo.go | 6 +- pkg/sentry/usage/memory.go | 3 - runsc/boot/BUILD | 2 + runsc/boot/controller.go | 9 +- runsc/boot/events.go | 2 +- runsc/boot/loader.go | 26 +- 65 files changed, 1743 insertions(+), 1666 deletions(-) create mode 100644 pkg/sentry/pgalloc/BUILD create mode 100644 pkg/sentry/pgalloc/context.go create mode 100644 pkg/sentry/pgalloc/pgalloc.go create mode 100644 pkg/sentry/pgalloc/pgalloc_test.go create mode 100644 pkg/sentry/pgalloc/pgalloc_unsafe.go create mode 100644 pkg/sentry/pgalloc/save_restore.go delete mode 100644 pkg/sentry/platform/filemem/BUILD delete mode 100644 pkg/sentry/platform/filemem/filemem.go delete mode 100644 pkg/sentry/platform/filemem/filemem_state.go delete mode 100644 pkg/sentry/platform/filemem/filemem_test.go delete mode 100644 pkg/sentry/platform/filemem/filemem_unsafe.go (limited to 'runsc') diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 
bed156b70..ce4f1e42c 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -13,6 +13,8 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", + "//pkg/sentry/memutil", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/ptrace", "//pkg/sentry/uniqueid", diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index d5fd9f165..a29087775 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -16,6 +16,7 @@ package contexttest import ( + "os" "sync/atomic" "testing" "time" @@ -24,6 +25,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" @@ -35,6 +38,17 @@ import ( // Note that some filesystems may require a minimal kernel for testing, which // this test context does not provide. For such tests, see kernel/contexttest. func Context(tb testing.TB) context.Context { + const memfileName = "contexttest-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + tb.Fatalf("error creating application memory file: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile) + if err != nil { + memfile.Close() + tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) + } p, err := ptrace.New() if err != nil { tb.Fatal(err) @@ -43,6 +57,7 @@ func Context(tb testing.TB) context.Context { return &TestContext{ Context: context.Background(), l: limits.NewLimitSet(), + mf: mf, platform: p, otherValues: make(map[interface{}]interface{}), } @@ -53,6 +68,7 @@ func Context(tb testing.TB) context.Context { type TestContext struct { context.Context l *limits.LimitSet + mf *pgalloc.MemoryFile platform platform.Platform otherValues map[interface{}]interface{} } @@ -94,6 +110,10 @@ func (t *TestContext) Value(key interface{}) interface{} { switch key { case limits.CtxLimits: return t.l + case pgalloc.CtxMemoryFile: + return t.mf + case pgalloc.CtxMemoryFileProvider: + return t case platform.CtxPlatform: return t.platform case uniqueid.CtxGlobalUniqueID: @@ -112,6 +132,11 @@ func (t *TestContext) Value(key interface{}) interface{} { } } +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { + return t.mf +} + // RootContext returns a Context that may be used in tests that need root // credentials. Uses ptrace as the platform.Platform. 
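The contexttest changes above show both halves of the new contract: the test context owns a pgalloc.MemoryFile built from a memfd, and it serves that file through the pgalloc.CtxMemoryFile and pgalloc.CtxMemoryFileProvider keys. A minimal sketch of the consumer side follows, assuming the helper and method signatures used elsewhere in this patch (MemoryFileProviderFromContext, Allocate, DecRef); allocatePage is a hypothetical name, not part of the change.

import (
    "fmt"

    "gvisor.googlesource.com/gvisor/pkg/sentry/context"
    "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
    "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
    "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
    "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// allocatePage grabs one anonymous page from the MemoryFile attached to ctx.
// The caller owns the returned range and must DecRef it when done.
func allocatePage(ctx context.Context) (platform.FileRange, error) {
    mfp := pgalloc.MemoryFileProviderFromContext(ctx)
    if mfp == nil {
        return platform.FileRange{}, fmt.Errorf("context %T has no pgalloc.MemoryFileProvider", ctx)
    }
    return mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous)
}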
func RootContext(tb testing.TB) context.Context { diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index dcf620dca..ef1c31a3e 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index 8a448175f..3710664d3 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -17,6 +17,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 19cd55e65..16fb4806f 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -74,9 +75,9 @@ func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) * // ioctl. func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, d, flags, &Proc{ - bd: bd, - task: kernel.TaskFromContext(ctx), - platform: platform.FromContext(ctx), + bd: bd, + task: kernel.TaskFromContext(ctx), + mfp: pgalloc.MemoryFileProviderFromContext(ctx), }), nil } @@ -88,14 +89,14 @@ type Proc struct { fsutil.FileNoFsync `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - bd *Device - task *kernel.Task - platform platform.Platform + bd *Device + task *kernel.Task + mfp pgalloc.MemoryFileProvider // mu protects fr. mu sync.Mutex `state:"nosave"` - // mapped is memory allocated from platform.Memory() by AddMapping. + // mapped is memory allocated from mfp.MemoryFile() by AddMapping. mapped platform.FileRange } @@ -104,7 +105,7 @@ func (bp *Proc) Release() { bp.mu.Lock() defer bp.mu.Unlock() if bp.mapped.Length() != 0 { - bp.platform.Memory().DecRef(bp.mapped) + bp.mfp.MemoryFile().DecRef(bp.mapped) } } @@ -204,7 +205,7 @@ func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar userm } // Binder only allocates and maps a single page up-front // (drivers/android/binder.c:binder_mmap() => binder_update_page_range()). 
- fr, err := bp.platform.Memory().Allocate(usermem.PageSize, usage.Anonymous) + fr, err := bp.mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) if err != nil { return err } @@ -241,7 +242,7 @@ func (bp *Proc) Translate(ctx context.Context, required, optional memmap.Mappabl return []memmap.Translation{ { Source: memmap.MappableRange{0, usermem.PageSize}, - File: bp.platform.Memory(), + File: bp.mfp.MemoryFile(), Offset: bp.mapped.Start, }, }, err diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index e5b962c8c..6c4fdaba9 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -27,7 +27,7 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/memmap", "//pkg/sentry/mm", - "//pkg/sentry/platform", + "//pkg/sentry/pgalloc", "//pkg/sentry/safemem", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 73fd09058..83f43c203 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -21,7 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -115,7 +115,7 @@ var _ fs.FileOperations = (*zeroFileOperations)(nil) // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (*zeroFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - m, err := mm.NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) if err != nil { return err } diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index d41fc17cc..01098675d 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -85,6 +85,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/fsutil/README.md b/pkg/sentry/fs/fsutil/README.md index 6e677890c..8be367334 100644 --- a/pkg/sentry/fs/fsutil/README.md +++ b/pkg/sentry/fs/fsutil/README.md @@ -112,11 +112,12 @@ finds the file that was mapped and its `CachingInodeOperations`. It then calls It may choose to allocate more memory (i.e. do "readahead") to minimize subsequent faults. -Memory that is allocated comes from a host tmpfs file (see `filemem.FileMem`). -The host tmpfs file memory is brought up to date with the contents of the mapped -file on its filesystem. The region of the host tmpfs file that reflects the -mapped file is then mapped into the host address space of the application so -that subsequent memory accesses do not repeatedly generate a `SIGSEGV`. +Memory that is allocated comes from a host tmpfs file (see +`pgalloc.MemoryFile`). The host tmpfs file memory is brought up to date with the +contents of the mapped file on its filesystem. The region of the host tmpfs file +that reflects the mapped file is then mapped into the host address space of the +application so that subsequent memory accesses do not repeatedly generate a +`SIGSEGV`. The range that was allocated, including any extra memory allocation to minimize faults, is marked dirty due to the write fault. 
This overcounts dirty memory if diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index dd7ab4b4a..32ebf64ff 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -77,7 +78,7 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileR } // Fill attempts to ensure that all memmap.Mappable offsets in required are -// mapped to a platform.File offset, by allocating from mem with the given +// mapped to a platform.File offset, by allocating from mf with the given // memory usage kind and invoking readAt to store data into memory. (If readAt // returns a successful partial read, Fill will call it repeatedly until all // bytes have been read.) EOF is handled consistently with the requirements of @@ -90,7 +91,7 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileR // // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). // required and optional must be page-aligned. -func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mem platform.Memory, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { +func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { gap := frs.LowerBoundGap(required.Start) for gap.Ok() && gap.Start() < required.End { if gap.Range().Length() == 0 { @@ -100,7 +101,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map gr := gap.Range().Intersect(optional) // Read data into the gap. - fr, err := platform.AllocateAndFill(mem, gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { n, err := readAt(ctx, dsts, gr.Start+done) @@ -108,7 +109,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map dsts = dsts.DropFirst64(n) if err != nil { if err == io.EOF { - // platform.AllocateAndFill truncates down to a page + // MemoryFile.AllocateAndFill truncates down to a page // boundary, but FileRangeSet.Fill is supposed to // zero-fill to the end of the page in this case. donepgaddr, ok := usermem.Addr(done).RoundUp() @@ -143,20 +144,20 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map // corresponding platform.FileRanges. // // Preconditions: mr must be page-aligned. -func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mem platform.Memory) { +func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { seg := frs.LowerBoundSegment(mr.Start) for seg.Ok() && seg.Start() < mr.End { seg = frs.Isolate(seg, mr) - mem.DecRef(seg.FileRange()) + mf.DecRef(seg.FileRange()) seg = frs.Remove(seg).NextSegment() } } // DropAll removes all segments in mr, freeing the corresponding // platform.FileRanges. 
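The fsutil README and FileRangeSet hunks above describe the page-cache fill path: gaps in the cached FileRangeSet are populated by allocating from the MemoryFile and reading the backing file into the new pages. A hedged sketch of a caller of the new Fill signature, mirroring how inode_cached.go uses it further down; fillCache and readAt are illustrative names only.

import (
    "gvisor.googlesource.com/gvisor/pkg/sentry/context"
    "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
    "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
    "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
    "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
    "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
)

// fillCache ensures that required is cached, allocating page-cache pages from
// mf and filling them with readAt (same shape as
// CachedFileObject.ReadToBlocksAt).
func fillCache(ctx context.Context, frs *fsutil.FileRangeSet, mf *pgalloc.MemoryFile,
    required, optional memmap.MappableRange,
    readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
    // required and optional must be page-aligned, per Fill's preconditions.
    return frs.Fill(ctx, required, optional, mf, usage.PageCache, readAt)
}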
-func (frs *FileRangeSet) DropAll(mem platform.Memory) { +func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) { for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - mem.DecRef(seg.FileRange()) + mf.DecRef(seg.FileRange()) } frs.RemoveAll() } @@ -164,7 +165,7 @@ func (frs *FileRangeSet) DropAll(mem platform.Memory) { // Truncate updates frs to reflect Mappable truncation to the given length: // bytes after the new EOF on the same page are zeroed, and pages after the new // EOF are freed. -func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { +func (frs *FileRangeSet) Truncate(end uint64, mf *pgalloc.MemoryFile) { pgendaddr, ok := usermem.Addr(end).RoundUp() if ok { pgend := uint64(pgendaddr) @@ -173,7 +174,7 @@ func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { frs.SplitAt(pgend) seg := frs.LowerBoundSegment(pgend) for seg.Ok() { - mem.DecRef(seg.FileRange()) + mf.DecRef(seg.FileRange()) seg = frs.Remove(seg).NextSegment() } @@ -189,7 +190,7 @@ func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { if seg.Ok() { fr := seg.FileRange() fr.Start += end - seg.Start() - ims, err := mem.MapInternal(fr, usermem.Write) + ims, err := mf.MapInternal(fr, usermem.Write) if err != nil { // There's no good recourse from here. This means // that we can't keep cached memory consistent with diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index ef11676b8..9bd923678 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -62,8 +63,8 @@ type CachingInodeOperations struct { // backingFile is a handle to a cached file object. backingFile CachedFileObject - // platform is used to allocate memory that caches backingFile's contents. - platform platform.Platform + // mfp is used to allocate memory that caches backingFile's contents. + mfp pgalloc.MemoryFileProvider // forcePageCache indicates the sentry page cache should be used regardless // of whether the platform supports host mapped I/O or not. This must not be @@ -96,7 +97,7 @@ type CachingInodeOperations struct { dataMu sync.RWMutex `state:"nosave"` // cache maps offsets into the cached file to offsets into - // platform.Memory() that store the file's data. + // mfp.MemoryFile() that store the file's data. // // cache is protected by dataMu. cache FileRangeSet @@ -148,13 +149,13 @@ type CachedFileObject interface { // NewCachingInodeOperations returns a new CachingInodeOperations backed by // a CachedFileObject and its initial unstable attributes. 
func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations { - p := platform.FromContext(ctx) - if p == nil { - panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) } return &CachingInodeOperations{ backingFile: backingFile, - platform: p, + mfp: mfp, forcePageCache: forcePageCache, attr: uattr, hostFileMapper: NewHostFileMapper(), @@ -311,7 +312,7 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, // written back. c.dataMu.Lock() defer c.dataMu.Unlock() - c.cache.Truncate(uint64(size), c.platform.Memory()) + c.cache.Truncate(uint64(size), c.mfp.MemoryFile()) c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend}) return nil @@ -323,7 +324,7 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) // Write dirty pages back. c.dataMu.Lock() - err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt) + err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt) c.dataMu.Unlock() if err != nil { c.attrMu.Unlock() @@ -527,7 +528,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return 0, nil } - mem := rw.c.platform.Memory() + mem := rw.c.mfp.MemoryFile() var done uint64 seg, gap := rw.c.cache.Find(uint64(rw.offset)) for rw.offset < end { @@ -613,7 +614,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error return 0, nil } - mem := rw.c.platform.Memory() + mf := rw.c.mfp.MemoryFile() var done uint64 seg, gap := rw.c.cache.Find(uint64(rw.offset)) for rw.offset < end { @@ -622,7 +623,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error case seg.Ok() && seg.Start() < mr.End: // Get internal mappings from the cache. segMR := seg.Range().Intersect(mr) - ims, err := mem.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) if err != nil { rw.maybeGrowFile() rw.c.dataMu.Unlock() @@ -711,13 +712,13 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma // Writeback dirty mapped memory now that there are no longer any // mappings that reference it. This is our naive memory eviction // strategy. 
- mem := c.platform.Memory() + mf := c.mfp.MemoryFile() c.dataMu.Lock() for _, r := range unmapped { - if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", r, err) } - c.cache.Drop(r, mem) + c.cache.Drop(r, mf) c.dirty.KeepClean(r) } c.dataMu.Unlock() @@ -760,8 +761,8 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option optional.End = pgend } - mem := c.platform.Memory() - cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mem, usage.PageCache, c.backingFile.ReadToBlocksAt) + mf := c.mfp.MemoryFile() + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 @@ -769,7 +770,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option segMR := seg.Range().Intersect(optional) ts = append(ts, memmap.Translation{ Source: segMR, - File: mem, + File: mf, Offset: seg.FileRangeOf(segMR).Start, }) if at.Write { @@ -820,16 +821,17 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error // Sync the cache's contents so that if we have a host fd after restore, // the remote file's contents are coherent. + mf := c.mfp.MemoryFile() c.dataMu.Lock() defer c.dataMu.Unlock() - if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { return err } // Discard the cache so that it's not stored in saved state. This is safe // because per InvalidateUnsavable invariants, no new translations can have // been returned after we invalidated all existing translations above. - c.cache.DropAll(c.platform.Memory()) + c.cache.DropAll(mf) c.dirty.RemoveAll() return nil diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index b31258eed..620e93ce3 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -44,10 +44,10 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) return nil, 0 } - mem := d.k.Platform.Memory() - mem.UpdateUsage() + mf := d.k.MemoryFile() + mf.UpdateUsage() snapshot, totalUsage := usage.MemoryAccounting.Copy() - totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) anon := snapshot.Anonymous + snapshot.Tmpfs file := snapshot.PageCache + snapshot.Mapped // We don't actually have active/inactive LRUs, so just make up numbers. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 13d06684d..a98fbf0f1 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -52,7 +52,7 @@ type fileInodeOperations struct { fsutil.InodeSimpleExtendedAttributes - // kernel is used to allocate platform memory that stores the file's contents. + // kernel is used to allocate memory that stores the file's contents. kernel *kernel.Kernel // memUsage is the default memory usage that will be reported by this file. 
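The inode_cached.go and tmpfs read/write paths above share one pattern: find the cached platform.FileRange for an offset, ask the MemoryFile for internal mappings, and copy through them with safemem. Below is a small sketch of just that step, assuming MapInternal and CopySeq keep the shapes shown in these hunks; readCached is a hypothetical helper.

import (
    "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
    "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
    "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
    "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// readCached copies the bytes backing fr out of the MemoryFile into dsts and
// returns the number of bytes copied.
func readCached(mf *pgalloc.MemoryFile, fr platform.FileRange, dsts safemem.BlockSeq) (uint64, error) {
    ims, err := mf.MapInternal(fr, usermem.Read)
    if err != nil {
        return 0, err
    }
    // CopySeq stops when either sequence is exhausted.
    return safemem.CopySeq(dsts, ims)
}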
@@ -85,7 +85,7 @@ type fileInodeOperations struct { var _ fs.InodeOperations = (*fileInodeOperations)(nil) -// NewInMemoryFile returns a new file backed by p.Memory(). +// NewInMemoryFile returns a new file backed by Kernel.MemoryFile(). func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr) fs.InodeOperations { return &fileInodeOperations{ attr: uattr, @@ -98,7 +98,7 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta func (f *fileInodeOperations) Release(context.Context) { f.dataMu.Lock() defer f.dataMu.Unlock() - f.data.DropAll(f.kernel.Platform.Memory()) + f.data.DropAll(f.kernel.MemoryFile()) } // Mappable implements fs.InodeOperations.Mappable. @@ -202,7 +202,7 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in // and can remove them. f.dataMu.Lock() defer f.dataMu.Unlock() - f.data.Truncate(uint64(size), f.kernel.Platform.Memory()) + f.data.Truncate(uint64(size), f.kernel.MemoryFile()) return nil } @@ -312,7 +312,7 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return 0, nil } - mem := rw.f.kernel.Platform.Memory() + mf := rw.f.kernel.MemoryFile() var done uint64 seg, gap := rw.f.data.Find(uint64(rw.offset)) for rw.offset < end { @@ -320,7 +320,7 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { switch { case seg.Ok(): // Get internal mappings. - ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) if err != nil { return done, err } @@ -378,7 +378,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } }() - mem := rw.f.kernel.Platform.Memory() + mf := rw.f.kernel.MemoryFile() // Page-aligned mr for when we need to allocate memory. RoundUp can't // overflow since end is an int64. pgstartaddr := usermem.Addr(rw.offset).RoundDown() @@ -392,7 +392,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) switch { case seg.Ok(): // Get internal mappings. - ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) + ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) if err != nil { return done, err } @@ -412,7 +412,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) - fr, err := mem.Allocate(gapMR.Length(), rw.f.memUsage) + fr, err := mf.Allocate(gapMR.Length(), rw.f.memUsage) if err != nil { return done, err } @@ -467,8 +467,8 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional optional.End = pgend } - mem := f.kernel.Platform.Memory() - cerr := f.data.Fill(ctx, required, optional, mem, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + mf := f.kernel.MemoryFile() + cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. 
return dsts.NumBytes(), nil }) @@ -479,7 +479,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional segMR := seg.Range().Intersect(optional) ts = append(ts, memmap.Translation{ Source: segMR, - File: mem, + File: mf, Offset: seg.FileRangeOf(segMR).Start, }) translatedEnd = segMR.End diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 4b1762ce4..1a9d12c0b 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -74,7 +74,7 @@ type Dir struct { // InodeOperation methods to it. ramfsDir *ramfs.Dir - // kernel is used to allocate platform memory as storage for tmpfs Files. + // kernel is used to allocate memory as storage for tmpfs Files. kernel *kernel.Kernel } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index d9bbfb556..4d34bc733 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -173,6 +173,7 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/memmap", "//pkg/sentry/mm", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/socket/netlink/port", @@ -212,7 +213,7 @@ go_test( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", - "//pkg/sentry/platform", + "//pkg/sentry/pgalloc", "//pkg/sentry/time", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index 5769a3b28..bfb2a0b73 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -12,6 +12,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/kernel", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", ], ) diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go index 9eb18e7e8..eb56a6a07 100644 --- a/pkg/sentry/kernel/contexttest/contexttest.go +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" ) @@ -33,6 +34,7 @@ func Context(tb testing.TB) context.Context { k := &kernel.Kernel{ Platform: platform.FromContext(ctx), } + k.SetMemoryFile(pgalloc.MemoryFileFromContext(ctx)) ctx.(*contexttest.TestContext).RegisterValue(kernel.CtxKernel, k) return ctx } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c6afae2e6..3533fd8f7 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -58,6 +58,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/loader" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" @@ -89,12 +90,14 @@ type Kernel struct { // All of the following fields are immutable unless otherwise specified. - // Platform is the platform that is used to execute tasks in the - // created Kernel. It is embedded so that Kernel can directly serve as - // Platform in mm logic and also serve as platform.MemoryProvider in - // filemem S/R logic. + // Platform is the platform that is used to execute tasks in the created + // Kernel. 
See comment on pgalloc.MemoryFileProvider for why Platform is + // embedded anonymously (the same issue applies). platform.Platform `state:"nosave"` + // mf provides application memory. + mf *pgalloc.MemoryFile `state:"nosave"` + // See InitKernelArgs for the meaning of these fields. featureSet *cpuid.FeatureSet timekeeper *Timekeeper @@ -229,7 +232,8 @@ type InitKernelArgs struct { // Init initialize the Kernel with no tasks. // -// Callers must manually set Kernel.Platform before caling Init. +// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile +// before calling Init. func (k *Kernel) Init(args InitKernelArgs) error { if args.FeatureSet == nil { return fmt.Errorf("FeatureSet is nil") @@ -332,15 +336,9 @@ func (k *Kernel) SaveTo(w io.Writer) error { log.Infof("Kernel save stats: %s", &stats) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) - // Save the memory state. - // - // FIXME: In the future, this should not be dispatched via - // an abstract memory type. This should be dispatched to a single - // memory implementation that belongs to the kernel. (There is - // currently a single implementation anyways, it just needs to be - // "unabstracted" and reparented appropriately.) + // Save the memory file's state. memoryStart := time.Now() - if err := k.Platform.Memory().SaveTo(w); err != nil { + if err := k.mf.SaveTo(w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -418,13 +416,9 @@ func (ts *TaskSet) unregisterEpollWaiters() { } // LoadFrom returns a new Kernel loaded from args. -func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error { +func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { loadStart := time.Now() - if p == nil { - return fmt.Errorf("Platform is nil") - } - k.Platform = p k.networkStack = net initAppCores := k.applicationCores @@ -438,11 +432,9 @@ func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) erro log.Infof("Kernel load stats: %s", &stats) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) - // Load the memory state. - // - // See the note in SaveTo. + // Load the memory file's state. memoryStart := time.Now() - if err := k.Platform.Memory().LoadFrom(r); err != nil { + if err := k.mf.LoadFrom(r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -597,6 +589,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k.RealtimeClock() case limits.CtxLimits: return ctx.args.Limits + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: @@ -1018,6 +1014,17 @@ func (k *Kernel) NowMonotonic() int64 { return now } +// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or +// LoadFrom. +func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { + k.mf = mf +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { + return k.mf +} + // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by e. 
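With these kernel.go changes the MemoryFile is no longer reached through Platform.Memory(): whoever builds the Kernel must construct the file and inject it with SetMemoryFile before Init or LoadFrom. The sketch below models that bootstrap on the contexttest changes at the top of this patch; the runsc loader presumably does the equivalent, but its hunks are not shown here, and newKernelWithMemoryFile and the file name are illustrative.

import (
    "fmt"
    "os"

    "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
    "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
    "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
    "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
)

// newKernelWithMemoryFile builds a MemoryFile from a memfd and attaches it to
// a new Kernel. Kernel.Init (or LoadFrom) still has to be called afterwards.
func newKernelWithMemoryFile(p platform.Platform) (*kernel.Kernel, error) {
    const name = "sentry-memory" // the name is only a debugging label
    memfd, err := memutil.CreateMemFD(name, 0)
    if err != nil {
        return nil, fmt.Errorf("error creating application memory file: %v", err)
    }
    memfile := os.NewFile(uintptr(memfd), name)
    mf, err := pgalloc.NewMemoryFile(memfile)
    if err != nil {
        memfile.Close()
        return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
    }
    k := &kernel.Kernel{Platform: p}
    // SetMemoryFile must be called before Init or LoadFrom, per the updated
    // comment on Kernel.Init.
    k.SetMemoryFile(mf)
    return k, nil
}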
@@ -1083,7 +1090,7 @@ func (k *Kernel) ListSockets(family int) []*refs.WeakRef { socks := []*refs.WeakRef{} if table, ok := k.socketTable[family]; ok { socks = make([]*refs.WeakRef, 0, len(table)) - for s, _ := range table { + for s := range table { socks = append(socks, s) } } @@ -1123,6 +1130,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case limits.CtxLimits: // No limits apply. return limits.NewLimitSet() + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index b6283c5d1..d09d6debf 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -95,7 +95,7 @@ func (m *MemoryEvents) run() { } func (m *MemoryEvents) emit() { - totalPlatform, err := m.k.Platform.Memory().TotalUsage() + totalPlatform, err := m.k.MemoryFile().TotalUsage() if err != nil { log.Warningf("Failed to fetch memory usage for memory events: %v", err) return diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index f45770eef..bc2089872 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -20,6 +20,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 96414d060..4525aabf4 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -45,6 +45,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -199,19 +200,19 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui // // Precondition: Caller must hold r.mu. func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { - p := platform.FromContext(ctx) - if p == nil { - panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) } effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) - fr, err := p.Memory().Allocate(effectiveSize, usage.Anonymous) + fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) if err != nil { return nil, err } shm := &Shm{ - p: p, + mfp: mfp, registry: r, creator: creator, size: size, @@ -312,7 +313,7 @@ type Shm struct { // destruction. refs.AtomicRefCount - p platform.Platform + mfp pgalloc.MemoryFileProvider // registry points to the shm registry containing this segment. Immutable. registry *Registry @@ -333,7 +334,7 @@ type Shm struct { // Invariant: effectiveSize must be a multiple of usermem.PageSize. effectiveSize uint64 - // fr is the offset into platform.Memory() that backs this contents of this + // fr is the offset into mfp.MemoryFile() that backs this contents of this // segment. 
Immutable. fr platform.FileRange @@ -452,7 +453,7 @@ func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableR return []memmap.Translation{ { Source: source, - File: s.p.Memory(), + File: s.mfp.MemoryFile(), Offset: s.fr.Start + source.Start, }, }, err @@ -599,7 +600,7 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { } func (s *Shm) destroy() { - s.p.Memory().DecRef(s.fr) + s.mfp.MemoryFile().DecRef(s.fr) s.registry.remove(s) } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 702e40cce..e9f133c0b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" @@ -587,6 +588,10 @@ func (t *Task) Value(key interface{}) interface{} { return t.k.RealtimeClock() case limits.CtxLimits: return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k case platform.CtxPlatform: return t.k case uniqueid.CtxGlobalUniqueID: diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index ee3e49d17..d1c82f2aa 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -144,7 +144,7 @@ func (t *Task) Stack() *arch.Stack { // * fs: Binary FeatureSet func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { // Prepare a new user address space to load into. - m := mm.NewMemoryManager(k) + m := mm.NewMemoryManager(k, k) defer m.DecUsers(ctx) os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso) diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 6bff80f13..d7bd85e78 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" ) @@ -85,9 +86,9 @@ type Timekeeper struct { // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. 
-func NewTimekeeper(platform platform.Platform, paramPage platform.FileRange) (*Timekeeper, error) { +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { return &Timekeeper{ - params: NewVDSOParamPage(platform, paramPage), + params: NewVDSOParamPage(mfp, paramPage), }, nil } diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 71674c21c..6084bcb18 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -18,7 +18,7 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -53,13 +53,13 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { // SetClocks called. func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { ctx := contexttest.Context(tb) - p := platform.FromContext(ctx) - fr, err := p.Memory().Allocate(usermem.PageSize, usage.Anonymous) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + fr, err := mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) if err != nil { tb.Fatalf("failed to allocate memory: %v", err) } return &Timekeeper{ - params: NewVDSOParamPage(p, fr), + params: NewVDSOParamPage(mfp, fr), } } diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 0ec858a4a..3a35f1d00 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -55,9 +56,9 @@ type vdsoParams struct { // // +stateify savable type VDSOParamPage struct { - // The parameter page is fr, allocated from platform.Memory(). - platform platform.Platform - fr platform.FileRange + // The parameter page is fr, allocated from mfp.MemoryFile(). + mfp pgalloc.MemoryFileProvider + fr platform.FileRange // seq is the current sequence count written to the page. // @@ -73,20 +74,20 @@ type VDSOParamPage struct { // // Preconditions: // -// * fr is a single page allocated from platform.Memory(). VDSOParamPage does +// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does // not take ownership of fr; it must remain allocated for the lifetime of the // VDSOParamPage. // // * VDSOParamPage must be the only writer to fr. // -// * platform.Memory().MapInternal(fr) must return a single safemem.Block. -func NewVDSOParamPage(platform platform.Platform, fr platform.FileRange) *VDSOParamPage { - return &VDSOParamPage{platform: platform, fr: fr} +// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{mfp: mfp, fr: fr} } // access returns a mapping of the param page. 
func (v *VDSOParamPage) access() (safemem.Block, error) { - bs, err := v.platform.Memory().MapInternal(v.fr, usermem.ReadWrite) + bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite) if err != nil { return safemem.Block{}, err } diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 1ea260a4e..66300f25a 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -39,7 +39,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/mm", - "//pkg/sentry/platform", + "//pkg/sentry/pgalloc", "//pkg/sentry/safemem", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index c070c7316..273f6b5b9 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -28,7 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -217,7 +217,7 @@ type VDSO struct { // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. -func PrepareVDSO(p platform.Platform) (*VDSO, error) { +func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { vdsoFile := newByteReaderFile(vdsoBin) // First make sure the VDSO is valid. vdsoFile does not use ctx, so a @@ -234,35 +234,36 @@ func PrepareVDSO(p platform.Platform) (*VDSO, error) { return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsoBin)) } - vdso, err := p.Memory().Allocate(uint64(size), usage.System) + mf := mfp.MemoryFile() + vdso, err := mf.Allocate(uint64(size), usage.System) if err != nil { return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err) } - ims, err := p.Memory().MapInternal(vdso, usermem.ReadWrite) + ims, err := mf.MapInternal(vdso, usermem.ReadWrite) if err != nil { - p.Memory().DecRef(vdso) + mf.DecRef(vdso) return nil, fmt.Errorf("unable to map VDSO memory: %v", err) } _, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsoBin))) if err != nil { - p.Memory().DecRef(vdso) + mf.DecRef(vdso) return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err) } // Finally, allocate a param page for this VDSO. - paramPage, err := p.Memory().Allocate(usermem.PageSize, usage.System) + paramPage, err := mf.Allocate(usermem.PageSize, usage.System) if err != nil { - p.Memory().DecRef(vdso) + mf.DecRef(vdso) return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err) } return &VDSO{ - ParamPage: mm.NewSpecialMappable("[vvar]", p, paramPage), + ParamPage: mm.NewSpecialMappable("[vvar]", mfp, paramPage), // TODO: Don't advertise the VDSO, as some applications may // not be able to handle multiple [vdso] hints. - vdso: mm.NewSpecialMappable("", p, vdso), + vdso: mm.NewSpecialMappable("", mfp, vdso), phdrs: info.phdrs, }, nil } diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go index 8d9fc64fb..bc2c72f55 100644 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ b/pkg/sentry/memutil/memutil_unsafe.go @@ -15,6 +15,7 @@ package memutil import ( + "fmt" "syscall" "unsafe" @@ -22,14 +23,17 @@ import ( ) // CreateMemFD creates a memfd file and returns the fd. 
-func CreateMemFD(name string, flags int) (fd int, err error) { +func CreateMemFD(name string, flags int) (int, error) { p, err := syscall.BytePtrFromString(name) if err != nil { return -1, err } - r0, _, e0 := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) - if e0 != 0 { - return -1, e0 + fd, _, e := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + if e != 0 { + if e == syscall.ENOSYS { + return -1, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") + } + return -1, e } - return int(r0), nil + return int(fd), nil } diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index a85ffdef8..c78cb4280 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -111,6 +111,7 @@ go_library( "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/safecopy", "//pkg/sentry/safemem", @@ -133,6 +134,7 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/limits", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md index e485a5ca5..e6efbf565 100644 --- a/pkg/sentry/mm/README.md +++ b/pkg/sentry/mm/README.md @@ -153,7 +153,7 @@ manner, and the sentry handles the fault: represented by a host file descriptor and offset, since (as noted in "Background") this is the memory mapping primitive provided by the host kernel. In general, memory is allocated from a temporary host file using the - `filemem` package. Supposing that the sentry allocates offset 0x3000 from + `pgalloc` package. Supposing that the sentry allocates offset 0x3000 from host file "memory-file", the resulting state is: Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 @@ -274,7 +274,7 @@ In the sentry: methods [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. 
-[filemem]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/filemem/filemem.go [memmap]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/memmap/memmap.go [mm]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/mm/mm.go +[pgalloc]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/pgalloc/pgalloc.go [platform]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/platform.go diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 5e86d3b49..6cec6387a 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -201,24 +202,24 @@ func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { type aioMappable struct { refs.AtomicRefCount - p platform.Platform - fr platform.FileRange + mfp pgalloc.MemoryFileProvider + fr platform.FileRange } var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) -func newAIOMappable(p platform.Platform) (*aioMappable, error) { - fr, err := p.Memory().Allocate(aioRingBufferSize, usage.Anonymous) +func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { + fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous) if err != nil { return nil, err } - return &aioMappable{p: p, fr: fr}, nil + return &aioMappable{mfp: mfp, fr: fr}, nil } // DecRef implements refs.RefCounter.DecRef. func (m *aioMappable) DecRef() { m.AtomicRefCount.DecRefWithDestructor(func() { - m.p.Memory().DecRef(m.fr) + m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -299,7 +300,7 @@ func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.M return []memmap.Translation{ { Source: source, - File: m.p.Memory(), + File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, }, }, err @@ -320,7 +321,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint // libaio peeks inside looking for a magic number. This function allocates // a page per context and keeps it set to zeroes to ensure it will not // match AIO_RING_MAGIC and make libaio happy. - m, err := newAIOMappable(mm.p) + m, err := newAIOMappable(mm.mfp) if err != nil { return 0, err } diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 1ee8ae74e..a71286f14 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -23,14 +23,16 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) // NewMemoryManager returns a new MemoryManager with no mappings and 1 user. 
-func NewMemoryManager(p platform.Platform) *MemoryManager { +func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager { return &MemoryManager{ p: p, + mfp: mfp, haveASIO: p.SupportsAddressSpaceIO(), privateRefs: &privateRefs{}, users: 1, @@ -60,6 +62,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ p: mm.p, + mfp: mm.mfp, haveASIO: mm.haveASIO, layout: mm.layout, privateRefs: mm.privateRefs, diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index e2c636f38..6ed838d64 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -40,6 +40,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -50,10 +51,9 @@ import ( // // +stateify savable type MemoryManager struct { - // p is the platform. - // - // p is immutable. - p platform.Platform + // p and mfp are immutable. + p platform.Platform + mfp pgalloc.MemoryFileProvider // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from // eliminating an indirect call in the hot I/O path, this makes @@ -369,8 +369,8 @@ func (v *vma) loadRealPerms(b int) { // +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file == - // platform.Platform.Memory() may be saved. pmas hold a reference to the - // corresponding file range while they exist. + // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to + // the corresponding file range while they exist. file platform.File `state:"nosave"` // off is the offset into file at which this pma begins. @@ -387,7 +387,7 @@ type pma struct { // private is true if this pma represents private memory. // - // If private is true, file must be platform.Platform.Memory(), the pma + // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma // holds a reference on the mapped memory that is tracked in privateRefs, // and calls to Invalidate for which // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. @@ -405,9 +405,9 @@ type pma struct { type privateRefs struct { mu sync.Mutex `state:"nosave"` - // refs maps offsets into Platform.Memory() to the number of pmas (or, - // equivalently, MemoryManagers) that share ownership of the memory at that - // offset. + // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of + // pmas (or, equivalently, MemoryManagers) that share ownership of the + // memory at that offset. 
refs fileRefcountSet } diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index f2db43196..e12cb3bd1 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -29,7 +30,8 @@ import ( func testMemoryManager(ctx context.Context) *MemoryManager { p := platform.FromContext(ctx) - mm := NewMemoryManager(p) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + mm := NewMemoryManager(p, mfp) mm.layout = arch.MmapLayout{ MinAddr: p.MinUserAddress(), MaxAddr: p.MaxUserAddress(), diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index d102035d8..bb779a45b 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -328,8 +328,8 @@ func (mm *MemoryManager) insertPMAsLocked(ctx context.Context, vseg vmaIterator, // Limit the range we allocate to ar, aligned to privateAllocUnit. maskAR := privateAligned(ar) allocAR := optAR.Intersect(maskAR) - mem := mm.p.Memory() - fr, err := mem.Allocate(uint64(allocAR.Length()), usage.Anonymous) + mf := mm.mfp.MemoryFile() + fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous) if err != nil { return pgap, err } @@ -342,10 +342,10 @@ func (mm *MemoryManager) insertPMAsLocked(ctx context.Context, vseg vmaIterator, } mm.addRSSLocked(allocAR) - mem.IncRef(fr) + mf.IncRef(fr) return mm.pmas.Insert(pgap, allocAR, pma{ - file: mem, + file: mf, off: fr.Start, vmaEffectivePerms: vma.effectivePerms, vmaMaxPerms: vma.maxPerms, @@ -426,7 +426,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add // Limit the range we copy to ar, aligned to privateAllocUnit. maskAR := privateAligned(ar) var invalidatedIterators, didUnmapAS bool - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for { if mm.isPMACopyOnWriteLocked(pseg) { // Determine the range to copy. @@ -438,7 +438,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add } // Copy contents. - fr, err := platform.AllocateAndFill(mem, uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) + fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) if _, ok := err.(safecopy.BusError); ok { // If we got SIGBUS during the copy, deliver SIGBUS to // userspace (instead of SIGSEGV) if we're breaking @@ -449,7 +449,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add return pseg.PrevGap(), invalidatedIterators, err } mm.incPrivateRef(fr) - mem.IncRef(fr) + mf.IncRef(fr) // Unmap all of maskAR, not just copyAR, to minimize host syscalls. // AddressSpace mappings must be removed before mm.decPrivateRef(). 
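breakCopyOnWriteLocked above now allocates the private copy and fills it in one step through MemoryFile.AllocateAndFill, replacing the old platform.AllocateAndFill helper. A hedged sketch of that call in isolation follows; copyForCOW is a hypothetical wrapper and assumes srcs covers a page-aligned number of bytes, as the COW path guarantees.

import (
    "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
    "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
    "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
    "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
)

// copyForCOW allocates anonymous memory and fills it from the blocks in srcs,
// returning the new range for the caller to install in its pma (and to IncRef
// for the private-refs accounting, as the pma code above does).
func copyForCOW(mf *pgalloc.MemoryFile, srcs safemem.BlockSeq) (platform.FileRange, error) {
    return mf.AllocateAndFill(srcs.NumBytes(), usage.Anonymous, &safemem.BlockSeqReader{srcs})
}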
@@ -471,7 +471,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add } pma.file.DecRef(pseg.fileRange()) - pma.file = mem + pma.file = mf pma.off = fr.Start pma.private = true pma.needCOW = false @@ -881,9 +881,9 @@ func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { refSet.MergeAdjacent(fr) mm.privateRefs.mu.Unlock() - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for _, fr := range freed { - mem.DecRef(fr) + mf.DecRef(fr) } } diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 6e7080a84..46e0e0754 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -37,12 +37,12 @@ func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error { // beforeSave is invoked by stateify. func (mm *MemoryManager) beforeSave() { - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { - if pma := pseg.ValuePtr(); pma.file != mem { + if pma := pseg.ValuePtr(); pma.file != mf { // InvalidateUnsavable should have caused all such pmas to be // invalidated. - panic(fmt.Sprintf("Can't save pma %#v with non-Memory file of type %T:\n%s", pseg.Range(), pma.file, mm)) + panic(fmt.Sprintf("Can't save pma %#v with non-MemoryFile of type %T:\n%s", pseg.Range(), pma.file, mm)) } } } @@ -50,8 +50,8 @@ func (mm *MemoryManager) beforeSave() { // afterLoad is invoked by stateify. func (mm *MemoryManager) afterLoad() { mm.haveASIO = mm.p.SupportsAddressSpaceIO() - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { - pseg.ValuePtr().file = mem + pseg.ValuePtr().file = mf } } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 64d0dd3f6..aa94d7d6a 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -18,6 +18,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -33,24 +34,24 @@ import ( type SpecialMappable struct { refs.AtomicRefCount - p platform.Platform + mfp pgalloc.MemoryFileProvider fr platform.FileRange name string } // NewSpecialMappable returns a SpecialMappable that owns fr, which represents -// offsets in p.Memory() that contain the SpecialMappable's data. The +// offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. -func NewSpecialMappable(name string, p platform.Platform, fr platform.FileRange) *SpecialMappable { - return &SpecialMappable{p: p, fr: fr, name: name} +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { + return &SpecialMappable{mfp: mfp, fr: fr, name: name} } // DecRef implements refs.RefCounter.DecRef. 
func (m *SpecialMappable) DecRef() { m.AtomicRefCount.DecRefWithDestructor(func() { - m.p.Memory().DecRef(m.fr) + m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -99,7 +100,7 @@ func (m *SpecialMappable) Translate(ctx context.Context, required, optional memm return []memmap.Translation{ { Source: source, - File: m.p.Memory(), + File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, }, }, err @@ -109,19 +110,19 @@ func (m *SpecialMappable) Translate(ctx context.Context, required, optional memm // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { - // Since data is stored in platform.Platform.Memory(), the contents of - // which are preserved across save/restore, we don't need to do anything. + // Since data is stored in pgalloc.MemoryFile, the contents of which are + // preserved across save/restore, we don't need to do anything. return nil } -// Platform returns the Platform whose Memory stores the SpecialMappable's -// contents. -func (m *SpecialMappable) Platform() platform.Platform { - return m.p +// MemoryFileProvider returns the MemoryFileProvider whose MemoryFile stores +// the SpecialMappable's contents. +func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { + return m.mfp } -// FileRange returns the offsets into Platform().Memory() that store the -// SpecialMappable's contents. +// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that +// store the SpecialMappable's contents. func (m *SpecialMappable) FileRange() platform.FileRange { return m.fr } @@ -137,7 +138,7 @@ func (m *SpecialMappable) Length() uint64 { // TODO: The use of SpecialMappable is a lazy code reuse hack. Linux // uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should // do the same to get non-zero device and inode IDs. 
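The accessors above are all a client needs to reach a SpecialMappable's backing pages directly. A hedged sketch of that usage (readSpecial is illustrative, not part of the change; it assumes buf is no longer than the mappable and that safemem.CopySeq is available as in the rest of the sentry):

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/mm"
	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// readSpecial copies the beginning of m's backing pages into buf by combining
// MemoryFileProvider(), FileRange() and MemoryFile.MapInternal.
func readSpecial(m *mm.SpecialMappable, buf []byte) (uint64, error) {
	srcs, err := m.MemoryFileProvider().MemoryFile().MapInternal(m.FileRange(), usermem.Read)
	if err != nil {
		return 0, err
	}
	dsts := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))
	// CopySeq stops at the shorter of the two sequences.
	return safemem.CopySeq(dsts, srcs)
}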
-func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable, error) { +func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { return nil, syserror.EINVAL } @@ -145,10 +146,9 @@ func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable if !ok { return nil, syserror.EINVAL } - - fr, err := p.Memory().Allocate(uint64(alignedLen), usage.Anonymous) + fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous) if err != nil { return nil, err } - return NewSpecialMappable("/dev/zero (deleted)", p, fr), nil + return NewSpecialMappable("/dev/zero (deleted)", mfp, fr), nil } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index fd6929e08..b56e0d3b9 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -24,7 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -99,7 +99,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme if opts.MappingIdentity != nil { return 0, syserror.EINVAL } - m, err := NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) if err != nil { return 0, err } @@ -965,7 +965,7 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { @@ -984,7 +984,7 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { psegAR := pseg.Range().Intersect(ar) if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil { pseg = pseg.NextSegment() continue } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD new file mode 100644 index 000000000..7efa55c20 --- /dev/null +++ b/pkg/sentry/pgalloc/BUILD @@ -0,0 +1,57 @@ +package(licenses = ["notice"]) + +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +go_template_instance( + name = "usage_set", + out = "usage_set.go", + consts = { + "minDegree": "10", + }, + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "pgalloc", + prefix = "usage", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "usageInfo", + "Functions": "usageSetFunctions", + }, +) + +go_library( + name = "pgalloc", + srcs = [ + "context.go", + "pgalloc.go", + "pgalloc_unsafe.go", + "save_restore.go", + "usage_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + 
"//pkg/sentry/memutil", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) + +go_test( + name = "pgalloc_test", + size = "small", + srcs = ["pgalloc_test.go"], + embed = [":pgalloc"], + deps = ["//pkg/sentry/usermem"], +) diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go new file mode 100644 index 000000000..adc97e78f --- /dev/null +++ b/pkg/sentry/pgalloc/context.go @@ -0,0 +1,48 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pgalloc + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is this package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxMemoryFile is a Context.Value key for a MemoryFile. + CtxMemoryFile contextID = iota + + // CtxMemoryFileProvider is a Context.Value key for a MemoryFileProvider. + CtxMemoryFileProvider +) + +// MemoryFileFromContext returns the MemoryFile used by ctx, or nil if no such +// MemoryFile exists. +func MemoryFileFromContext(ctx context.Context) *MemoryFile { + if v := ctx.Value(CtxMemoryFile); v != nil { + return v.(*MemoryFile) + } + return nil +} + +// MemoryFileProviderFromContext returns the MemoryFileProvider used by ctx, or nil if no such +// MemoryFileProvider exists. +func MemoryFileProviderFromContext(ctx context.Context) MemoryFileProvider { + if v := ctx.Value(CtxMemoryFileProvider); v != nil { + return v.(MemoryFileProvider) + } + return nil +} diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go new file mode 100644 index 000000000..0754e608f --- /dev/null +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -0,0 +1,922 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pgalloc contains the page allocator subsystem, which manages memory +// that may be mapped into application address spaces. 
+// +// Lock order: +// +// pgalloc.MemoryFile.mu +// pgalloc.MemoryFile.mappingsMu +package pgalloc + +import ( + "fmt" + "math" + "os" + "sync" + "sync/atomic" + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MemoryFile is a platform.File whose pages may be allocated to arbitrary +// users. +type MemoryFile struct { + // MemoryFile owns a single backing file, which is modeled as follows: + // + // Each page in the file can be committed or uncommitted. A page is + // committed if the host kernel is spending resources to store its contents + // and uncommitted otherwise. This definition includes pages that the host + // kernel has swapped; this is intentional, to ensure that accounting does + // not change even if host kernel swapping behavior changes, and that + // memory used by pseudo-swap mechanisms like zswap is still accounted. + // + // The initial contents of uncommitted pages are implicitly zero bytes. A + // read or write to the contents of an uncommitted page causes it to be + // committed. This is the only event that can cause a uncommitted page to + // be committed. + // + // fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed + // pages to be uncommitted. This is the only event that can cause a + // committed page to be uncommitted. + // + // Memory accounting is based on identifying the set of committed pages. + // Since we do not have direct access to the MMU, tracking reads and writes + // to uncommitted pages to detect commitment would introduce additional + // page faults, which would be prohibitively expensive. Instead, we query + // the host kernel to determine which pages are committed. + + // file is the backing file. The file pointer is immutable. + file *os.File + + mu sync.Mutex + + // usage maps each page in the file to metadata for that page. Pages for + // which no segment exists in usage are both unallocated (not in use) and + // uncommitted. + // + // Since usage stores usageInfo objects by value, clients should usually + // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a + // pointer to the usageInfo rather than a copy. + // + // usage must be kept maximally merged (that is, there should never be two + // adjacent segments with the same values). At least markReclaimed depends + // on this property. + // + // usage is protected by mu. + usage usageSet + + // The UpdateUsage function scans all segments with knownCommitted set + // to false, sees which pages are committed and creates corresponding + // segments with knownCommitted set to true. + // + // In order to avoid unnecessary scans, usageExpected tracks the total + // file blocks expected. This is used to elide the scan when this + // matches the underlying file blocks. + // + // To track swapped pages, usageSwapped tracks the discrepency between + // what is observed in core and what is reported by the file. When + // usageSwapped is non-zero, a sweep will be performed at least every + // second. The start of the last sweep is recorded in usageLast. + // + // All usage attributes are all protected by mu. + usageExpected uint64 + usageSwapped uint64 + usageLast time.Time + + // minUnallocatedPage is the minimum page that may be unallocated. 
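The CtxMemoryFile and CtxMemoryFileProvider keys defined in context.go above are answered by the kernel's context plumbing at runtime. Purely as an illustration of the expected wiring, here is a hypothetical wrapper (mfCtx is not part of the change; in the real system the task and kernel contexts supply these values):

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

// mfCtx wraps an existing sentry context and answers the pgalloc keys from a
// stored MemoryFileProvider, so that MemoryFileFromContext and
// MemoryFileProviderFromContext return non-nil values.
type mfCtx struct {
	context.Context
	mfp pgalloc.MemoryFileProvider
}

func (c mfCtx) Value(key interface{}) interface{} {
	switch key {
	case pgalloc.CtxMemoryFile:
		return c.mfp.MemoryFile()
	case pgalloc.CtxMemoryFileProvider:
		return c.mfp
	default:
		return c.Context.Value(key)
	}
}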
+ // i.e., there are no unallocated pages below minUnallocatedPage. + // + // minUnallocatedPage is protected by mu. + minUnallocatedPage uint64 + + // fileSize is the size of the backing memory file in bytes. fileSize is + // always a power-of-two multiple of chunkSize. + // + // fileSize is protected by mu. + fileSize int64 + + // destroyed is set by Destroy to instruct the reclaimer goroutine to + // release resources and exit. destroyed is protected by mu. + destroyed bool + + // reclaimable is true if usage may contain reclaimable pages. reclaimable + // is protected by mu. + reclaimable bool + + // minReclaimablePage is the minimum page that may be reclaimable. + // i.e., all reclaimable pages are >= minReclaimablePage. + // + // minReclaimablePage is protected by mu. + minReclaimablePage uint64 + + // reclaimCond is signaled (with mu locked) when reclaimable or destroyed + // transitions from false to true. + reclaimCond sync.Cond + + // Pages from the backing file are mapped into the local address space on + // the granularity of large pieces called chunks. mappings is a []uintptr + // that stores, for each chunk, the start address of a mapping of that + // chunk in the current process' address space, or 0 if no such mapping + // exists. Once a chunk is mapped, it is never remapped or unmapped until + // the MemoryFile is destroyed. + // + // Mutating the mappings slice or its contents requires both holding + // mappingsMu and using atomic memory operations. (The slice is mutated + // whenever the file is expanded. Per the above, the only permitted + // mutation of the slice's contents is the assignment of a mapping to a + // chunk that was previously unmapped.) Reading the slice or its contents + // only requires *either* holding mappingsMu or using atomic memory + // operations. This allows MemoryFile.MapInternal to avoid locking in the + // common case where chunk mappings already exist. + mappingsMu sync.Mutex + mappings atomic.Value +} + +// usage tracks usage information. +// +// +stateify savable +type usageInfo struct { + // kind is the usage kind. + kind usage.MemoryKind + + // knownCommitted is true if the tracked region is definitely committed. + // (If it is false, the tracked region may or may not be committed.) + knownCommitted bool + + refs uint64 +} + +const ( + chunkShift = 24 + chunkSize = 1 << chunkShift // 16 MB + chunkMask = chunkSize - 1 + + initialSize = chunkSize + + // maxPage is the highest 64-bit page. + maxPage = math.MaxUint64 &^ (usermem.PageSize - 1) +) + +// NewMemoryFile creates a MemoryFile backed by the given file. If +// NewMemoryFile succeeds, ownership of file is transferred to the returned +// MemoryFile. +func NewMemoryFile(file *os.File) (*MemoryFile, error) { + // Truncate the file to 0 bytes first to ensure that it's empty. + if err := file.Truncate(0); err != nil { + return nil, err + } + if err := file.Truncate(initialSize); err != nil { + return nil, err + } + f := &MemoryFile{ + fileSize: initialSize, + file: file, + // No pages are reclaimable. DecRef will always be able to + // decrease minReclaimablePage from this point. + minReclaimablePage: maxPage, + } + f.reclaimCond.L = &f.mu + f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + go f.runReclaim() // S/R-SAFE: f.mu + + // The Linux kernel contains an optional feature called "Integrity + // Measurement Architecture" (IMA). If IMA is enabled, it will checksum + // binaries the first time they are mapped PROT_EXEC. 
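The accounting model described above is defined entirely in terms of what the host kernel is committing, and commitment is detected by asking the host rather than by trapping accesses. The standalone program below demonstrates that host facility, mincore(2) (the same one the usageInfo comment in the old filemem code refers to); it is a demonstration of the mechanism, not the sentry's actual scan:

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"syscall"
)

// residentPages maps length bytes of f MAP_SHARED and uses mincore(2) to count
// how many pages the host kernel currently holds in core for that range.
func residentPages(f *os.File, length int) (int, error) {
	m, err := syscall.Mmap(int(f.Fd()), 0, length, syscall.PROT_READ, syscall.MAP_SHARED)
	if err != nil {
		return 0, err
	}
	defer syscall.Munmap(m)
	pageSize := syscall.Getpagesize()
	vec := make([]byte, (length+pageSize-1)/pageSize)
	if err := syscall.Mincore(m, vec); err != nil {
		return 0, err
	}
	n := 0
	for _, v := range vec {
		if v&1 != 0 {
			n++
		}
	}
	return n, nil
}

func main() {
	f, err := ioutil.TempFile("", "mincore-demo")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	const size = 16 * 4096
	if err := f.Truncate(size); err != nil {
		panic(err)
	}
	before, _ := residentPages(f, size)
	// Writing to a page commits it; untouched pages stay uncommitted (zero).
	if _, err := f.WriteAt([]byte{1}, 0); err != nil {
		panic(err)
	}
	after, _ := residentPages(f, size)
	fmt.Printf("resident pages before=%d after=%d\n", before, after)
}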
This is bad news for + // executable pages mapped from our backing file, which can grow to + // terabytes in (sparse) size. If IMA attempts to checksum a file that + // large, it will allocate all of the sparse pages and quickly exhaust all + // memory. + // + // Work around IMA by immediately creating a temporary PROT_EXEC mapping, + // while the backing file is still small. IMA will ignore any future + // mappings. + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + usermem.PageSize, + syscall.PROT_EXEC, + syscall.MAP_SHARED, + file.Fd(), + 0) + if errno != 0 { + // This isn't fatal (IMA may not even be in use). Log the error, but + // don't return it. + log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno) + } else { + if _, _, errno := syscall.Syscall( + syscall.SYS_MUNMAP, + m, + usermem.PageSize, + 0); errno != 0 { + panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno)) + } + } + + return f, nil +} + +// Destroy releases all resources used by f. +// +// Preconditions: All pages allocated by f have been freed. +// +// Postconditions: None of f's methods may be called after Destroy. +func (f *MemoryFile) Destroy() { + f.mu.Lock() + defer f.mu.Unlock() + f.destroyed = true + f.reclaimCond.Signal() +} + +// Allocate returns a range of initially-zeroed pages of the given length with +// the given accounting kind and a single reference held by the caller. When +// the last reference on an allocated page is released, ownership of the page +// is returned to the MemoryFile, allowing it to be returned by a future call +// to Allocate. +// +// Preconditions: length must be page-aligned and non-zero. +func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { + if length == 0 || length%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid allocation length: %#x", length)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Align hugepage-and-larger allocations on hugepage boundaries to try + // to take advantage of hugetmpfs. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment) + end := start + length + // File offsets are int64s. Since length must be strictly positive, end + // cannot legitimately be 0. + if end < start || int64(end) <= 0 { + return platform.FileRange{}, syserror.ENOMEM + } + + // Expand the file if needed. Double the file size on each expansion; + // uncommitted pages have effectively no cost. + fileSize := f.fileSize + for int64(end) > fileSize { + if fileSize >= 2*fileSize { + // fileSize overflow. + return platform.FileRange{}, syserror.ENOMEM + } + fileSize *= 2 + } + if fileSize > f.fileSize { + if err := f.file.Truncate(fileSize); err != nil { + return platform.FileRange{}, err + } + f.fileSize = fileSize + f.mappingsMu.Lock() + oldMappings := f.mappings.Load().([]uintptr) + newMappings := make([]uintptr, fileSize>>chunkShift) + copy(newMappings, oldMappings) + f.mappings.Store(newMappings) + f.mappingsMu.Unlock() + } + + // Mark selected pages as in use. + fr := platform.FileRange{start, end} + if !f.usage.Add(fr, usageInfo{ + kind: kind, + refs: 1, + }) { + panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage)) + } + + if minUnallocatedPage < start { + f.minUnallocatedPage = minUnallocatedPage + } else { + // start was the first unallocated page. The next must be + // somewhere beyond end. 
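Allocate's expansion path above shows the mappings publication discipline the struct comment describes: readers go through atomic loads, writers hold mappingsMu, copy into a larger slice, and re-Store it. The self-contained sketch below isolates just that pattern (the names are illustrative, not from the change):

import (
	"sync"
	"sync/atomic"
)

// mappingsTable publishes a grow-only []uintptr the same way MemoryFile
// publishes f.mappings: atomic.Value for lock-free readers, a mutex plus
// copy-on-grow for writers.
type mappingsTable struct {
	mu       sync.Mutex   // serializes writers
	mappings atomic.Value // holds a []uintptr
}

func newMappingsTable(nchunks int) *mappingsTable {
	t := &mappingsTable{}
	t.mappings.Store(make([]uintptr, nchunks))
	return t
}

// get is the lock-free fast path used on every access.
func (t *mappingsTable) get(chunk int) uintptr {
	m := t.mappings.Load().([]uintptr)
	return atomic.LoadUintptr(&m[chunk])
}

// set records a mapping for a previously-unmapped chunk.
func (t *mappingsTable) set(chunk int, addr uintptr) {
	t.mu.Lock()
	defer t.mu.Unlock()
	m := t.mappings.Load().([]uintptr)
	atomic.StoreUintptr(&m[chunk], addr)
}

// grow widens coverage by copying the old slice into a larger one and
// publishing the new slice, mirroring the expansion step in Allocate.
func (t *mappingsTable) grow(nchunks int) {
	t.mu.Lock()
	defer t.mu.Unlock()
	old := t.mappings.Load().([]uintptr)
	if nchunks <= len(old) {
		return
	}
	nm := make([]uintptr, nchunks)
	copy(nm, old)
	t.mappings.Store(nm)
}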
+ f.minUnallocatedPage = end + } + + return fr, nil +} + +// findUnallocatedRange returns the first unallocated page in usage of the +// specified length and alignment beginning at page start and the first single +// unallocated page. +func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) { + // Only searched until the first page is found. + firstPage := start + foundFirstPage := false + alignMask := alignment - 1 + for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() { + r := seg.Range() + + if !foundFirstPage && r.Start > firstPage { + foundFirstPage = true + } + + if start >= r.End { + // start was rounded up to an alignment boundary from the end + // of a previous segment and is now beyond r.End. + continue + } + // This segment represents allocated or reclaimable pages; only the + // range from start to the segment's beginning is allocatable, and the + // next allocatable range begins after the segment. + if r.Start > start && r.Start-start >= length { + break + } + start = (r.End + alignMask) &^ alignMask + if !foundFirstPage { + firstPage = r.End + } + } + return start, firstPage +} + +// AllocateAndFill allocates memory of the given kind and fills it by calling +// r.ReadToBlocks() repeatedly until either length bytes are read or a non-nil +// error is returned. It returns the memory filled by r, truncated down to the +// nearest page. If this is shorter than length bytes due to an error returned +// by r.ReadToBlocks(), it returns that error. +// +// Preconditions: length > 0. length must be page-aligned. +func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) { + fr, err := f.Allocate(length, kind) + if err != nil { + return platform.FileRange{}, err + } + dsts, err := f.MapInternal(fr, usermem.Write) + if err != nil { + f.DecRef(fr) + return platform.FileRange{}, err + } + n, err := safemem.ReadFullToBlocks(r, dsts) + un := uint64(usermem.Addr(n).RoundDown()) + if un < length { + // Free unused memory and update fr to contain only the memory that is + // still allocated. + f.DecRef(platform.FileRange{fr.Start + un, fr.End}) + fr.End = fr.Start + un + } + return fr, err +} + +// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. +const ( + _FALLOC_FL_KEEP_SIZE = 1 + _FALLOC_FL_PUNCH_HOLE = 2 +) + +// Decommit releases resources associated with maintaining the contents of the +// given pages. If Decommit succeeds, future accesses of the decommitted pages +// will read zeroes. +// +// Preconditions: fr.Length() > 0. +func (f *MemoryFile) Decommit(fr platform.FileRange) error { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + // "After a successful call, subsequent reads from this range will + // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with + // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) + err := syscall.Fallocate( + int(f.file.Fd()), + _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, + int64(fr.Start), + int64(fr.Length())) + if err != nil { + return err + } + f.markDecommitted(fr) + return nil +} + +func (f *MemoryFile) markDecommitted(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + // Since we're changing the knownCommitted attribute, we need to merge + // across the entire range to ensure that the usage tree is minimal. 
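Decommit above is a thin wrapper around fallocate(2) with FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE. The standalone program below shows the host-level effect the comment promises (subsequent reads return zeroes); the flag values match the constants defined above:

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"syscall"
)

const (
	fallocFlKeepSize  = 0x1 // FALLOC_FL_KEEP_SIZE
	fallocFlPunchHole = 0x2 // FALLOC_FL_PUNCH_HOLE
)

func main() {
	f, err := ioutil.TempFile("", "punch-demo")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())

	// Commit one page by writing non-zero bytes to it.
	page := make([]byte, 4096)
	for i := range page {
		page[i] = 0xaa
	}
	if _, err := f.WriteAt(page, 0); err != nil {
		panic(err)
	}

	// Punch the page back out; the file keeps its size but the contents
	// read back as zeroes, which is exactly what Decommit relies on.
	if err := syscall.Fallocate(int(f.Fd()), fallocFlPunchHole|fallocFlKeepSize, 0, 4096); err != nil {
		panic(err)
	}
	buf := make([]byte, 4096)
	if _, err := f.ReadAt(buf, 0); err != nil {
		panic(err)
	}
	fmt.Printf("first byte after punch: %#x\n", buf[0]) // prints 0x0
}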
+ gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + val := seg.ValuePtr() + if val.knownCommitted { + // Drop the usageExpected appropriately. + amount := seg.Range().Length() + usage.MemoryAccounting.Dec(amount, val.kind) + f.usageExpected -= amount + val.knownCommitted = false + } + }) + if gap.Ok() { + panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } + f.usage.MergeRange(fr) +} + +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable pages in order to reduce memory usage and make them available +// for allocation. +func (f *MemoryFile) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. + f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the pages were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. + f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. + f.file = nil + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() + return seg.Range(), true + } + } + // No pages are reclaimable. + f.reclaimable = false + f.minReclaimablePage = maxPage + } +} + +func (f *MemoryFile) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. 
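runReclaim and findReclaimable above follow a classic condition-variable shape: the goroutine sleeps until either work is flagged (reclaimable) or shutdown is requested (destroyed), both guarded by the same mutex as the flags. A stripped-down sketch of just that control flow, with illustrative names and no real reclaim work:

import "sync"

// reclaimer reproduces the wait/shutdown skeleton of the reclaim goroutine.
type reclaimer struct {
	mu        sync.Mutex
	cond      sync.Cond
	work      bool // analogous to MemoryFile.reclaimable
	destroyed bool // analogous to MemoryFile.destroyed
}

func newReclaimer() *reclaimer {
	r := &reclaimer{}
	r.cond.L = &r.mu
	go r.run()
	return r
}

func (r *reclaimer) run() {
	r.mu.Lock()
	defer r.mu.Unlock()
	for {
		for !r.work && !r.destroyed {
			r.cond.Wait()
		}
		if r.destroyed {
			return // release resources here, as runReclaim closes f.file
		}
		r.work = false
		// ... find and decommit one reclaimable range here ...
	}
}

// kick is the producer side, analogous to DecRef signaling reclaimCond.
func (r *reclaimer) kick() {
	r.mu.Lock()
	r.work = true
	r.cond.Signal()
	r.mu.Unlock()
}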
+ if !seg.Ok() { + panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } +} + +// IncRef implements platform.File.IncRef. +func (f *MemoryFile) IncRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + seg.ValuePtr().refs++ + }) + if gap.Ok() { + panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } + + f.usage.MergeAdjacent(fr) +} + +// DecRef implements platform.File.DecRef. +func (f *MemoryFile) DecRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + var freed bool + + f.mu.Lock() + defer f.mu.Unlock() + + for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { + seg = f.usage.Isolate(seg, fr) + val := seg.ValuePtr() + if val.refs == 0 { + panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) + } + val.refs-- + if val.refs == 0 { + freed = true + // Reclassify memory as System, until it's freed by the reclaim + // goroutine. + if val.knownCommitted { + usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind) + } + val.kind = usage.System + } + } + f.usage.MergeAdjacent(fr) + + if freed { + if fr.Start < f.minReclaimablePage { + // We've freed at least one lower page. + f.minReclaimablePage = fr.Start + } + f.reclaimable = true + f.reclaimCond.Signal() + } +} + +// MapInternal implements platform.File.MapInternal. +func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + if !fr.WellFormed() || fr.Length() == 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + if at.Execute { + return safemem.BlockSeq{}, syserror.EACCES + } + + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingSlice(fr, func(bs []byte) { + seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingSlice(fr, func(bs []byte) { + blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// forEachMappingSlice invokes fn on a sequence of byte slices that +// collectively map all bytes in fr. 
+func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { + mappings := f.mappings.Load().([]uintptr) + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + chunk := int(chunkStart >> chunkShift) + m := atomic.LoadUintptr(&mappings[chunk]) + if m == 0 { + var err error + mappings, m, err = f.getChunkMapping(chunk) + if err != nil { + return err + } + } + startOff := uint64(0) + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(unsafeSlice(m, chunkSize)[startOff:endOff]) + } + return nil +} + +func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + // Another thread may have replaced f.mappings altogether due to file + // expansion. + mappings := f.mappings.Load().([]uintptr) + // Another thread may have already mapped the chunk. + if m := mappings[chunk]; m != 0 { + return mappings, m, nil + } + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED, + f.file.Fd(), + uintptr(chunk<>chunkShift) + f.mappings.Store(newMappings) + if err := state.Load(r, &f.usage, nil); err != nil { + return err + } + + // Try to map committed chunks concurrently: For any given chunk, either + // this loop or the following one will mmap the chunk first and cache it in + // f.mappings for the other, but this loop is likely to run ahead of the + // other since it doesn't do any work between mmaps. The rest of this + // function doesn't mutate f.usage, so it's safe to iterate concurrently. + mapperDone := make(chan struct{}) + mapperCanceled := int32(0) + go func() { // S/R-SAFE: see comment + defer func() { close(mapperDone) }() + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if atomic.LoadInt32(&mapperCanceled) != 0 { + return + } + if seg.Value().knownCommitted { + f.forEachMappingSlice(seg.Range(), func(s []byte) {}) + } + } + }() + defer func() { + atomic.StoreInt32(&mapperCanceled, 1) + <-mapperDone + }() + + // Load committed pages. + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if !seg.Value().knownCommitted { + continue + } + // Verify header. + length, object, err := state.ReadHeader(r) + if err != nil { + return err + } + if object { + // Not expected. + return fmt.Errorf("unexpected object") + } + if expected := uint64(seg.Range().Length()); length != expected { + // Size mismatch. + return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length) + } + // Read data. + var ioErr error + err = f.forEachMappingSlice(seg.Range(), func(s []byte) { + if ioErr != nil { + return + } + _, ioErr = io.ReadFull(r, s) + }) + if ioErr != nil { + return ioErr + } + if err != nil { + return err + } + + // Update accounting for restored pages. We need to do this here since + // these segments are marked as "known committed", and will be skipped + // over on accounting scans. + usage.MemoryAccounting.Inc(seg.End()-seg.Start(), seg.Value().kind) + } + + return nil +} + +// MemoryFileProvider provides the MemoryFile method. +// +// This type exists to work around a save/restore defect. The only object in a +// saved object graph that S/R allows to be replaced at time of restore is the +// starting point of the restore, kernel.Kernel. 
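forEachMappingSlice above clamps the requested range to each 16 MB chunk. The arithmetic is easy to get wrong, so here it is isolated as a standalone function using the same constants; for example, the range [8 MB, 40 MB) yields chunk 0 at offset 8 MB for 8 MB, all of chunk 1, and the first 8 MB of chunk 2:

const (
	chunkShift = 24
	chunkSize  = 1 << chunkShift // 16 MB
	chunkMask  = chunkSize - 1
)

// forEachChunkRange walks [start, end) chunk by chunk and reports, for each
// chunk touched, the byte range inside that chunk, using the same clamping as
// forEachMappingSlice.
func forEachChunkRange(start, end uint64, fn func(chunk int, off, length uint64)) {
	for chunkStart := start &^ chunkMask; chunkStart < end; chunkStart += chunkSize {
		startOff := uint64(0)
		if chunkStart < start {
			startOff = start - chunkStart
		}
		endOff := uint64(chunkSize)
		if chunkStart+chunkSize > end {
			endOff = end - chunkStart
		}
		fn(int(chunkStart>>chunkShift), startOff, endOff-startOff)
	}
}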
However, the MemoryFile +// changes between save and restore as well, so objects that need persistent +// access to the MemoryFile must instead store a pointer to the Kernel and call +// Kernel.MemoryFile() as required. In most cases, depending on the kernel +// package directly would create a package dependency loop, so the stored +// pointer must instead be a MemoryFileProvider interface object. Correspondingly, +// kernel.Kernel is the only implementation of this interface. +type MemoryFileProvider interface { + // MemoryFile returns the Kernel MemoryFile. + MemoryFile() *MemoryFile +} diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD deleted file mode 100644 index 1a61cfaa5..000000000 --- a/pkg/sentry/platform/filemem/BUILD +++ /dev/null @@ -1,56 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -go_template_instance( - name = "usage_set", - out = "usage_set.go", - consts = { - "minDegree": "10", - }, - imports = { - "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", - }, - package = "filemem", - prefix = "usage", - template = "//pkg/segment:generic_set", - types = { - "Key": "uint64", - "Range": "platform.FileRange", - "Value": "usageInfo", - "Functions": "usageSetFunctions", - }, -) - -go_library( - name = "filemem", - srcs = [ - "filemem.go", - "filemem_state.go", - "filemem_unsafe.go", - "usage_set.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/log", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/memutil", - "//pkg/sentry/platform", - "//pkg/sentry/safemem", - "//pkg/sentry/usage", - "//pkg/sentry/usermem", - "//pkg/state", - "//pkg/syserror", - ], -) - -go_test( - name = "filemem_test", - size = "small", - srcs = ["filemem_test.go"], - embed = [":filemem"], - deps = ["//pkg/sentry/usermem"], -) diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go deleted file mode 100644 index f41c70ba5..000000000 --- a/pkg/sentry/platform/filemem/filemem.go +++ /dev/null @@ -1,879 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package filemem provides a reusable implementation of platform.Memory. -// -// It enables memory to be sourced from a memfd file.
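The deleted filemem.New below shows where the backing file itself came from: a memfd created via memutil.CreateMemFD. Under the new layout the kernel is expected to own that wiring, but as a hedged sketch of how the pieces introduced in this change fit together (newMemfdMemoryFile and simpleProvider are illustrative, not part of the change):

import (
	"os"

	"gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

// newMemfdMemoryFile builds a MemoryFile on top of a fresh memfd, combining
// memutil.CreateMemFD (as the deleted filemem.New did) with
// pgalloc.NewMemoryFile.
func newMemfdMemoryFile(name string) (*pgalloc.MemoryFile, error) {
	fd, err := memutil.CreateMemFD(name, 0)
	if err != nil {
		return nil, err
	}
	return pgalloc.NewMemoryFile(os.NewFile(uintptr(fd), name))
}

// simpleProvider satisfies pgalloc.MemoryFileProvider; per the comment above,
// kernel.Kernel is the real implementation in the sentry.
type simpleProvider struct {
	mf *pgalloc.MemoryFile
}

func (p *simpleProvider) MemoryFile() *pgalloc.MemoryFile { return p.mf }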
-// -// Lock order: -// -// filemem.FileMem.mu -// filemem.FileMem.mappingsMu -package filemem - -import ( - "fmt" - "math" - "os" - "sync" - "sync/atomic" - "syscall" - "time" - - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" - "gvisor.googlesource.com/gvisor/pkg/sentry/usage" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" -) - -// FileMem is a platform.Memory that allocates from a host file that it owns. -type FileMem struct { - // Filemem models the backing file as follows: - // - // Each page in the file can be committed or uncommitted. A page is - // committed if the host kernel is spending resources to store its contents - // and uncommitted otherwise. This definition includes pages that the host - // kernel has swapped; this is intentional, to ensure that accounting does - // not change even if host kernel swapping behavior changes, and that - // memory used by pseudo-swap mechanisms like zswap is still accounted. - // - // The initial contents of uncommitted pages are implicitly zero bytes. A - // read or write to the contents of an uncommitted page causes it to be - // committed. This is the only event that can cause a uncommitted page to - // be committed. - // - // fallocate(FALLOC_FL_PUNCH_HOLE) (FileMem.Decommit) causes committed - // pages to be uncommitted. This is the only event that can cause a - // committed page to be uncommitted. - // - // Filemem's accounting is based on identifying the set of committed pages. - // Since filemem does not have direct access to the MMU, tracking reads and - // writes to uncommitted pages to detect commitment would introduce - // additional page faults, which would be prohibitively expensive. Instead, - // filemem queries the host kernel to determine which pages are committed. - - // file is the backing memory file. The file pointer is immutable. - file *os.File - - mu sync.Mutex - - // usage maps each page in the file to metadata for that page. Pages for - // which no segment exists in usage are both unallocated (not in use) and - // uncommitted. - // - // Since usage stores usageInfo objects by value, clients should usually - // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a - // pointer to the usageInfo rather than a copy. - // - // usage must be kept maximally merged (that is, there should never be two - // adjacent segments with the same values). At least markReclaimed depends - // on this property. - // - // usage is protected by mu. - usage usageSet - - // The UpdateUsage function scans all segments with knownCommitted set - // to false, sees which pages are committed and creates corresponding - // segments with knownCommitted set to true. - // - // In order to avoid unnecessary scans, usageExpected tracks the total - // file blocks expected. This is used to elide the scan when this - // matches the underlying file blocks. - // - // To track swapped pages, usageSwapped tracks the discrepency between - // what is observed in core and what is reported by the file. When - // usageSwapped is non-zero, a sweep will be performed at least every - // second. The start of the last sweep is recorded in usageLast. - // - // All usage attributes are all protected by mu. - usageExpected uint64 - usageSwapped uint64 - usageLast time.Time - - // minUnallocatedPage is the minimum page that may be unallocated. 
- // i.e., there are no unallocated pages below minUnallocatedPage. - // - // minUnallocatedPage is protected by mu. - minUnallocatedPage uint64 - - // fileSize is the size of the backing memory file in bytes. fileSize is - // always a power-of-two multiple of chunkSize. - // - // fileSize is protected by mu. - fileSize int64 - - // destroyed is set by Destroy to instruct the reclaimer goroutine to - // release resources and exit. destroyed is protected by mu. - destroyed bool - - // reclaimable is true if usage may contain reclaimable pages. reclaimable - // is protected by mu. - reclaimable bool - - // minReclaimablePage is the minimum page that may be reclaimable. - // i.e., all reclaimable pages are >= minReclaimablePage. - // - // minReclaimablePage is protected by mu. - minReclaimablePage uint64 - - // reclaimCond is signaled (with mu locked) when reclaimable or destroyed - // transitions from false to true. - reclaimCond sync.Cond - - // Filemem pages are mapped into the local address space on the granularity - // of large pieces called chunks. mappings is a []uintptr that stores, for - // each chunk, the start address of a mapping of that chunk in the current - // process' address space, or 0 if no such mapping exists. Once a chunk is - // mapped, it is never remapped or unmapped until the filemem is destroyed. - // - // Mutating the mappings slice or its contents requires both holding - // mappingsMu and using atomic memory operations. (The slice is mutated - // whenever the file is expanded. Per the above, the only permitted - // mutation of the slice's contents is the assignment of a mapping to a - // chunk that was previously unmapped.) Reading the slice or its contents - // only requires *either* holding mappingsMu or using atomic memory - // operations. This allows FileMem.AccessPhysical to avoid locking in the - // common case where chunk mappings already exist. - - mappingsMu sync.Mutex - mappings atomic.Value -} - -// usage tracks usage information. -// -// +stateify savable -type usageInfo struct { - // kind is the usage kind. - kind usage.MemoryKind - - // knownCommitted indicates whether this region is known to be - // committed. If this is false, then the region may or may not have - // been touched. If it is true however, then mincore (below) has - // indicated that the page is present at least once. - knownCommitted bool - - refs uint64 -} - -const ( - chunkShift = 24 - chunkSize = 1 << chunkShift // 16 MB - chunkMask = chunkSize - 1 - - initialSize = chunkSize - - // maxPage is the highest 64-bit page. - maxPage = math.MaxUint64 &^ (usermem.PageSize - 1) -) - -// newFromFile creates a FileMem backed by the given file. -func newFromFile(file *os.File) (*FileMem, error) { - if err := file.Truncate(initialSize); err != nil { - return nil, err - } - f := &FileMem{ - fileSize: initialSize, - file: file, - // No pages are reclaimable. DecRef will always be able to - // decrease minReclaimablePage from this point. - minReclaimablePage: maxPage, - } - f.reclaimCond.L = &f.mu - f.mappings.Store(make([]uintptr, initialSize/chunkSize)) - go f.runReclaim() // S/R-SAFE: f.mu - - // The Linux kernel contains an optional feature called "Integrity - // Measurement Architecture" (IMA). If IMA is enabled, it will checksum - // binaries the first time they are mapped PROT_EXEC. This is bad news for - // executable pages mapped from FileMem, which can grow to terabytes in - // (sparse) size. 
If IMA attempts to checksum a file that large, it will - // allocate all of the sparse pages and quickly exhaust all memory. - // - // Work around IMA by immediately creating a temporary PROT_EXEC mapping, - // while FileMem is still small. IMA will ignore any future mappings. - m, _, errno := syscall.Syscall6( - syscall.SYS_MMAP, - 0, - usermem.PageSize, - syscall.PROT_EXEC, - syscall.MAP_SHARED, - f.file.Fd(), - 0) - if errno != 0 { - // This isn't fatal to filemem (IMA may not even be in use). Log the - // error, but don't return it. - log.Warningf("Failed to pre-map FileMem PROT_EXEC: %v", errno) - } else { - syscall.Syscall( - syscall.SYS_MUNMAP, - m, - usermem.PageSize, - 0) - } - - return f, nil -} - -// New creates a FileMem backed by a memfd file. -func New(name string) (*FileMem, error) { - fd, err := memutil.CreateMemFD(name, 0) - if err != nil { - if e, ok := err.(syscall.Errno); ok && e == syscall.ENOSYS { - return nil, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") - } - return nil, err - } - return newFromFile(os.NewFile(uintptr(fd), name)) -} - -// Destroy implements platform.Memory.Destroy. -func (f *FileMem) Destroy() { - f.mu.Lock() - defer f.mu.Unlock() - f.destroyed = true - f.reclaimCond.Signal() -} - -// Allocate implements platform.Memory.Allocate. -func (f *FileMem) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { - if length == 0 || length%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid allocation length: %#x", length)) - } - - f.mu.Lock() - defer f.mu.Unlock() - - // Align hugepage-and-larger allocations on hugepage boundaries to try - // to take advantage of hugetmpfs. - alignment := uint64(usermem.PageSize) - if length >= usermem.HugePageSize { - alignment = usermem.HugePageSize - } - - start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment) - end := start + length - // File offsets are int64s. Since length must be strictly positive, end - // cannot legitimately be 0. - if end < start || int64(end) <= 0 { - return platform.FileRange{}, syserror.ENOMEM - } - - // Expand the file if needed. Double the file size on each expansion; - // uncommitted pages have effectively no cost. - fileSize := f.fileSize - for int64(end) > fileSize { - if fileSize >= 2*fileSize { - // fileSize overflow. - return platform.FileRange{}, syserror.ENOMEM - } - fileSize *= 2 - } - if fileSize > f.fileSize { - if err := f.file.Truncate(fileSize); err != nil { - return platform.FileRange{}, err - } - f.fileSize = fileSize - f.mappingsMu.Lock() - oldMappings := f.mappings.Load().([]uintptr) - newMappings := make([]uintptr, fileSize>>chunkShift) - copy(newMappings, oldMappings) - f.mappings.Store(newMappings) - f.mappingsMu.Unlock() - } - - // Mark selected pages as in use. - fr := platform.FileRange{start, end} - if !f.usage.Add(fr, usageInfo{ - kind: kind, - refs: 1, - }) { - panic(fmt.Sprintf("allocating %v: failed to insert into f.usage:\n%v", fr, &f.usage)) - } - - if minUnallocatedPage < start { - f.minUnallocatedPage = minUnallocatedPage - } else { - // start was the first unallocated page. The next must be - // somewhere beyond end. - f.minUnallocatedPage = end - } - - return fr, nil -} - -// findUnallocatedRange returns the first unallocated page in usage of the -// specified length and alignment beginning at page start and the first single -// unallocated page. 
-func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) { - // Only searched until the first page is found. - firstPage := start - foundFirstPage := false - alignMask := alignment - 1 - for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() { - r := seg.Range() - - if !foundFirstPage && r.Start > firstPage { - foundFirstPage = true - } - - if start >= r.End { - // start was rounded up to an alignment boundary from the end - // of a previous segment and is now beyond r.End. - continue - } - // This segment represents allocated or reclaimable pages; only the - // range from start to the segment's beginning is allocatable, and the - // next allocatable range begins after the segment. - if r.Start > start && r.Start-start >= length { - break - } - start = (r.End + alignMask) &^ alignMask - if !foundFirstPage { - firstPage = r.End - } - } - return start, firstPage -} - -// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. -const ( - _FALLOC_FL_KEEP_SIZE = 1 - _FALLOC_FL_PUNCH_HOLE = 2 -) - -// Decommit implements platform.Memory.Decommit. -func (f *FileMem) Decommit(fr platform.FileRange) error { - if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - - // "After a successful call, subsequent reads from this range will - // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with - // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) - err := syscall.Fallocate( - int(f.file.Fd()), - _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, - int64(fr.Start), - int64(fr.Length())) - if err != nil { - return err - } - f.markDecommitted(fr) - return nil -} - -func (f *FileMem) markDecommitted(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - // Since we're changing the knownCommitted attribute, we need to merge - // across the entire range to ensure that the usage tree is minimal. - gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { - val := seg.ValuePtr() - if val.knownCommitted { - // Drop the usageExpected appropriately. - amount := seg.Range().Length() - usage.MemoryAccounting.Dec(amount, val.kind) - f.usageExpected -= amount - val.knownCommitted = false - } - }) - if gap.Ok() { - panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) - } - f.usage.MergeRange(fr) -} - -// runReclaim implements the reclaimer goroutine, which continuously decommits -// reclaimable frames in order to reduce memory usage. -func (f *FileMem) runReclaim() { - for { - fr, ok := f.findReclaimable() - if !ok { - break - } - - if err := f.Decommit(fr); err != nil { - log.Warningf("Reclaim failed to decommit %v: %v", fr, err) - // Zero the frames manually. This won't reduce memory usage, but at - // least ensures that the frames will be zero when reallocated. - f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 - } - }) - // Pretend the frames were decommitted even though they weren't, - // since the memory accounting implementation has no idea how to - // deal with this. - f.markDecommitted(fr) - } - f.markReclaimed(fr) - } - // We only get here if findReclaimable finds f.destroyed set and returns - // false. 
- f.mu.Lock() - defer f.mu.Unlock() - if !f.destroyed { - panic("findReclaimable broke out of reclaim loop, but f.destroyed is no longer set") - } - f.file.Close() - // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd - // that has possibly been reassigned. - f.file = nil - mappings := f.mappings.Load().([]uintptr) - for i, m := range mappings { - if m != 0 { - _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) - if errno != 0 { - log.Warningf("Failed to unmap mapping %#x for filemem chunk %d: %v", m, i, errno) - } - } - } - // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) - f.mappings.Store([]uintptr{}) -} - -func (f *FileMem) findReclaimable() (platform.FileRange, bool) { - f.mu.Lock() - defer f.mu.Unlock() - for { - for { - if f.destroyed { - return platform.FileRange{}, false - } - if f.reclaimable { - break - } - f.reclaimCond.Wait() - } - // Allocate returns the first usable range in offset order and is - // currently a linear scan, so reclaiming from the beginning of the - // file minimizes the expected latency of Allocate. - for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { - if seg.ValuePtr().refs == 0 { - f.minReclaimablePage = seg.End() - return seg.Range(), true - } - } - f.reclaimable = false - // No pages are reclaimable. - f.minReclaimablePage = maxPage - } -} - -func (f *FileMem) markReclaimed(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - seg := f.usage.FindSegment(fr.Start) - // All of fr should be mapped to a single uncommitted reclaimable segment - // accounted to System. - if !seg.Ok() { - panic(fmt.Sprintf("Reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) - } - if !seg.Range().IsSupersetOf(fr) { - panic(fmt.Sprintf("Reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) - } - if got, want := seg.Value(), (usageInfo{ - kind: usage.System, - knownCommitted: false, - refs: 0, - }); got != want { - panic(fmt.Sprintf("Reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) - } - // Deallocate reclaimed pages. Even though all of seg is reclaimable, the - // caller of markReclaimed may not have decommitted it, so we can only mark - // fr as reclaimed. - f.usage.Remove(f.usage.Isolate(seg, fr)) - if fr.Start < f.minUnallocatedPage { - // We've deallocated at least one lower page. - f.minUnallocatedPage = fr.Start - } -} - -// IncRef implements platform.File.IncRef. -func (f *FileMem) IncRef(fr platform.FileRange) { - if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - - f.mu.Lock() - defer f.mu.Unlock() - - gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { - seg.ValuePtr().refs++ - }) - if gap.Ok() { - panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) - } - - f.usage.MergeAdjacent(fr) -} - -// DecRef implements platform.File.DecRef. 
-func (f *FileMem) DecRef(fr platform.FileRange) { - if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - - var freed bool - - f.mu.Lock() - defer f.mu.Unlock() - - for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { - seg = f.usage.Isolate(seg, fr) - val := seg.ValuePtr() - if val.refs == 0 { - panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) - } - val.refs-- - if val.refs == 0 { - freed = true - // Reclassify memory as System, until it's freed by the reclaim - // goroutine. - if val.knownCommitted { - usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind) - } - val.kind = usage.System - } - } - f.usage.MergeAdjacent(fr) - - if freed { - if fr.Start < f.minReclaimablePage { - // We've freed at least one lower page. - f.minReclaimablePage = fr.Start - } - f.reclaimable = true - f.reclaimCond.Signal() - } -} - -// MapInternal implements platform.File.MapInternal. -func (f *FileMem) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { - if !fr.WellFormed() || fr.Length() == 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - if at.Execute { - return safemem.BlockSeq{}, syserror.EACCES - } - - chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) - if chunks == 1 { - // Avoid an unnecessary slice allocation. - var seq safemem.BlockSeq - err := f.forEachMappingSlice(fr, func(bs []byte) { - seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) - }) - return seq, err - } - blocks := make([]safemem.Block, 0, chunks) - err := f.forEachMappingSlice(fr, func(bs []byte) { - blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) - }) - return safemem.BlockSeqFromSlice(blocks), err -} - -// forEachMappingSlice invokes fn on a sequence of byte slices that -// collectively map all bytes in fr. -func (f *FileMem) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { - mappings := f.mappings.Load().([]uintptr) - for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { - chunk := int(chunkStart >> chunkShift) - m := atomic.LoadUintptr(&mappings[chunk]) - if m == 0 { - var err error - mappings, m, err = f.getChunkMapping(chunk) - if err != nil { - return err - } - } - startOff := uint64(0) - if chunkStart < fr.Start { - startOff = fr.Start - chunkStart - } - endOff := uint64(chunkSize) - if chunkStart+chunkSize > fr.End { - endOff = fr.End - chunkStart - } - fn(unsafeSlice(m, chunkSize)[startOff:endOff]) - } - return nil -} - -func (f *FileMem) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { - f.mappingsMu.Lock() - defer f.mappingsMu.Unlock() - // Another thread may have replaced f.mappings altogether due to file - // expansion. - mappings := f.mappings.Load().([]uintptr) - // Another thread may have already mapped the chunk. 
- if m := mappings[chunk]; m != 0 { - return mappings, m, nil - } - m, _, errno := syscall.Syscall6( - syscall.SYS_MMAP, - 0, - chunkSize, - syscall.PROT_READ|syscall.PROT_WRITE, - syscall.MAP_SHARED, - f.file.Fd(), - uintptr(chunk<>chunkShift) - f.mappings.Store(newMappings) - if err := state.Load(r, &f.usage, nil); err != nil { - return err - } - - // Try to map committed chunks concurrently: For any given chunk, either - // this loop or the following one will mmap the chunk first and cache it in - // f.mappings for the other, but this loop is likely to run ahead of the - // other since it doesn't do any work between mmaps. The rest of this - // function doesn't mutate f.usage, so it's safe to iterate concurrently. - mapperDone := make(chan struct{}) - mapperCanceled := int32(0) - go func() { // S/R-SAFE: see comment - defer func() { close(mapperDone) }() - for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - if atomic.LoadInt32(&mapperCanceled) != 0 { - return - } - if seg.Value().knownCommitted { - f.forEachMappingSlice(seg.Range(), func(s []byte) {}) - } - } - }() - defer func() { - atomic.StoreInt32(&mapperCanceled, 1) - <-mapperDone - }() - - // Load committed pages. - for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - if !seg.Value().knownCommitted { - continue - } - // Verify header. - length, object, err := state.ReadHeader(r) - if err != nil { - return err - } - if object { - // Not expected. - return fmt.Errorf("unexpected object") - } - if expected := uint64(seg.Range().Length()); length != expected { - // Size mismatch. - return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length) - } - // Read data. - var ioErr error - err = f.forEachMappingSlice(seg.Range(), func(s []byte) { - if ioErr != nil { - return - } - _, ioErr = io.ReadFull(r, s) - }) - if ioErr != nil { - return ioErr - } - if err != nil { - return err - } - - // Update accounting for restored pages. We need to do this here since - // these segments are marked as "known committed", and will be skipped - // over on accounting scans. - usage.MemoryAccounting.Inc(seg.End()-seg.Start(), seg.Value().kind) - } - - return nil -} diff --git a/pkg/sentry/platform/filemem/filemem_test.go b/pkg/sentry/platform/filemem/filemem_test.go deleted file mode 100644 index 9becec25f..000000000 --- a/pkg/sentry/platform/filemem/filemem_test.go +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package filemem - -import ( - "testing" - - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -const ( - page = usermem.PageSize - hugepage = usermem.HugePageSize -) - -func TestFindUnallocatedRange(t *testing.T) { - for _, test := range []struct { - desc string - usage *usageSegmentDataSlices - start uint64 - length uint64 - alignment uint64 - unallocated uint64 - minUnallocated uint64 - }{ - { - desc: "Initial allocation succeeds", - usage: &usageSegmentDataSlices{}, - start: 0, - length: page, - alignment: page, - unallocated: 0, - minUnallocated: 0, - }, - { - desc: "Allocation begins at start of file", - usage: &usageSegmentDataSlices{ - Start: []uint64{page}, - End: []uint64{2 * page}, - Values: []usageInfo{{refs: 1}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: 0, - minUnallocated: 0, - }, - { - desc: "In-use frames are not allocatable", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, page}, - End: []uint64{page, 2 * page}, - Values: []usageInfo{{refs: 1}, {refs: 2}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, - }, - { - desc: "Reclaimable frames are not allocatable", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, page, 2 * page}, - End: []uint64{page, 2 * page, 3 * page}, - Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: 3 * page, - minUnallocated: 3 * page, - }, - { - desc: "Gaps between in-use frames are allocatable", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, 2 * page}, - End: []uint64{page, 3 * page}, - Values: []usageInfo{{refs: 1}, {refs: 1}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: page, - minUnallocated: page, - }, - { - desc: "Inadequately-sized gaps are rejected", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, 2 * page}, - End: []uint64{page, 3 * page}, - Values: []usageInfo{{refs: 1}, {refs: 1}}, - }, - start: 0, - length: 2 * page, - alignment: page, - unallocated: 3 * page, - minUnallocated: page, - }, - { - desc: "Hugepage alignment is honored", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, hugepage + page}, - // Hugepage-sized gap here that shouldn't be allocated from - // since it's incorrectly aligned. 
- End: []uint64{page, hugepage + 2*page}, - Values: []usageInfo{{refs: 1}, {refs: 1}}, - }, - start: 0, - length: hugepage, - alignment: hugepage, - unallocated: 2 * hugepage, - minUnallocated: page, - }, - { - desc: "Pages before start ignored", - usage: &usageSegmentDataSlices{ - Start: []uint64{page, 3 * page}, - End: []uint64{2 * page, 4 * page}, - Values: []usageInfo{{refs: 1}, {refs: 2}}, - }, - start: page, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, - }, - { - desc: "start may be in the middle of segment", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, 3 * page}, - End: []uint64{2 * page, 4 * page}, - Values: []usageInfo{{refs: 1}, {refs: 2}}, - }, - start: page, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, - }, - } { - t.Run(test.desc, func(t *testing.T) { - var usage usageSet - if err := usage.ImportSortedSlices(test.usage); err != nil { - t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err) - } - unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment) - if unallocated != test.unallocated { - t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated) - } - if minUnallocated != test.minUnallocated { - t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated) - } - }) - } -} diff --git a/pkg/sentry/platform/filemem/filemem_unsafe.go b/pkg/sentry/platform/filemem/filemem_unsafe.go deleted file mode 100644 index 776aed74d..000000000 --- a/pkg/sentry/platform/filemem/filemem_unsafe.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
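The "Hugepage alignment is honored" case above is easier to follow with the rounding rule written out: a candidate gap start is rounded up to the requested alignment before it can satisfy an allocation. A tiny illustration (not code from the patch):

    package main

    import "fmt"

    // alignUp rounds x up to the next multiple of alignment (a power of two).
    func alignUp(x, alignment uint64) uint64 {
        return (x + alignment - 1) &^ (alignment - 1)
    }

    func main() {
        const page = 4096
        const hugepage = 2 << 20 // 2 MiB, the usual x86 huge page size

        // Free space begins right after the in-use page at [0, page).
        gapStart := uint64(page)
        fmt.Printf("rounded candidate: %#x\n", alignUp(gapStart, hugepage))
        // The rounded candidate (hugepage) still collides with the in-use segment
        // at [hugepage+page, hugepage+2*page), so the search keeps going and the
        // test expects the allocation to land at 2*hugepage.
    }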
- -package filemem - -import ( - "reflect" - "syscall" - "unsafe" -) - -func unsafeSlice(addr uintptr, length int) (slice []byte) { - sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - sh.Data = addr - sh.Len = length - sh.Cap = length - return -} - -func mincore(s []byte, buf []byte) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_MINCORE, - uintptr(unsafe.Pointer(&s[0])), - uintptr(len(s)), - uintptr(unsafe.Pointer(&buf[0]))); errno != 0 { - return errno - } - return nil -} diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index b7bf88249..9999e58f4 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -34,7 +34,6 @@ go_library( "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/platform", - "//pkg/sentry/platform/filemem", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/procid", "//pkg/sentry/platform/ring0", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 6d8d8e65b..f2f7ab1e8 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -20,7 +20,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/atomicbitops" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -76,9 +75,6 @@ type addressSpace struct { // Note that the page tables themselves are not locked. mu sync.Mutex - // filemem is the memory instance. - filemem *filemem.FileMem - // machine is the underlying machine. machine *machine diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index d4f50024d..c5a4435b1 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -23,7 +23,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -33,9 +32,6 @@ import ( type KVM struct { platform.NoCPUPreemptionDetection - // filemem is our memory source. - *filemem.FileMem - // machine is the backing VM. machine *machine } @@ -56,12 +52,6 @@ func OpenDevice() (*os.File, error) { // New returns a new KVM-based implementation of the platform interface. func New(deviceFile *os.File) (*KVM, error) { - // Allocate physical memory for the vCPUs. - fm, err := filemem.New("kvm-memory") - if err != nil { - return nil, err - } - fd := deviceFile.Fd() // Ensure global initialization is done. @@ -90,7 +80,6 @@ func New(deviceFile *os.File) (*KVM, error) { // All set. return &KVM{ - FileMem: fm, machine: machine, }, nil } @@ -140,7 +129,6 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru // Return the new address space. return &addressSpace{ - filemem: k.FileMem, machine: k.machine, pageTables: pageTables, dirtySet: k.machine.newDirtySet(), @@ -153,8 +141,3 @@ func (k *KVM) NewContext() platform.Context { machine: k.machine, } } - -// Memory returns the platform memory used to do allocations. 
-func (k *KVM) Memory() platform.Memory { - return k.FileMem -} diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index fff463a6e..361200622 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -48,7 +48,6 @@ func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { t.Fatalf("error creating KVM instance: %v", err) } defer k.machine.Destroy() - defer k.FileMem.Destroy() // Call additional setup. if setup != nil { diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index b2ce851da..d1c9458ea 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -19,17 +19,15 @@ package platform import ( "fmt" - "io" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" - "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// Platform provides abstractions for execution contexts (Context) and memory -// management (Memory, AddressSpace). +// Platform provides abstractions for execution contexts (Context, +// AddressSpace). type Platform interface { // SupportsAddressSpaceIO returns true if AddressSpaces returned by this // Platform support AddressSpaceIO methods. @@ -87,9 +85,6 @@ type Platform interface { // NewContext returns a new execution context. NewContext() Context - // Memory returns memory for allocations. - Memory() Memory - // PreemptAllCPUs causes all concurrent calls to Context.Switch(), as well // as the first following call to Context.Switch() for each Context, to // return ErrContextCPUPreempted. @@ -352,84 +347,3 @@ type File interface { func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } - -// Memory represents an allocatable File that may be mapped into any -// AddressSpace associated with the same Platform. -type Memory interface { - File - - // Allocate returns a range of initially-zeroed pages of the given length - // with the given accounting kind and a single reference held by the - // caller. When the last reference on an allocated page is released, - // ownership of the page is returned to the Memory, allowing it to be - // returned by a future call to Allocate. - // - // Preconditions: length must be page-aligned and non-zero. - Allocate(length uint64, kind usage.MemoryKind) (FileRange, error) - - // Decommit releases resources associated with maintaining the contents of - // the given frames. If Decommit succeeds, future accesses of the - // decommitted frames will read zeroes. - // - // Preconditions: fr.Length() > 0. - Decommit(fr FileRange) error - - // UpdateUsage updates the memory usage statistics. This must be called - // before the relevant memory statistics in usage.MemoryAccounting can - // be considered accurate. - UpdateUsage() error - - // TotalUsage returns an aggregate usage for all memory statistics - // except Mapped (which is external to the Memory implementation). This - // is generally much cheaper than UpdateUsage, but will not provide a - // fine-grained breakdown. - TotalUsage() (uint64, error) - - // TotalSize returns the current maximum size of the Memory in bytes. The - // value returned by TotalSize is permitted to change. - TotalSize() uint64 - - // Destroy releases all resources associated with the Memory. - // - // Preconditions: There are no remaining uses of any of the freed memory's - // frames. 
- // - // Postconditions: None of the Memory's methods may be called after Destroy. - Destroy() - - // SaveTo saves the memory state to the given stream, which will - // generally be a statefile. - SaveTo(w io.Writer) error - - // LoadFrom loads the memory state from the given stream, which will - // generally be a statefile. - LoadFrom(r io.Reader) error -} - -// AllocateAndFill allocates memory of the given kind from mem and fills it by -// calling r.ReadToBlocks() repeatedly until either length bytes are read or a -// non-nil error is returned. It returns the memory filled by r, truncated down -// to the nearest page. If this is shorter than length bytes due to an error -// returned by r.ReadToBlocks(), it returns that error. -// -// Preconditions: length > 0. length must be page-aligned. -func AllocateAndFill(mem Memory, length uint64, kind usage.MemoryKind, r safemem.Reader) (FileRange, error) { - fr, err := mem.Allocate(length, kind) - if err != nil { - return FileRange{}, err - } - dsts, err := mem.MapInternal(fr, usermem.Write) - if err != nil { - mem.DecRef(fr) - return FileRange{}, err - } - n, err := safemem.ReadFullToBlocks(r, dsts) - un := uint64(usermem.Addr(n).RoundDown()) - if un < length { - // Free unused memory and update fr to contain only the memory that is - // still allocated. - mem.DecRef(FileRange{fr.Start + un, fr.End}) - fr.End = fr.Start + un - } - return fr, err -} diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index f86790942..e9e4a0d16 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", - "//pkg/sentry/platform/filemem", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/procid", "//pkg/sentry/platform/safecopy", diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 8d3f6ac9a..3c0713e95 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -50,7 +50,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -181,7 +180,6 @@ func (c *context) Interrupt() { type PTrace struct { platform.MMapMinAddr platform.NoCPUPreemptionDetection - *filemem.FileMem } // New returns a new ptrace-based implementation of the platform interface. @@ -202,12 +200,7 @@ func New() (*PTrace, error) { globalPool.master = master }) - fm, err := filemem.New("ptrace-memory") - if err != nil { - return nil, err - } - - return &PTrace{FileMem: fm}, nil + return &PTrace{}, nil } // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. @@ -243,8 +236,3 @@ func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan s func (*PTrace) NewContext() platform.Context { return &context{} } - -// Memory returns the platform memory used to do allocations. 
-func (p *PTrace) Memory() platform.Memory { - return p.FileMem -} diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 42c459acc..69385e23c 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -16,7 +16,6 @@ go_library( "//pkg/log", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/platform", "//pkg/sentry/watchdog", "//pkg/state/statefile", ], diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 70b33f190..67db78a56 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" "gvisor.googlesource.com/gvisor/pkg/state/statefile" ) @@ -95,7 +94,7 @@ type LoadOpts struct { } // Load loads the given kernel, setting the provided platform and stack. -func (opts LoadOpts) Load(k *kernel.Kernel, p platform.Platform, n inet.Stack) error { +func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack) error { // Open the file. r, m, err := statefile.NewReader(opts.Source, opts.Key) if err != nil { @@ -105,5 +104,5 @@ func (opts LoadOpts) Load(k *kernel.Kernel, p platform.Platform, n inet.Stack) e previousMetadata = m // Restore the Kernel object graph. - return k.LoadFrom(r, p, n) + return k.LoadFrom(r, n) } diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 5eeb3ba58..6f7acf98f 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -25,10 +25,10 @@ import ( func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() - mem := t.Kernel().Platform.Memory() - mem.UpdateUsage() + mf := t.Kernel().MemoryFile() + mf.UpdateUsage() _, totalUsage := usage.MemoryAccounting.Copy() - totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) // Only a subset of the fields in sysinfo_t make sense to return. si := linux.Sysinfo{ diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 7e065cb76..5be9ed9c6 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -122,9 +122,6 @@ func Init() error { const name = "memory-usage" fd, err := memutil.CreateMemFD(name, 0) if err != nil { - if e, ok := err.(syscall.Errno); ok && e == syscall.ENOSYS { - return fmt.Errorf("memfd_create(2) is not implemented. 
Check that you have Linux 3.17 or higher") - } return fmt.Errorf("error creating usage file: %v", err) } file := os.NewFile(uintptr(fd), name) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index daa197437..df9907e52 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -51,6 +51,8 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", "//pkg/sentry/loader", + "//pkg/sentry/memutil", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", "//pkg/sentry/platform/ptrace", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index a864be720..14e1eba5b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -332,6 +332,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { k := &kernel.Kernel{ Platform: p, } + mf, err := createMemoryFile() + if err != nil { + return fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) cm.l.k = k // Set up the restore environment. @@ -362,7 +367,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { loadOpts := state.LoadOpts{ Source: o.FilePayload.Files[0], } - if err := loadOpts.Load(k, p, networkStack); err != nil { + if err := loadOpts.Load(k, networkStack); err != nil { return err } @@ -384,7 +389,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.mu.Lock() eid := execID{cid: o.SandboxID} cm.l.processes = map[execID]*execProcess{ - eid: &execProcess{ + eid: { tg: cm.l.k.GlobalInit(), }, } diff --git a/runsc/boot/events.go b/runsc/boot/events.go index f954b8c0b..717adfedd 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -68,7 +68,7 @@ func (cm *containerManager) Event(_ *struct{}, out *Event) error { } func (s *Stats) populateMemory(k *kernel.Kernel) { - mem := k.Platform.Memory() + mem := k.MemoryFile() mem.UpdateUsage() _, totalUsage := usage.MemoryAccounting.Copy() s.Memory.Usage = MemoryEntry{ diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9ebe64dce..56cb137f0 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -37,6 +37,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" @@ -189,6 +191,13 @@ func New(args Args) (*Loader, error) { Platform: p, } + // Create memory file. + mf, err := createMemoryFile() + if err != nil { + return nil, fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. 
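For context on the createMemoryFile calls added above: memutil.CreateMemFD is presumably a thin wrapper around memfd_create(2), the same syscall whose ENOSYS special-casing was just removed from usage.Init. A self-contained sketch of creating such a descriptor, using golang.org/x/sys/unix rather than the in-tree helper (that substitution is an assumption of this sketch):

    package main

    import (
        "fmt"
        "os"

        "golang.org/x/sys/unix"
    )

    func main() {
        // memfd_create(2) needs Linux 3.17+; the name is only a debugging label
        // that shows up under /proc/<pid>/fd.
        fd, err := unix.MemfdCreate("runsc-memory-sketch", 0)
        if err != nil {
            panic(err)
        }
        f := os.NewFile(uintptr(fd), "runsc-memory-sketch")
        defer f.Close()

        // The file starts empty; in the patch it is handed to pgalloc.NewMemoryFile,
        // which grows and maps it as pages are allocated.
        fi, err := f.Stat()
        if err != nil {
            panic(err)
        }
        fmt.Println("memfd created, initial size:", fi.Size())
    }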
@@ -297,7 +306,7 @@ func New(args Args) (*Loader, error) { stdioFDs: args.StdioFDs, rootProcArgs: procArgs, sandboxID: args.ID, - processes: map[execID]*execProcess{eid: &execProcess{}}, + processes: map[execID]*execProcess{eid: {}}, } // We don't care about child signals; some platforms can generate a @@ -404,6 +413,21 @@ func createPlatform(conf *Config, deviceFD int) (platform.Platform, error) { } } +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "runsc-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} + // Run runs the root container.. func (l *Loader) Run() error { err := l.run() -- cgit v1.2.3 From e420cc3e5d2066674d32d16ad885bee6b30da210 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 18 Mar 2019 12:29:43 -0700 Subject: Add support for mount propagation Properly handle propagation options for root and mounts. Now usage of mount options shared, rshared, and noexec cause error to start. shared/ rshared breaks sandbox=>host isolation. slave however can be supported because changes propagate from host to sandbox. Root FS setup moved inside the gofer. Apart from simplifying the code, it keeps all mounts inside the namespace. And they are torn down when the namespace is destroyed (DestroyFS is no longer needed). PiperOrigin-RevId: 239037661 Change-Id: I8b5ee4d50da33c042ea34fa68e56514ebe20e6e0 --- runsc/cmd/BUILD | 1 + runsc/cmd/boot.go | 16 +++ runsc/cmd/gofer.go | 279 +++++++++++++++++++++++++++++++----- runsc/cmd/gofer_test.go | 164 ++++++++++++++++++++++ runsc/container/BUILD | 2 - runsc/container/container.go | 75 ++++------ runsc/container/container_test.go | 165 ++++++++++++++++++++++ runsc/container/fs.go | 287 -------------------------------------- runsc/container/fs_test.go | 158 --------------------- runsc/sandbox/sandbox.go | 23 +-- runsc/specutils/BUILD | 1 + runsc/specutils/fs.go | 139 ++++++++++++++++++ runsc/specutils/namespace.go | 16 +-- runsc/specutils/specutils.go | 52 ++++--- runsc/specutils/specutils_test.go | 31 ++++ 15 files changed, 834 insertions(+), 575 deletions(-) create mode 100644 runsc/cmd/gofer_test.go delete mode 100644 runsc/container/fs.go delete mode 100644 runsc/container/fs_test.go create mode 100644 runsc/specutils/fs.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 9e2be0d37..dabf18c5f 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -60,6 +60,7 @@ go_test( "capability_test.go", "delete_test.go", "exec_test.go", + "gofer_test.go", ], data = [ "//runsc", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 3039b389f..ff2fa2fb9 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -76,6 +76,11 @@ type Boot struct { // startSyncFD is the file descriptor to synchronize runsc and sandbox. startSyncFD int + // mountsFD is the file descriptor to read list of mounts after they have + // been resolved (direct paths, no symlinks). They are resolved outside the + // sandbox (e.g. gofer) and sent through this FD. + mountsFD int + // pidns is set if the sanadbox is in its own pid namespace. 
pidns bool } @@ -111,6 +116,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") + f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -191,6 +197,16 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("setCapsAndCallSelf must never return success") } + // Read resolved mount list and replace the original one from the spec. + mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file") + cleanMounts, err := specutils.ReadMounts(mountsFile) + if err != nil { + mountsFile.Close() + Fatalf("Error reading mounts file: %v", err) + } + mountsFile.Close() + spec.Mounts = cleanMounts + // Create the loader. bootArgs := boot.Args{ ID: f.Arg(0), diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 6f9711518..e712244ef 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,7 +16,11 @@ package cmd import ( "context" + "encoding/json" + "fmt" "os" + "path/filepath" + "strings" "sync" "syscall" @@ -59,6 +63,7 @@ type Gofer struct { panicOnWrite bool specFD int + mountsFD int } // Name implements subcommands.Command. @@ -84,6 +89,7 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") + f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") } // Execute implements subcommands.Command. @@ -100,45 +106,13 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("reading spec: %v", err) } - // Find what path is going to be served by this gofer. - root := spec.Root.Path - conf := args[0].(*boot.Config) - if g.setUpRoot && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - // Convert all shared mounts into slave to be sure that nothing will be - // propagated outside of our namespace. - if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { - Fatalf("error converting mounts: %v", err) - } - - // FIXME: runsc can't be re-executed without - // /proc, so we create a tmpfs mount, mount ./proc and ./root - // there, then move this mount to the root and after - // setCapsAndCallSelf, runsc will chroot into /root. - // - // We need a directory to construct a new root and we know that - // runsc can't start without /proc, so we can use it for this. 
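The new --mounts-fd plumbing above hands the boot process a pipe on which the gofer later writes the resolved mount list as JSON. specutils.ReadMounts itself is not part of this patch, so the following is only a sketch of what the read side plausibly looks like; the helper name readMounts and the fixed descriptor number are assumptions for illustration.

    package main

    import (
        "encoding/json"
        "fmt"
        "os"

        specs "github.com/opencontainers/runtime-spec/specs-go"
    )

    // readMounts decodes a JSON-encoded []specs.Mount from the donated descriptor,
    // matching the json.Marshal performed by the gofer's writeMounts.
    func readMounts(f *os.File) ([]specs.Mount, error) {
        var mounts []specs.Mount
        if err := json.NewDecoder(f).Decode(&mounts); err != nil {
            return nil, fmt.Errorf("decoding mounts: %v", err)
        }
        return mounts, nil
    }

    func main() {
        // The descriptor number arrives via --mounts-fd; files donated through
        // exec.Cmd.ExtraFiles appear in the child starting at descriptor 3.
        const mountsFD = 3 // illustrative value only
        f := os.NewFile(uintptr(mountsFD), "mounts file")
        defer f.Close()

        mounts, err := readMounts(f)
        if err != nil {
            panic(err)
        }
        for _, m := range mounts {
            fmt.Printf("%s -> %s (%s)\n", m.Source, m.Destination, m.Type)
        }
    }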
- flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC) - if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil { - Fatalf("error mounting tmpfs: %v", err) - } - os.Mkdir("/proc/proc", 0755) - os.Mkdir("/proc/root", 0755) - if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil { - Fatalf("error mounting proc: %v", err) - } - if err := syscall.Mount(root, "/proc/root", "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { - Fatalf("error mounting root: %v", err) - } - if err := pivotRoot("/proc"); err != nil { - Fatalf("faild to change the root file system: %v", err) - } - if err := os.Chdir("/"); err != nil { - Fatalf("failed to change working directory") + if g.setUpRoot { + if err := setupRootFS(spec, conf); err != nil { + Fatalf("Error setting up root FS: %v", err) } } - if g.applyCaps { // Disable caps when calling myself again. // Note: minimal argument handling for the default case to keep it simple. @@ -150,15 +124,34 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("unreachable") } + // Find what path is going to be served by this gofer. + root := spec.Root.Path + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + root = "/root" + } + + // Resolve mount points paths, then replace mounts from our spec and send the + // mount list over to the sandbox, so they are both in sync. + // + // Note that all mount points have been mounted in the proper location in + // setupRootFS(). + cleanMounts, err := resolveMounts(spec.Mounts, root) + if err != nil { + Fatalf("Failure to resolve mounts: %v", err) + } + spec.Mounts = cleanMounts + go func() { + if err := g.writeMounts(cleanMounts); err != nil { + panic(fmt.Sprintf("Failed to write mounts: %v", err)) + } + }() + specutils.LogSpec(spec) // fsgofer should run with a umask of 0, because we want to preserve file // modes exactly as sent by the sandbox, which will have applied its own umask. syscall.Umask(0) - if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - root = "/root" - } if err := syscall.Chroot(root); err != nil { Fatalf("failed to chroot to %q: %v", root, err) } @@ -232,6 +225,25 @@ func runServers(ats []p9.Attacher, ioFDs []int) { log.Infof("All 9P servers exited.") } +func (g *Gofer) writeMounts(mounts []specs.Mount) error { + bytes, err := json.Marshal(mounts) + if err != nil { + return err + } + + f := os.NewFile(uintptr(g.mountsFD), "mounts file") + defer f.Close() + + for written := 0; written < len(bytes); { + w, err := f.Write(bytes[written:]) + if err != nil { + return err + } + written += w + } + return nil +} + func isReadonlyMount(opts []string) bool { for _, o := range opts { if o == "ro" { @@ -240,3 +252,194 @@ func isReadonlyMount(opts []string) bool { } return false } + +func setupRootFS(spec *specs.Spec, conf *boot.Config) error { + // Convert all shared mounts into slaves to be sure that nothing will be + // propagated outside of our namespace. + if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + Fatalf("error converting mounts: %v", err) + } + + root := spec.Root.Path + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + // FIXME: runsc can't be re-executed without + // /proc, so we create a tmpfs mount, mount ./proc and ./root + // there, then move this mount to the root and after + // setCapsAndCallSelf, runsc will chroot into /root. 
+ // + // We need a directory to construct a new root and we know that + // runsc can't start without /proc, so we can use it for this. + flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC) + if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil { + Fatalf("error mounting tmpfs: %v", err) + } + + // Prepare tree structure for pivot_root(2). + os.Mkdir("/proc/proc", 0755) + os.Mkdir("/proc/root", 0755) + if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil { + Fatalf("error mounting proc: %v", err) + } + root = "/proc/root" + } + + // Mount root path followed by submounts. + if err := syscall.Mount(spec.Root.Path, root, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting root on root (%q) err: %v", spec.Root.Path, err) + } + flags := uint32(syscall.MS_SLAVE | syscall.MS_REC) + if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { + flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation}) + } + if err := syscall.Mount("", spec.Root.Path, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", spec.Root.Path, flags, err) + } + + // Replace the current spec, with the clean spec with symlinks resolved. + if err := setupMounts(spec.Mounts, root); err != nil { + Fatalf("error setting up FS: %v", err) + } + + // Create working directory if needed. + if spec.Process.Cwd != "" { + dst, err := resolveSymlinks(root, spec.Process.Cwd) + if err != nil { + return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) + } + if err := os.MkdirAll(dst, 0755); err != nil { + return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err) + } + } + + // Check if root needs to be remounted as readonly. + if spec.Root.Readonly { + // If root is a mount point but not read-only, we can change mount options + // to make it read-only for extra safety. + log.Infof("Remounting root as readonly: %q", spec.Root.Path) + flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) + if err := syscall.Mount(spec.Root.Path, spec.Root.Path, "bind", flags, ""); err != nil { + return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) + } + } + + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + if err := pivotRoot("/proc"); err != nil { + Fatalf("faild to change the root file system: %v", err) + } + if err := os.Chdir("/"); err != nil { + Fatalf("failed to change working directory") + } + } + return nil +} + +// setupMounts binds mount all mounts specified in the spec in their correct +// location inside root. It will resolve relative paths and symlinks. It also +// creates directories as needed. +func setupMounts(mounts []specs.Mount, root string) error { + for _, m := range mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + continue + } + + dst, err := resolveSymlinks(root, m.Destination) + if err != nil { + return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) + } + + flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND + log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) + if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { + return fmt.Errorf("mounting %v: %v", m, err) + } + + // Set propagation options that cannot be set together with other options. 
+ flags = specutils.PropOptionsToFlags(m.Options) + if flags != 0 { + if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err) + } + } + } + return nil +} + +// resolveMounts resolved relative paths and symlinks to mount points. +// +// Note: mount points must already be in place for resolution to work. +// Otherwise, it may follow symlinks to locations that would be overwritten +// with another mount point and return the wrong location. In short, make sure +// setupMounts() has been called before. +func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) { + cleanMounts := make([]specs.Mount, 0, len(mounts)) + for _, m := range mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + cleanMounts = append(cleanMounts, m) + continue + } + dst, err := resolveSymlinks(root, m.Destination) + if err != nil { + return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) + } + relDst, err := filepath.Rel(root, dst) + if err != nil { + panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err)) + } + cpy := m + cpy.Destination = filepath.Join("/", relDst) + cleanMounts = append(cleanMounts, cpy) + } + return cleanMounts, nil +} + +// ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are +// symlinks, they are evaluated relative to 'root' to ensure the end result is +// the same as if the process was running inside the container. +func resolveSymlinks(root, rel string) (string, error) { + return resolveSymlinksImpl(root, root, rel, 255) +} + +func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { + if followCount == 0 { + return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) + } + + rel = filepath.Clean(rel) + for _, name := range strings.Split(rel, string(filepath.Separator)) { + if name == "" { + continue + } + // Note that Join() resolves things like ".." and returns a clean path. + path := filepath.Join(base, name) + if !strings.HasPrefix(path, root) { + // One cannot '..' their way out of root. + path = root + continue + } + fi, err := os.Lstat(path) + if err != nil { + if !os.IsNotExist(err) { + return "", err + } + // Not found means there is no symlink to check. Just keep walking dirs. + base = path + continue + } + if fi.Mode()&os.ModeSymlink != 0 { + link, err := os.Readlink(path) + if err != nil { + return "", err + } + if filepath.IsAbs(link) { + base = root + } + base, err = resolveSymlinksImpl(root, base, link, followCount-1) + if err != nil { + return "", err + } + continue + } + base = path + } + return base, nil +} diff --git a/runsc/cmd/gofer_test.go b/runsc/cmd/gofer_test.go new file mode 100644 index 000000000..8e692feb9 --- /dev/null +++ b/runsc/cmd/gofer_test.go @@ -0,0 +1,164 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
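As the comment in setupMounts above notes, propagation options cannot be set together with the other mount flags, which is why the gofer issues them as a second, standalone mount(2) call after the bind. A minimal sketch of that two-step pattern (paths are illustrative, both directories must already exist, and it needs CAP_SYS_ADMIN):

    package main

    import (
        "log"
        "syscall"
    )

    // bindWithPropagation bind mounts src onto dst, then changes the new mount's
    // propagation type in a separate call, mirroring setupMounts above.
    func bindWithPropagation(src, dst string, propagation uintptr) error {
        if err := syscall.Mount(src, dst, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
            return err
        }
        // Propagation flags (MS_SLAVE, MS_SHARED, MS_PRIVATE) go in their own call.
        return syscall.Mount("", dst, "", propagation, "")
    }

    func main() {
        if err := bindWithPropagation("/mnt/src", "/mnt/dst", syscall.MS_SLAVE); err != nil {
            log.Fatal(err)
        }
        log.Println("bound /mnt/src on /mnt/dst and marked it slave")
    }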
+ +package cmd + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "testing" +) + +func tmpDir() string { + dir := os.Getenv("TEST_TMPDIR") + if dir == "" { + dir = "/tmp" + } + return dir +} + +type dir struct { + rel string + link string +} + +func construct(root string, dirs []dir) error { + for _, d := range dirs { + p := path.Join(root, d.rel) + if d.link == "" { + if err := os.MkdirAll(p, 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + } else { + if err := os.MkdirAll(path.Dir(p), 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + if err := os.Symlink(d.link, p); err != nil { + return fmt.Errorf("error creating symlink: %v", err) + } + } + } + return nil +} + +func TestResolveSymlinks(t *testing.T) { + root, err := ioutil.TempDir(tmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"dir1/dir11/dir111/dir1111", ""}, // Just a boring dir + {"dir1/lnk12", "dir11"}, // Link to sibling + {"dir1/lnk13", "./dir11"}, // Link to sibling through self + {"dir1/lnk14", "../dir1/dir11"}, // Link to sibling through parent + {"dir1/dir15/lnk151", ".."}, // Link to parent + {"dir1/lnk16", "dir11/dir111"}, // Link to child + {"dir1/lnk17", "."}, // Link to self + {"dir1/lnk18", "lnk13"}, // Link to link + {"lnk2", "dir1/lnk13"}, // Link to link to link + {"dir3/dir21/lnk211", "../.."}, // Link to root relative + {"dir3/lnk22", "/"}, // Link to root absolute + {"dir3/lnk23", "/dir1"}, // Link to dir absolute + {"dir3/lnk24", "/dir1/lnk12"}, // Link to link absolute + {"lnk5", "../../.."}, // Link outside root + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + + tests := []struct { + name string + rel string + want string + compareHost bool + }{ + {name: "root", rel: "/", want: "/", compareHost: true}, + {name: "basic dir", rel: "/dir1/dir11/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 1", rel: "/dir1/dir11/./dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 2", rel: "/dir1/././dir11/./././././dir111/.", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dotdot 1", rel: "/dir1/dir11/../dir15", want: "/dir1/dir15", compareHost: true}, + {name: "dotdot 2", rel: "/dir1/dir11/dir1111/../..", want: "/dir1", compareHost: true}, + + {name: "link sibling", rel: "/dir1/lnk12", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling + dir", rel: "/dir1/lnk12/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link sibling through self", rel: "/dir1/lnk13", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling through parent", rel: "/dir1/lnk14", want: "/dir1/dir11", compareHost: true}, + + {name: "link parent", rel: "/dir1/dir15/lnk151", want: "/dir1", compareHost: true}, + {name: "link parent + dir", rel: "/dir1/dir15/lnk151/dir11", want: "/dir1/dir11", compareHost: true}, + {name: "link child", rel: "/dir1/lnk16", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link child + dir", rel: "/dir1/lnk16/dir1111", want: "/dir1/dir11/dir111/dir1111", compareHost: true}, + {name: "link self", rel: "/dir1/lnk17", want: "/dir1", compareHost: true}, + {name: "link self + dir", rel: "/dir1/lnk17/dir11", want: "/dir1/dir11", compareHost: true}, + + {name: "link^2", rel: "/dir1/lnk18", want: "/dir1/dir11", compareHost: true}, + {name: "link^2 + dir", rel: "/dir1/lnk18/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link^3", rel: "/lnk2", want: 
"/dir1/dir11", compareHost: true}, + {name: "link^3 + dir", rel: "/lnk2/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + + {name: "link abs", rel: "/dir3/lnk23", want: "/dir1"}, + {name: "link abs + dir", rel: "/dir3/lnk23/dir11", want: "/dir1/dir11"}, + {name: "link^2 abs", rel: "/dir3/lnk24", want: "/dir1/dir11"}, + {name: "link^2 abs + dir", rel: "/dir3/lnk24/dir111", want: "/dir1/dir11/dir111"}, + + {name: "root link rel", rel: "/dir3/dir21/lnk211", want: "/", compareHost: true}, + {name: "root link abs", rel: "/dir3/lnk22", want: "/"}, + {name: "root contain link", rel: "/lnk5/dir1", want: "/dir1"}, + {name: "root contain dotdot", rel: "/dir1/dir11/../../../../../../../..", want: "/"}, + + {name: "crazy", rel: "/dir3/dir21/lnk211/dir3/lnk22/dir1/dir11/../../lnk5/dir3/../dir3/lnk24/dir111/dir1111/..", want: "/dir1/dir11/dir111"}, + } + for _, tst := range tests { + t.Run(tst.name, func(t *testing.T) { + got, err := resolveSymlinks(root, tst.rel) + if err != nil { + t.Errorf("resolveSymlinks(root, %q) failed: %v", tst.rel, err) + } + want := path.Join(root, tst.want) + if got != want { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, got, want) + } + if tst.compareHost { + // Check that host got to the same end result. + host, err := filepath.EvalSymlinks(path.Join(root, tst.rel)) + if err != nil { + t.Errorf("path.EvalSymlinks(root, %q) failed: %v", tst.rel, err) + } + if host != got { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, host, got) + } + } + }) + } +} + +func TestResolveSymlinksLoop(t *testing.T) { + root, err := ioutil.TempDir(tmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"loop1", "loop2"}, + {"loop2", "loop1"}, + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + if _, err := resolveSymlinks(root, "loop1"); err == nil { + t.Errorf("resolveSymlinks() should have failed") + } +} diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 3b25ff79a..2936b7cdf 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -6,7 +6,6 @@ go_library( name = "container", srcs = [ "container.go", - "fs.go", "hook.go", "status.go", ], @@ -34,7 +33,6 @@ go_test( srcs = [ "console_test.go", "container_test.go", - "fs_test.go", "multi_container_test.go", "shared_volume_test.go", ], diff --git a/runsc/container/container.go b/runsc/container/container.go index 6f092a5ce..fdcf8d7b7 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -281,18 +281,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo if specutils.ShouldCreateSandbox(spec) { log.Debugf("Creating new sandbox for container %q", id) - // Setup rootfs and mounts. It returns a new mount list with destination - // paths resolved. Since the spec for the root container is read from disk, - // Write the new spec to a new file that will be used by the sandbox. - cleanMounts, err := setupFS(spec, conf, bundleDir) - if err != nil { - return nil, fmt.Errorf("setup mounts: %v", err) - } - spec.Mounts = cleanMounts - if err := specutils.WriteCleanSpec(bundleDir, spec); err != nil { - return nil, fmt.Errorf("writing clean spec: %v", err) - } - // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all tneir children processes). 
cg, err := cgroup.New(spec) @@ -306,14 +294,14 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } } if err := runInCgroup(cg, func() error { - ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) + ioFiles, specFile, err := c.createGoferProcess(spec, conf, bundleDir) if err != nil { return err } // Start a new sandbox for this container. Any errors after this point // must destroy the container. - c.Sandbox, err = sandbox.New(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles, cg) + c.Sandbox, err = sandbox.New(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles, specFile, cg) return err }); err != nil { return nil, err @@ -387,26 +375,22 @@ func (c *Container) Start(conf *boot.Config) error { return err } } else { - // Setup rootfs and mounts. It returns a new mount list with destination - // paths resolved. Replace the original spec with new mount list and start - // container. - cleanMounts, err := setupFS(c.Spec, conf, c.BundleDir) - if err != nil { - return fmt.Errorf("setup mounts: %v", err) - } - c.Spec.Mounts = cleanMounts - if err := specutils.WriteCleanSpec(c.BundleDir, c.Spec); err != nil { - return fmt.Errorf("writing clean spec: %v", err) - } - // Join cgroup to strt gofer process to ensure it's part of the cgroup from // the start (and all tneir children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) if err != nil { return err } + defer mountsFile.Close() + + cleanMounts, err := specutils.ReadMounts(mountsFile) + if err != nil { + return fmt.Errorf("reading mounts file: %v", err) + } + c.Spec.Mounts = cleanMounts + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) }); err != nil { return err @@ -665,12 +649,6 @@ func (c *Container) Destroy() error { errs = append(errs, err.Error()) } - if err := destroyFS(c.Spec); err != nil { - err = fmt.Errorf("destroying container fs: %v", err) - log.Warningf("%v", err) - errs = append(errs, err.Error()) - } - if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err) log.Warningf("%v", err) @@ -787,7 +765,7 @@ func (c *Container) waitForStopped() error { return backoff.Retry(op, b) } -func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) { // Start with the general config flags. 
args := conf.ToFlags() @@ -800,7 +778,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if conf.LogFilename != "" { logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) if err != nil { - return nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) + return nil, nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) } defer logFile.Close() goferEnds = append(goferEnds, logFile) @@ -811,7 +789,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if conf.DebugLog != "" { debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer") if err != nil { - return nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) + return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) } defer debugLogFile.Close() goferEnds = append(goferEnds, debugLogFile) @@ -825,30 +803,39 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund } // Open the spec file to donate to the sandbox. - specFile, err := specutils.OpenCleanSpec(bundleDir) + specFile, err := specutils.OpenSpec(bundleDir) if err != nil { - return nil, fmt.Errorf("opening spec file: %v", err) + return nil, nil, fmt.Errorf("opening spec file: %v", err) } defer specFile.Close() goferEnds = append(goferEnds, specFile) args = append(args, "--spec-fd="+strconv.Itoa(nextFD)) nextFD++ + // Create pipe that allows gofer to send mount list to sandbox after all paths + // have been resolved. + mountsSand, mountsGofer, err := os.Pipe() + if err != nil { + return nil, nil, err + } + defer mountsGofer.Close() + goferEnds = append(goferEnds, mountsGofer) + args = append(args, fmt.Sprintf("--mounts-fd=%d", nextFD)) + nextFD++ + // Add root mount and then add any other additional mounts. mountCount := 1 - - // Add additional mounts. for _, m := range spec.Mounts { if specutils.Is9PMount(m) { mountCount++ } } - sandEnds := make([]*os.File, 0, mountCount) + sandEnds := make([]*os.File, 0, mountCount) for i := 0; i < mountCount; i++ { fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) if err != nil { - return nil, err + return nil, nil, err } sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) @@ -884,12 +871,12 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) if err := specutils.StartInNS(cmd, nss); err != nil { - return nil, err + return nil, nil, err } log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid c.goferIsChild = true - return sandEnds, nil + return sandEnds, mountsSand, nil } // changeStatus transitions from one status to another ensuring that the diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 06a25de6d..f17155175 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1594,6 +1594,171 @@ func TestCreateWorkingDir(t *testing.T) { } } +// TestMountPropagation verifies that mount propagates to slave but not to +// private mounts. 
+func TestMountPropagation(t *testing.T) { + // Setup dir structure: + // - src: is mounted as shared and is used as source for both private and + // slave mounts + // - dir: will be bind mounted inside src and should propagate to slave + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + src := filepath.Join(tmpDir, "src") + srcMnt := filepath.Join(src, "mnt") + dir := filepath.Join(tmpDir, "dir") + for _, path := range []string{src, srcMnt, dir} { + if err := os.MkdirAll(path, 0777); err != nil { + t.Fatalf("MkdirAll(%q): %v", path, err) + } + } + dirFile := filepath.Join(dir, "file") + f, err := os.Create(dirFile) + if err != nil { + t.Fatalf("os.Create(%q): %v", dirFile, err) + } + f.Close() + + // Setup src as a shared mount. + if err := syscall.Mount(src, src, "bind", syscall.MS_BIND, ""); err != nil { + t.Fatalf("mount(%q, %q, MS_BIND): %v", dir, srcMnt, err) + } + if err := syscall.Mount("", src, "", syscall.MS_SHARED, ""); err != nil { + t.Fatalf("mount(%q, MS_SHARED): %v", srcMnt, err) + } + + spec := testutil.NewSpecWithArgs("sleep", "1000") + + priv := filepath.Join(tmpDir, "priv") + slave := filepath.Join(tmpDir, "slave") + spec.Mounts = []specs.Mount{ + { + Source: src, + Destination: priv, + Type: "bind", + Options: []string{"private"}, + }, + { + Source: src, + Destination: slave, + Type: "bind", + Options: []string{"slave"}, + }, + } + + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + // After the container is started, mount dir inside source and check what + // happens to both destinations. + if err := syscall.Mount(dir, srcMnt, "bind", syscall.MS_BIND, ""); err != nil { + t.Fatalf("mount(%q, %q, MS_BIND): %v", dir, srcMnt, err) + } + + // Check that mount didn't propagate to private mount. + privFile := filepath.Join(priv, "mnt", "file") + args := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "!", "-f", privFile}, + } + if ws, err := cont.executeSync(args); err != nil || ws != 0 { + t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err) + } + + // Check that mount propagated to slave mount. 
+ slaveFile := filepath.Join(slave, "mnt", "file") + args = &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", slaveFile}, + } + if ws, err := cont.executeSync(args); err != nil || ws != 0 { + t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err) + } +} + +func TestMountSymlink(t *testing.T) { + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + + dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + + source := path.Join(dir, "source") + target := path.Join(dir, "target") + for _, path := range []string{source, target} { + if err := os.MkdirAll(path, 0777); err != nil { + t.Fatalf("os.MkdirAll(): %v", err) + } + } + f, err := os.Create(path.Join(source, "file")) + if err != nil { + t.Fatalf("os.Create(): %v", err) + } + f.Close() + + link := path.Join(dir, "link") + if err := os.Symlink(target, link); err != nil { + t.Fatalf("os.Symlink(%q, %q): %v", target, link, err) + } + + spec := testutil.NewSpecWithArgs("/bin/sleep", "1000") + + // Mount to a symlink to ensure the mount code will follow it and mount + // at the symlink target. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: link, + Source: source, + }) + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + // Check that symlink was resolved and mount was created where the symlink + // is pointing to. + file := path.Join(target, "file") + args := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", file}, + } + if ws, err := cont.executeSync(args); err != nil || ws != 0 { + t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) + } + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/fs.go b/runsc/container/fs.go deleted file mode 100644 index 998160487..000000000 --- a/runsc/container/fs.go +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
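TestMountPropagation above exercises the kernel behavior the commit message depends on: mounts made under a shared source propagate into a slave copy but never into a private one, and nothing propagates back toward the host. The same setup, reduced to a standalone host-side sketch (illustrative /tmp paths, requires root):

    package main

    import (
        "fmt"
        "os"
        "syscall"
    )

    func must(err error) {
        if err != nil {
            panic(err)
        }
    }

    func main() {
        const (
            src   = "/tmp/prop/src"
            slave = "/tmp/prop/slave"
            dir   = "/tmp/prop/dir"
        )
        for _, p := range []string{src + "/mnt", slave, dir} {
            must(os.MkdirAll(p, 0755))
        }
        f, err := os.Create(dir + "/file")
        must(err)
        f.Close()

        // Make src a shared mount point, then bind it elsewhere and mark the copy
        // as a slave of src's peer group.
        must(syscall.Mount(src, src, "bind", syscall.MS_BIND, ""))
        must(syscall.Mount("", src, "", syscall.MS_SHARED, ""))
        must(syscall.Mount(src, slave, "bind", syscall.MS_BIND, ""))
        must(syscall.Mount("", slave, "", syscall.MS_SLAVE, ""))

        // A mount made under the shared source...
        must(syscall.Mount(dir, src+"/mnt", "bind", syscall.MS_BIND, ""))

        // ...propagates into the slave copy, which is the host -> sandbox
        // direction the commit message allows; the reverse does not happen.
        if _, err := os.Stat(slave + "/mnt/file"); err == nil {
            fmt.Println("mount propagated into the slave copy")
        }
    }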
- -package container - -import ( - "bufio" - "fmt" - "os" - "path/filepath" - "strings" - "syscall" - - specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/specutils" -) - -type mapping struct { - set bool - val uint32 -} - -var optionsMap = map[string]mapping{ - "acl": {set: true, val: syscall.MS_POSIXACL}, - "async": {set: false, val: syscall.MS_SYNCHRONOUS}, - "atime": {set: false, val: syscall.MS_NOATIME}, - "bind": {set: true, val: syscall.MS_BIND}, - "defaults": {set: true, val: 0}, - "dev": {set: false, val: syscall.MS_NODEV}, - "diratime": {set: false, val: syscall.MS_NODIRATIME}, - "dirsync": {set: true, val: syscall.MS_DIRSYNC}, - "exec": {set: false, val: syscall.MS_NOEXEC}, - "iversion": {set: true, val: syscall.MS_I_VERSION}, - "loud": {set: false, val: syscall.MS_SILENT}, - "mand": {set: true, val: syscall.MS_MANDLOCK}, - "noacl": {set: false, val: syscall.MS_POSIXACL}, - "noatime": {set: true, val: syscall.MS_NOATIME}, - "nodev": {set: true, val: syscall.MS_NODEV}, - "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, - "noexec": {set: true, val: syscall.MS_NOEXEC}, - "noiversion": {set: false, val: syscall.MS_I_VERSION}, - "nomand": {set: false, val: syscall.MS_MANDLOCK}, - "norelatime": {set: false, val: syscall.MS_RELATIME}, - "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, - "nosuid": {set: true, val: syscall.MS_NOSUID}, - "private": {set: true, val: syscall.MS_PRIVATE}, - "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, - "relatime": {set: true, val: syscall.MS_RELATIME}, - "remount": {set: true, val: syscall.MS_REMOUNT}, - "ro": {set: true, val: syscall.MS_RDONLY}, - "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, - "rw": {set: false, val: syscall.MS_RDONLY}, - "silent": {set: true, val: syscall.MS_SILENT}, - "strictatime": {set: true, val: syscall.MS_STRICTATIME}, - "suid": {set: false, val: syscall.MS_NOSUID}, - "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, -} - -// setupFS creates the container directory structure under 'spec.Root.Path'. -// This allows the gofer serving the containers to be chroot under this -// directory to create an extra layer to security in case the gofer gets -// compromised. -// Returns list of mounts equivalent to 'spec.Mounts' with all destination paths -// cleaned and with symlinks resolved. -func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mount, error) { - rv := make([]specs.Mount, 0, len(spec.Mounts)) - for _, m := range spec.Mounts { - if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { - rv = append(rv, m) - continue - } - - // It's possible that 'm.Destination' follows symlinks inside the - // container. - dst, err := resolveSymlinks(spec.Root.Path, m.Destination) - if err != nil { - return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) - } - - flags := optionsToFlags(m.Options) - flags |= syscall.MS_BIND - log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) - if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { - return nil, fmt.Errorf("mounting %v: %v", m, err) - } - - // Make the mount a slave, so that for recursive bind mount, umount won't - // propagate to the source. 
- flags = syscall.MS_SLAVE | syscall.MS_REC - if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { - return nil, fmt.Errorf("mount rslave dst: %q, flags: %#x, err: %v", dst, flags, err) - } - - cpy := m - relDst, err := filepath.Rel(spec.Root.Path, dst) - if err != nil { - panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, spec.Root.Path, err)) - } - cpy.Destination = filepath.Join("/", relDst) - rv = append(rv, cpy) - } - - if spec.Process.Cwd != "" { - dst, err := resolveSymlinks(spec.Root.Path, spec.Process.Cwd) - if err != nil { - return nil, fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) - } - if err := os.MkdirAll(dst, 0755); err != nil { - return nil, err - } - } - - // If root is read only, check if it needs to be remounted as readonly. - if spec.Root.Readonly { - isMountPoint, readonly, err := mountInfo(spec.Root.Path) - if err != nil { - return nil, err - } - if readonly { - return rv, nil - } - if !isMountPoint { - // Readonly root is not a mount point nor read-only. Can't do much other - // than just logging a warning. The gofer will prevent files to be open - // in write mode. - log.Warningf("Mount where root is located is not read-only and cannot be changed: %q", spec.Root.Path) - return rv, nil - } - - // If root is a mount point but not read-only, we can change mount options - // to make it read-only for extra safety. - log.Infof("Remounting root as readonly: %q", spec.Root.Path) - flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) - src := spec.Root.Path - if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { - return nil, fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) - } - } - return rv, nil -} - -// mountInfo returns whether the path is a mount point and whether the mount -// that path belongs to is read-only. -func mountInfo(path string) (bool, bool, error) { - // Mounts are listed by their real paths. - realPath, err := filepath.EvalSymlinks(path) - if err != nil { - return false, false, err - } - f, err := os.Open("/proc/mounts") - if err != nil { - return false, false, err - } - scanner := bufio.NewScanner(f) - - var mountPoint string - var readonly bool - for scanner.Scan() { - line := scanner.Text() - parts := strings.Split(line, " ") - if len(parts) < 4 { - return false, false, fmt.Errorf("invalid /proc/mounts line format %q", line) - } - mp := parts[1] - opts := strings.Split(parts[3], ",") - - // Find the closest submount to the path. - if strings.Contains(realPath, mp) && len(mp) > len(mountPoint) { - mountPoint = mp - readonly = specutils.ContainsStr(opts, "ro") - } - } - if err := scanner.Err(); err != nil { - return false, false, err - } - return mountPoint == realPath, readonly, nil -} - -// destroyFS unmounts mounts done by runsc under `spec.Root.Path`. This -// recovers the container rootfs into the original state. -func destroyFS(spec *specs.Spec) error { - for _, m := range spec.Mounts { - if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { - continue - } - - // It's possible that 'm.Destination' follows symlinks inside the - // container. - dst, err := resolveSymlinks(spec.Root.Path, m.Destination) - if err != nil { - return err - } - - flags := syscall.MNT_DETACH - log.Infof("Unmounting dst: %q, flags: %#x", dst, flags) - // Do not return error if dst is not a mountpoint. 
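mountInfo above answers two questions by scanning /proc/mounts: is the path itself a mount point, and is the closest enclosing mount read-only. A standalone, simplified version of that scan is sketched below; it uses a prefix match rather than the substring check above and ignores the escaping of special characters in mount paths, so it is an illustration of the idea rather than the runsc implementation:

```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// mountInfo reports whether path is itself a mount point and whether the
// closest enclosing mount is mounted read-only.
func mountInfo(path string) (isMountPoint, readonly bool, err error) {
	realPath, err := filepath.EvalSymlinks(path)
	if err != nil {
		return false, false, err
	}
	f, err := os.Open("/proc/mounts")
	if err != nil {
		return false, false, err
	}
	defer f.Close()

	var best string
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		// Format: device mountpoint fstype options dump pass
		parts := strings.Fields(scanner.Text())
		if len(parts) < 4 {
			continue
		}
		mp, opts := parts[1], strings.Split(parts[3], ",")
		// Keep the longest mount point that is a prefix of the path.
		if strings.HasPrefix(realPath, mp) && len(mp) > len(best) {
			best = mp
			readonly = false
			for _, o := range opts {
				if o == "ro" {
					readonly = true
				}
			}
		}
	}
	return best == realPath, readonly, scanner.Err()
}

func main() {
	mp, ro, err := mountInfo("/proc")
	fmt.Println(mp, ro, err) // e.g. "true false <nil>"
}
```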
- // Based on http://man7.org/linux/man-pages/man2/umount.2.html - // For kernel version 2.6+ and MNT_DETACH flag, EINVAL means - // the dst is not a mount point. - if err := syscall.Unmount(dst, flags); err != nil && - !os.IsNotExist(err) && err != syscall.EINVAL { - return err - } - } - return nil -} - -// resolveSymlinks walks 'rel' having 'root' as the root directory. If there are -// symlinks, they are evaluated relative to 'root' to ensure the end result is -// the same as if the process was running inside the container. -func resolveSymlinks(root, rel string) (string, error) { - return resolveSymlinksImpl(root, root, rel, 255) -} - -func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { - if followCount == 0 { - return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) - } - - rel = filepath.Clean(rel) - for _, name := range strings.Split(rel, string(filepath.Separator)) { - if name == "" { - continue - } - // Note that Join() resolves things like ".." and returns a clean path. - path := filepath.Join(base, name) - if !strings.HasPrefix(path, root) { - // One cannot '..' their way out of root. - path = root - continue - } - fi, err := os.Lstat(path) - if err != nil { - if !os.IsNotExist(err) { - return "", err - } - // Not found means there is no symlink to check. Just keep walking dirs. - base = path - continue - } - if fi.Mode()&os.ModeSymlink != 0 { - link, err := os.Readlink(path) - if err != nil { - return "", err - } - if filepath.IsAbs(link) { - base = root - } - base, err = resolveSymlinksImpl(root, base, link, followCount-1) - if err != nil { - return "", err - } - continue - } - base = path - } - return base, nil -} - -func optionsToFlags(opts []string) uint32 { - var rv uint32 - for _, opt := range opts { - if m, ok := optionsMap[opt]; ok { - if m.set { - rv |= m.val - } else { - rv ^= m.val - } - } else { - log.Warningf("Ignoring mount option %q", opt) - } - } - return rv -} diff --git a/runsc/container/fs_test.go b/runsc/container/fs_test.go deleted file mode 100644 index 87cdb078e..000000000 --- a/runsc/container/fs_test.go +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
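resolveSymlinksImpl above enforces two properties: symlink targets are re-evaluated relative to the container root, and ".." components can never climb out of it. The confinement part can be shown in isolation with a small helper; this sketch deliberately ignores symlinks (the deleted code handles them by recursing with the link target), and joinConfined is an illustrative name, not part of runsc:

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// joinConfined joins rel to root and clamps the result so that ".."
// components cannot escape root.
func joinConfined(root, rel string) string {
	// Clean("/"+rel) strips any leading ".." components before joining.
	path := filepath.Join(root, filepath.Clean("/"+rel))
	if !strings.HasPrefix(path, filepath.Clean(root)) {
		return filepath.Clean(root)
	}
	return path
}

func main() {
	fmt.Println(joinConfined("/rootfs", "dir1/../../../etc/passwd"))
	// Prints "/rootfs/etc/passwd": the excess ".." components are dropped
	// instead of walking above the root.
}
```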
- -package container - -import ( - "fmt" - "io/ioutil" - "os" - "path" - "path/filepath" - "testing" - - "gvisor.googlesource.com/gvisor/runsc/test/testutil" -) - -type dir struct { - rel string - link string -} - -func construct(root string, dirs []dir) error { - for _, d := range dirs { - p := path.Join(root, d.rel) - if d.link == "" { - if err := os.MkdirAll(p, 0755); err != nil { - return fmt.Errorf("error creating dir: %v", err) - } - } else { - if err := os.MkdirAll(path.Dir(p), 0755); err != nil { - return fmt.Errorf("error creating dir: %v", err) - } - if err := os.Symlink(d.link, p); err != nil { - return fmt.Errorf("error creating symlink: %v", err) - } - } - } - return nil -} - -func TestResolveSymlinks(t *testing.T) { - root, err := ioutil.TempDir(testutil.TmpDir(), "root") - if err != nil { - t.Fatal("ioutil.TempDir() failed:", err) - } - dirs := []dir{ - {"dir1/dir11/dir111/dir1111", ""}, // Just a boring dir - {"dir1/lnk12", "dir11"}, // Link to sibling - {"dir1/lnk13", "./dir11"}, // Link to sibling through self - {"dir1/lnk14", "../dir1/dir11"}, // Link to sibling through parent - {"dir1/dir15/lnk151", ".."}, // Link to parent - {"dir1/lnk16", "dir11/dir111"}, // Link to child - {"dir1/lnk17", "."}, // Link to self - {"dir1/lnk18", "lnk13"}, // Link to link - {"lnk2", "dir1/lnk13"}, // Link to link to link - {"dir3/dir21/lnk211", "../.."}, // Link to root relative - {"dir3/lnk22", "/"}, // Link to root absolute - {"dir3/lnk23", "/dir1"}, // Link to dir absolute - {"dir3/lnk24", "/dir1/lnk12"}, // Link to link absolute - {"lnk5", "../../.."}, // Link outside root - } - if err := construct(root, dirs); err != nil { - t.Fatal("construct failed:", err) - } - - tests := []struct { - name string - rel string - want string - compareHost bool - }{ - {name: "root", rel: "/", want: "/", compareHost: true}, - {name: "basic dir", rel: "/dir1/dir11/dir111", want: "/dir1/dir11/dir111", compareHost: true}, - {name: "dot 1", rel: "/dir1/dir11/./dir111", want: "/dir1/dir11/dir111", compareHost: true}, - {name: "dot 2", rel: "/dir1/././dir11/./././././dir111/.", want: "/dir1/dir11/dir111", compareHost: true}, - {name: "dotdot 1", rel: "/dir1/dir11/../dir15", want: "/dir1/dir15", compareHost: true}, - {name: "dotdot 2", rel: "/dir1/dir11/dir1111/../..", want: "/dir1", compareHost: true}, - - {name: "link sibling", rel: "/dir1/lnk12", want: "/dir1/dir11", compareHost: true}, - {name: "link sibling + dir", rel: "/dir1/lnk12/dir111", want: "/dir1/dir11/dir111", compareHost: true}, - {name: "link sibling through self", rel: "/dir1/lnk13", want: "/dir1/dir11", compareHost: true}, - {name: "link sibling through parent", rel: "/dir1/lnk14", want: "/dir1/dir11", compareHost: true}, - - {name: "link parent", rel: "/dir1/dir15/lnk151", want: "/dir1", compareHost: true}, - {name: "link parent + dir", rel: "/dir1/dir15/lnk151/dir11", want: "/dir1/dir11", compareHost: true}, - {name: "link child", rel: "/dir1/lnk16", want: "/dir1/dir11/dir111", compareHost: true}, - {name: "link child + dir", rel: "/dir1/lnk16/dir1111", want: "/dir1/dir11/dir111/dir1111", compareHost: true}, - {name: "link self", rel: "/dir1/lnk17", want: "/dir1", compareHost: true}, - {name: "link self + dir", rel: "/dir1/lnk17/dir11", want: "/dir1/dir11", compareHost: true}, - - {name: "link^2", rel: "/dir1/lnk18", want: "/dir1/dir11", compareHost: true}, - {name: "link^2 + dir", rel: "/dir1/lnk18/dir111", want: "/dir1/dir11/dir111", compareHost: true}, - {name: "link^3", rel: "/lnk2", want: "/dir1/dir11", compareHost: true}, - {name: 
"link^3 + dir", rel: "/lnk2/dir111", want: "/dir1/dir11/dir111", compareHost: true}, - - {name: "link abs", rel: "/dir3/lnk23", want: "/dir1"}, - {name: "link abs + dir", rel: "/dir3/lnk23/dir11", want: "/dir1/dir11"}, - {name: "link^2 abs", rel: "/dir3/lnk24", want: "/dir1/dir11"}, - {name: "link^2 abs + dir", rel: "/dir3/lnk24/dir111", want: "/dir1/dir11/dir111"}, - - {name: "root link rel", rel: "/dir3/dir21/lnk211", want: "/", compareHost: true}, - {name: "root link abs", rel: "/dir3/lnk22", want: "/"}, - {name: "root contain link", rel: "/lnk5/dir1", want: "/dir1"}, - {name: "root contain dotdot", rel: "/dir1/dir11/../../../../../../../..", want: "/"}, - - {name: "crazy", rel: "/dir3/dir21/lnk211/dir3/lnk22/dir1/dir11/../../lnk5/dir3/../dir3/lnk24/dir111/dir1111/..", want: "/dir1/dir11/dir111"}, - } - for _, tst := range tests { - t.Run(tst.name, func(t *testing.T) { - got, err := resolveSymlinks(root, tst.rel) - if err != nil { - t.Errorf("resolveSymlinks(root, %q) failed: %v", tst.rel, err) - } - want := path.Join(root, tst.want) - if got != want { - t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, got, want) - } - if tst.compareHost { - // Check that host got to the same end result. - host, err := filepath.EvalSymlinks(path.Join(root, tst.rel)) - if err != nil { - t.Errorf("path.EvalSymlinks(root, %q) failed: %v", tst.rel, err) - } - if host != got { - t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, host, got) - } - } - }) - } -} - -func TestResolveSymlinksLoop(t *testing.T) { - root, err := ioutil.TempDir(testutil.TmpDir(), "root") - if err != nil { - t.Fatal("ioutil.TempDir() failed:", err) - } - dirs := []dir{ - {"loop1", "loop2"}, - {"loop2", "loop1"}, - } - if err := construct(root, dirs); err != nil { - t.Fatal("construct failed:", err) - } - if _, err := resolveSymlinks(root, "loop1"); err == nil { - t.Errorf("resolveSymlinks() should have failed") - } -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2698e3f86..ae6375e13 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -75,7 +75,7 @@ type Sandbox struct { // New creates the sandbox process. The caller must call Destroy() on the // sandbox. -func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, cg *cgroup.Cgroup) (*Sandbox, error) { +func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, specFile *os.File, cg *cgroup.Cgroup) (*Sandbox, error) { s := &Sandbox{ID: id, Cgroup: cg} // The Cleanup object cleans up partially created sandboxes when an error // occurs. Any errors occurring during cleanup itself are ignored. @@ -86,17 +86,14 @@ func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke defer c.Clean() // Create pipe to synchronize when sandbox process has been booted. - fds := make([]int, 2) - if err := syscall.Pipe(fds); err != nil { + clientSyncFile, sandboxSyncFile, err := os.Pipe() + if err != nil { return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) } - clientSyncFile := os.NewFile(uintptr(fds[0]), "client sandbox sync") defer clientSyncFile.Close() - sandboxSyncFile := os.NewFile(uintptr(fds[1]), "sandbox sync") - // Create the sandbox process. 
- err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, sandboxSyncFile) + err = s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, specFile, sandboxSyncFile) // sandboxSyncFile has to be closed to be able to detect when the sandbox // process exits unexpectedly. sandboxSyncFile.Close() @@ -294,7 +291,7 @@ func (s *Sandbox) connError(err error) error { // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, startSyncFile *os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, mountsFile, startSyncFile *os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -345,10 +342,14 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD)) nextFD++ - // Open the spec file to donate to the sandbox. - specFile, err := specutils.OpenCleanSpec(bundleDir) + defer mountsFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, mountsFile) + cmd.Args = append(cmd.Args, "--mounts-fd="+strconv.Itoa(nextFD)) + nextFD++ + + specFile, err := specutils.OpenSpec(bundleDir) if err != nil { - return fmt.Errorf("opening spec file: %v", err) + return err } defer specFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, specFile) diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 372799850..15476de6f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "specutils", srcs = [ + "fs.go", "namespace.go", "specutils.go", ], diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go new file mode 100644 index 000000000..b812a5fbd --- /dev/null +++ b/runsc/specutils/fs.go @@ -0,0 +1,139 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "path" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +type mapping struct { + set bool + val uint32 +} + +// optionsMap maps mount propagation-related OCI filesystem options to mount(2) +// syscall flags. 
+var optionsMap = map[string]mapping{ + "acl": {set: true, val: syscall.MS_POSIXACL}, + "async": {set: false, val: syscall.MS_SYNCHRONOUS}, + "atime": {set: false, val: syscall.MS_NOATIME}, + "bind": {set: true, val: syscall.MS_BIND}, + "defaults": {set: true, val: 0}, + "dev": {set: false, val: syscall.MS_NODEV}, + "diratime": {set: false, val: syscall.MS_NODIRATIME}, + "dirsync": {set: true, val: syscall.MS_DIRSYNC}, + "exec": {set: false, val: syscall.MS_NOEXEC}, + "iversion": {set: true, val: syscall.MS_I_VERSION}, + "loud": {set: false, val: syscall.MS_SILENT}, + "mand": {set: true, val: syscall.MS_MANDLOCK}, + "noacl": {set: false, val: syscall.MS_POSIXACL}, + "noatime": {set: true, val: syscall.MS_NOATIME}, + "nodev": {set: true, val: syscall.MS_NODEV}, + "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, + "noiversion": {set: false, val: syscall.MS_I_VERSION}, + "nomand": {set: false, val: syscall.MS_MANDLOCK}, + "norelatime": {set: false, val: syscall.MS_RELATIME}, + "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, + "nosuid": {set: true, val: syscall.MS_NOSUID}, + "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, + "relatime": {set: true, val: syscall.MS_RELATIME}, + "remount": {set: true, val: syscall.MS_REMOUNT}, + "ro": {set: true, val: syscall.MS_RDONLY}, + "rw": {set: false, val: syscall.MS_RDONLY}, + "silent": {set: true, val: syscall.MS_SILENT}, + "strictatime": {set: true, val: syscall.MS_STRICTATIME}, + "suid": {set: false, val: syscall.MS_NOSUID}, + "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, +} + +// propOptionsMap is similar to optionsMap, but it lists propagation options +// that cannot be used together with other flags. +var propOptionsMap = map[string]mapping{ + "private": {set: true, val: syscall.MS_PRIVATE}, + "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, + "slave": {set: true, val: syscall.MS_SLAVE}, + "rslave": {set: true, val: syscall.MS_SLAVE | syscall.MS_REC}, + "unbindable": {set: true, val: syscall.MS_UNBINDABLE}, + "runbindable": {set: true, val: syscall.MS_UNBINDABLE | syscall.MS_REC}, +} + +// invalidOptions list options not allowed. +// - shared: sandbox must be isolated from the host. Propagating mount changes +// from the sandbox to the host breaks the isolation. +// - noexec: not yet supported. Don't ignore it since it could break +// in-sandbox security. +var invalidOptions = []string{"shared", "rshared", "noexec"} + +// OptionsToFlags converts mount options to syscall flags. +func OptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, optionsMap) +} + +// PropOptionsToFlags converts propagation mount options to syscall flags. +// Propagation options cannot be set other with other options and must be +// handled separatedly. +func PropOptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, propOptionsMap) +} + +func optionsToFlags(opts []string, source map[string]mapping) uint32 { + var rv uint32 + for _, opt := range opts { + if m, ok := source[opt]; ok { + if m.set { + rv |= m.val + } else { + rv ^= m.val + } + } + } + return rv +} + +// ValidateMount validates that spec mounts are correct. 
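Because propagation flags such as MS_PRIVATE and MS_SLAVE cannot be combined with the other flags in a single mount(2) call, fs.go keeps them in a separate propOptionsMap and exposes OptionsToFlags and PropOptionsToFlags as two helpers that callers apply in two steps. A hedged usage sketch follows; the source and destination paths are placeholders that must already exist, and both mount calls require root:

```go
package main

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/runsc/specutils"
)

func main() {
	opts := []string{"rbind", "ro", "noatime", "rprivate"}

	// Regular flags go on the bind mount itself...
	flags := specutils.OptionsToFlags(opts) // MS_BIND|MS_REC|MS_RDONLY|MS_NOATIME
	if err := syscall.Mount("/tmp/src", "/tmp/dst", "bind", uintptr(flags), ""); err != nil {
		panic(err)
	}

	// ...while propagation flags are applied in a second, separate call.
	prop := specutils.PropOptionsToFlags(opts) // MS_PRIVATE|MS_REC
	if err := syscall.Mount("", "/tmp/dst", "", uintptr(prop), ""); err != nil {
		panic(err)
	}
}
```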
+func validateMount(mnt *specs.Mount) error { + if !path.IsAbs(mnt.Destination) { + return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt) + } + + if mnt.Type == "bind" { + for _, o := range mnt.Options { + if ContainsStr(invalidOptions, o) { + return fmt.Errorf("mount option %q is not supported: %v", o, mnt) + } + _, ok1 := optionsMap[o] + _, ok2 := propOptionsMap[o] + if !ok1 && !ok2 { + log.Warningf("Ignoring unknown mount option %q", o) + } + } + } + return nil +} + +// ValidateRootfsPropagation validates that rootfs propagation options are +// correct. +func validateRootfsPropagation(opt string) error { + flags := PropOptionsToFlags([]string{opt}) + if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 { + return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt) + } + return nil +} diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 73fab13e1..35da789f4 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -105,9 +105,9 @@ func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNam return out } -// SetNS sets the namespace of the given type. It must be called with +// setNS sets the namespace of the given type. It must be called with // OSThreadLocked. -func SetNS(fd, nsType uintptr) error { +func setNS(fd, nsType uintptr) error { if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { return err } @@ -119,30 +119,30 @@ func SetNS(fd, nsType uintptr) error { // // Preconditions: Must be called with os thread locked. func ApplyNS(ns specs.LinuxNamespace) (func(), error) { - log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) + log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path) newNS, err := os.Open(ns.Path) if err != nil { return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) } defer newNS.Close() - // Store current netns to restore back after child is started. + // Store current namespace to restore back. curPath := nsPath(ns.Type) oldNS, err := os.Open(curPath) if err != nil { return nil, fmt.Errorf("error opening %q: %v", curPath, err) } - // Set netns to the one requested and setup function to restore it back. + // Set namespace to the one requested and setup function to restore it back. 
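ApplyNS above follows a strict save/enter/restore sequence and must run on a locked OS thread, since setns(2) only changes the namespace of the calling thread. The same pattern in standalone form is sketched below using golang.org/x/sys/unix; the target namespace path is illustrative, and entering another network namespace requires CAP_SYS_ADMIN:

```go
package main

import (
	"fmt"
	"os"
	"runtime"

	"golang.org/x/sys/unix"
)

func main() {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Save the current network namespace so it can be restored.
	oldNS, err := os.Open("/proc/self/ns/net")
	if err != nil {
		panic(err)
	}
	defer oldNS.Close()

	// Enter the target namespace (path is illustrative).
	newNS, err := os.Open("/var/run/netns/sandbox")
	if err != nil {
		panic(err)
	}
	defer newNS.Close()
	if err := unix.Setns(int(newNS.Fd()), unix.CLONE_NEWNET); err != nil {
		panic(err)
	}

	fmt.Println("now inside the target netns")

	// Restore the original namespace before the thread is reused.
	if err := unix.Setns(int(oldNS.Fd()), unix.CLONE_NEWNET); err != nil {
		panic(err)
	}
}
```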
flag := nsCloneFlag(ns.Type) - if err := SetNS(newNS.Fd(), flag); err != nil { + if err := setNS(newNS.Fd(), flag); err != nil { oldNS.Close() return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) } return func() { - log.Infof("restoring namespace %v", ns.Type) + log.Infof("Restoring namespace %v", ns.Type) defer oldNS.Close() - if err := SetNS(oldNS.Fd(), flag); err != nil { + if err := setNS(oldNS.Fd(), flag); err != nil { panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) } }, nil diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 4e7893ab4..cbf099c64 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -92,9 +92,14 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("Seccomp spec is being ignored") } - for i, m := range spec.Mounts { - if !path.IsAbs(m.Destination) { - return fmt.Errorf("Spec.Mounts[%d] Mount.Destination must be an absolute path: %v", i, m) + if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { + if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { + return err + } + } + for _, m := range spec.Mounts { + if err := validateMount(&m); err != nil { + return err } } @@ -129,15 +134,19 @@ func absPath(base, rel string) string { return filepath.Join(base, rel) } +// OpenSpec opens an OCI runtime spec from the given bundle directory. +func OpenSpec(bundleDir string) (*os.File, error) { + // The spec file must be named "config.json" inside the bundle directory. + return os.Open(filepath.Join(bundleDir, "config.json")) +} + // ReadSpec reads an OCI runtime spec from the given bundle directory. // ReadSpec also normalizes all potential relative paths into absolute // path, e.g. spec.Root.Path, mount.Source. func ReadSpec(bundleDir string) (*specs.Spec, error) { - // The spec file must be in "config.json" inside the bundle directory. - specPath := filepath.Join(bundleDir, "config.json") - specFile, err := os.Open(specPath) + specFile, err := OpenSpec(bundleDir) if err != nil { - return nil, fmt.Errorf("error opening spec file %q: %v", specPath, err) + return nil, fmt.Errorf("error opening spec file %q: %v", specFile.Name(), err) } defer specFile.Close() return ReadSpecFromFile(bundleDir, specFile) @@ -171,27 +180,17 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) return &spec, nil } -// OpenCleanSpec opens spec file that has destination mount paths resolved to -// their absolute location. -func OpenCleanSpec(bundleDir string) (*os.File, error) { - f, err := os.Open(filepath.Join(bundleDir, "config.clean.json")) +// ReadMounts reads mount list from a file. +func ReadMounts(f *os.File) ([]specs.Mount, error) { + bytes, err := ioutil.ReadAll(f) if err != nil { - return nil, err + return nil, fmt.Errorf("error reading mounts: %v", err) } - if _, err := f.Seek(0, os.SEEK_SET); err != nil { - f.Close() - return nil, fmt.Errorf("error seeking to beginning of file %q: %v", f.Name(), err) - } - return f, nil -} - -// WriteCleanSpec writes a spec file that has destination mount paths resolved. 
-func WriteCleanSpec(bundleDir string, spec *specs.Spec) error { - bytes, err := json.Marshal(spec) - if err != nil { - return err + var mounts []specs.Mount + if err := json.Unmarshal(bytes, &mounts); err != nil { + return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(bytes)) } - return ioutil.WriteFile(filepath.Join(bundleDir, "config.clean.json"), bytes, 0755) + return mounts, nil } // Capabilities takes in spec and returns a TaskCapabilities corresponding to @@ -407,8 +406,7 @@ func Mount(src, dst, typ string, flags uint32) error { // source (file or directory). var isDir bool if typ == "proc" { - // Special case, as there is no source directory for proc - // mounts. + // Special case, as there is no source directory for proc mounts. isDir = true } else if fi, err := os.Stat(src); err != nil { return fmt.Errorf("Stat(%q) failed: %v", src, err) diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index b61f1ca62..02af6e6ad 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -219,6 +219,37 @@ func TestSpecInvalid(t *testing.T) { }, error: "must be an absolute path", }, + { + name: "invalid mount option", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Mounts: []specs.Mount{ + { + Source: "/src", + Destination: "/dst", + Type: "bind", + Options: []string{"shared"}, + }, + }, + }, + error: "is not supported", + }, + { + name: "invalid rootfs propagation", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Linux: &specs.Linux{ + RootfsPropagation: "foo", + }, + }, + error: "root mount propagation option must specify private or slave", + }, } { err := ValidateSpec(&test.spec) if len(test.error) == 0 { -- cgit v1.2.3 From 87cce0ec08b9d629a5e3a88be411b1721d767301 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 19 Mar 2019 17:32:23 -0700 Subject: netstack: reduce MSS from SYN to account tcp options See: https://tools.ietf.org/html/rfc6691#section-2 PiperOrigin-RevId: 239305632 Change-Id: Ie8eb912a43332e6490045dc95570709c5b81855e --- pkg/sentry/fs/proc/README.md | 2 -- pkg/tcpip/transport/tcp/endpoint.go | 10 ++++++++++ pkg/tcpip/transport/tcp/snd.go | 13 +++++++------ runsc/test/README.md | 2 -- runsc/test/root/crictl_test.go | 3 +++ test/syscalls/linux/exec.cc | 1 + test/syscalls/linux/preadv.cc | 1 + test/syscalls/linux/proc.cc | 2 ++ test/syscalls/linux/sigaltstack.cc | 1 + test/syscalls/linux/time.cc | 1 + test/util/temp_path.cc | 1 + test/util/test_util.cc | 2 ++ test/util/test_util.h | 1 + 13 files changed, 30 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 686d40f0c..3cc5f197c 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -11,7 +11,6 @@ inconsistency, please file a bug. 
The following files are implemented: - | File /proc/ | Content | | :------------------------ | :---------------------------------------------------- | | [cpuinfo](#cpuinfo) | Info about the CPU | @@ -23,7 +22,6 @@ The following files are implemented: | [uptime](#uptime) | Wall clock since boot, combined idle time of all cpus | | [version](#version) | Kernel version | - ### cpuinfo ```bash diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 7d18e3612..5656890f6 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1596,6 +1596,16 @@ func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { } } +// maxOptionSize return the maximum size of TCP options. +func (e *endpoint) maxOptionSize() (size int) { + var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock + options := e.makeOptions(maxSackBlocks[:]) + size = len(options) + putOptions(options) + + return size +} + // completeState makes a full copy of the endpoint and returns it. This is used // before invoking the probe. The state returned may not be fully consistent if // there are intervening syscalls when the state is being copied. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index e38932df7..18365a673 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -172,6 +172,11 @@ type fastRecovery struct { } func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { + // The sender MUST reduce the TCP data length to account for any IP or + // TCP options that it is including in the packets that it sends. + // See: https://tools.ietf.org/html/rfc6691#section-2 + maxPayloadSize := int(mss) - ep.maxOptionSize() + s := &sender{ ep: ep, sndCwnd: InitialCwnd, @@ -183,7 +188,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint rto: 1 * time.Second, rttMeasureSeqNum: iss + 1, lastSendTime: time.Now(), - maxPayloadSize: int(mss), + maxPayloadSize: maxPayloadSize, maxSentAck: irs + 1, fr: fastRecovery{ // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. @@ -226,11 +231,7 @@ func (s *sender) initCongestionControl(congestionControlName CongestionControlOp func (s *sender) updateMaxPayloadSize(mtu, count int) { m := mtu - header.TCPMinimumSize - // Calculate the maximum option size. - var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock - options := s.ep.makeOptions(maxSackBlocks[:]) - m -= len(options) - putOptions(options) + m -= s.ep.maxOptionSize() // We don't adjust up for now. if m >= s.maxPayloadSize { diff --git a/runsc/test/README.md b/runsc/test/README.md index 5929cbeb6..f22a8e017 100644 --- a/runsc/test/README.md +++ b/runsc/test/README.md @@ -12,13 +12,11 @@ they may need extra setup in the test machine and extra configuration to run. The following setup steps are required in order to run these tests: - `./runsc/test/install.sh [--runtime ]` The tests expect the runtime name to be provided in the `RUNSC_RUNTIME` environment variable (default: `runsc-test`). 
To run the tests execute: - ``` bazel test --test_env=RUNSC_RUNTIME=runsc-test \ //runsc/test/image:image_test \ diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go index 556d95fff..37fe53ba3 100644 --- a/runsc/test/root/crictl_test.go +++ b/runsc/test/root/crictl_test.go @@ -36,6 +36,7 @@ import ( // Tests for crictl have to be run as root (rather than in a user namespace) // because crictl creates named network namespaces in /var/run/netns/. + func TestCrictlSanity(t *testing.T) { // Setup containerd and crictl. crictl, cleanup, err := setup(t) @@ -58,6 +59,7 @@ func TestCrictlSanity(t *testing.T) { t.Fatal(err) } } + func TestMountPaths(t *testing.T) { // Setup containerd and crictl. crictl, cleanup, err := setup(t) @@ -80,6 +82,7 @@ func TestMountPaths(t *testing.T) { t.Fatal(err) } } + func TestMountOverSymlinks(t *testing.T) { // Setup containerd and crictl. crictl, cleanup, err := setup(t) diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 2d2287c2a..d5a938a98 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -58,6 +58,7 @@ std::string WorkloadPath(absl::string_view binary) { if (test_src) { full_path = JoinPath(test_src, "__main__/test/syscalls/linux", binary); } + TEST_CHECK(full_path.empty() == false); return full_path; } diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc index 8d3aed43c..4a31123d8 100644 --- a/test/syscalls/linux/preadv.cc +++ b/test/syscalls/linux/preadv.cc @@ -37,6 +37,7 @@ namespace gvisor { namespace testing { namespace { + TEST(PreadvTest, MMConcurrencyStress) { // Fill a one-page file with zeroes (the contents don't really matter). const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 6ffe9aed6..0da682e7b 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1258,6 +1258,7 @@ TEST(ProcPidSymlink, SubprocessRunning) { EXPECT_THAT(ReadlinkWhileRunning("ns/user", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); } + // FIXME: Inconsistent behavior between gVisor and linux // on proc files. TEST(ProcPidSymlink, SubprocessZombied) { @@ -1362,6 +1363,7 @@ TEST(ProcPidFile, SubprocessRunning) { // Test whether /proc/PID/ files can be read for a zombie process. 
TEST(ProcPidFile, SubprocessZombie) { char buf[1]; + // 4.17: Succeeds and returns 1 // gVisor: Succeds and returns 0 EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds()); diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc index b1845ac85..5741720f4 100644 --- a/test/syscalls/linux/sigaltstack.cc +++ b/test/syscalls/linux/sigaltstack.cc @@ -101,6 +101,7 @@ TEST(SigaltstackTest, ResetByExecve) { if (test_src) { full_path = JoinPath(test_src, "../../linux/sigaltstack_check"); } + ASSERT_FALSE(full_path.empty()); pid_t child_pid = -1; diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc index 3abcd8098..5a3dfd026 100644 --- a/test/syscalls/linux/time.cc +++ b/test/syscalls/linux/time.cc @@ -61,6 +61,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) { EXPECT_EXIT(vsyscall_time(reinterpret_cast(0x1)), ::testing::KilledBySignal(SIGSEGV), ""); } + int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) { constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000; return reinterpret_cast( diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index e45909655..11c14fb1a 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -75,6 +75,7 @@ std::string NewTempRelPath() { return NextTempBasename(); } std::string GetAbsoluteTestTmpdir() { char* env_tmpdir = getenv("TEST_TMPDIR"); std::string tmp_dir = env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp"; + return MakeAbsolute(tmp_dir, "").ValueOrDie(); } diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 7b40260d1..ebcbca238 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -26,6 +26,7 @@ #include #include + #include "absl/base/attributes.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" @@ -234,6 +235,7 @@ bool Equivalent(uint64_t current, uint64_t target, double tolerance) { auto abs_diff = target > current ? target - current : current - target; return abs_diff <= static_cast(tolerance * target); } + void TestInit(int* argc, char*** argv) { ::testing::InitGoogleTest(argc, *argv); ::gflags::ParseCommandLineFlags(argc, argv, true); diff --git a/test/util/test_util.h b/test/util/test_util.h index cd71fdd64..37e40de8e 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -184,6 +184,7 @@ #include // NOLINT: using std::thread::hardware_concurrency(). 
#include #include + #include #include #include "gmock/gmock.h" -- cgit v1.2.3 From c7877b0a14778af9165eb2b841513b6f7dfdcbee Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 20 Mar 2019 10:35:13 -0700 Subject: Fail in case mount option is unknown PiperOrigin-RevId: 239425816 Change-Id: I3b1479c61b4222c3931a416c4efc909157044330 --- runsc/specutils/fs.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go index b812a5fbd..aa17d4eb9 100644 --- a/runsc/specutils/fs.go +++ b/runsc/specutils/fs.go @@ -20,7 +20,6 @@ import ( "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/log" ) type mapping struct { @@ -121,7 +120,7 @@ func validateMount(mnt *specs.Mount) error { _, ok1 := optionsMap[o] _, ok2 := propOptionsMap[o] if !ok1 && !ok2 { - log.Warningf("Ignoring unknown mount option %q", o) + return fmt.Errorf("unknown mount option %q", o) } } } -- cgit v1.2.3 From beb71ab681dadb2eed3407bc9188bfe85694eb22 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 26 Mar 2019 23:43:30 -0700 Subject: Merge fsgofer 'controlFile' and 'openedFile' This reduces the number of FDs used for writable files. #149 PiperOrigin-RevId: 240502097 Change-Id: Ib44489f65bce23dd1a995f620d69e65dce003f7c --- runsc/fsgofer/fsgofer.go | 164 ++++++++++++++++++++++------------------------- 1 file changed, 78 insertions(+), 86 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 9955d0750..1e422f30b 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -175,23 +175,21 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { } // localFile implements p9.File wrapping a local file. The underlying file -// is opened during Walk() and stored in 'controlFile' to be used with other -// operations. The control file is opened as readonly, unless it's a symlink -// which requires O_PATH. 'controlFile' is dup'ed when Walk(nil) is called -// to clone the file. This reduces the number of walks that need to be done by -// the host file system when files are reused. +// is opened during Walk() and stored in 'file' to be used with other +// operations. The file is opened as readonly, unless it's a symlink which +// requires O_PATH. 'file' is dup'ed when Walk(nil) is called to clone the file. +// This reduces the number of walks that need to be done by the host file +// system when files are reused. // -// 'openedFile' is assigned when Open() is called. If requested open mode is -// a subset of controlFile's mode, it's possible to use the same file. If mode -// is not a subset, then another file is opened. Consequently, 'openedFile' -// could have a mode wider than requested and must be verified before read/write -// operations. Before the file is opened and after it's closed, 'mode' is set to -// an invalid value to prevent an unopened file from being used. +// The file may be reopened if the requested mode in Open() is not a subset of +// current mode. Consequently, 'file' could have a mode wider than requested and +// must be verified before read/write operations. Before the file is opened and +// after it's closed, 'mode' is set to an invalid value to prevent an unopened +// file from being used. // -// The reason that the control file is never opened as read-write is for better +// The reason that the file is not opened initially as read-write is for better // performance with 'overlay2' storage driver. 
overlay2 eagerly copies the // entire file up when it's opened in write mode, and would perform badly when -// multiple files are being opened for read-only (esp. startup). type localFile struct { p9.DefaultWalkGetAttr @@ -201,12 +199,9 @@ type localFile struct { // hostPath will be safely updated by the Renamed hook. hostPath string - // controlFile is opened when localFile is created and it's never nil. - controlFile *os.File - - // openedFile is nil until localFile is opened. It may point to controlFile - // or be a new file struct. See struct comment for more details. - openedFile *os.File + // file is opened when localFile is created and it's never nil. It may be + // reopened... + file *os.File // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. @@ -228,7 +223,7 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { var err error var fd int for i, mode := range modes { - fd, err = syscall.Openat(parent.controlFD(), name, openFlags|mode, 0) + fd, err = syscall.Openat(parent.fd(), name, openFlags|mode, 0) if err == nil { // openat succeeded, we're done. break @@ -240,11 +235,11 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { } // openat failed. Try again with next mode, preserving 'err' in case this // was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|mode, parent.controlFile.Name(), name, err) + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|mode, parent.file.Name(), name, err) } if err != nil { // All attempts to open file have failed, return the last error. - log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.controlFile.Name(), name, err) + log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.file.Name(), name, err) return nil, "", extractErrno(err) } @@ -267,7 +262,7 @@ func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_ return &localFile{ attachPoint: a, hostPath: path, - controlFile: file, + file: file, mode: invalidMode, ft: ft, }, nil @@ -302,33 +297,26 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error { return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } -func (l *localFile) controlFD() int { - return int(l.controlFile.Fd()) -} - -func (l *localFile) openedFD() int { - if l.openedFile == nil { - panic(fmt.Sprintf("trying to use an unopened file: %q", l.controlFile.Name())) - } - return int(l.openedFile.Fd()) +func (l *localFile) fd() int { + return int(l.file.Fd()) } // Open implements p9.File. func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - if l.openedFile != nil { - panic(fmt.Sprintf("attempting to open already opened file: %q", l.controlFile.Name())) + if l.isOpen() { + panic(fmt.Sprintf("attempting to open already opened file: %q", l.file.Name())) } // Check if control file can be used or if a new open must be created. var newFile *os.File if mode == p9.ReadOnly { - log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name()) - newFile = l.controlFile + log.Debugf("Open reusing control file, mode: %v, %q", mode, l.file.Name()) + newFile = l.file } else { // Ideally reopen would call name_to_handle_at (with empty name) and // open_by_handle_at to reopen the file without using 'hostPath'. However, // name_to_handle_at and open_by_handle_at aren't supported by overlay2. 
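The Open path above reuses the already-open read-only FD when the requested mode is a subset of it, and otherwise reopens the file by host path, since overlay2 does not support reopening via open_by_handle_at. A simplified sketch of that decision, with illustrative names that are not the fsgofer API:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"os"
)

// openForMode sketches the reuse-or-reopen decision: 'cached' was opened
// read-only when the file was first walked, 'hostPath' is its path, and a
// write request forces a reopen with a wider mode. Under overlay2 the
// reopen is also the point where the whole file is copied up.
func openForMode(cached *os.File, hostPath string, wantWrite bool) (*os.File, error) {
	if !wantWrite {
		return cached, nil // read-only request: reuse the existing FD.
	}
	return os.OpenFile(hostPath, os.O_RDWR, 0)
}

func main() {
	f, err := ioutil.TempFile("", "gofer")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())

	ro, err := os.Open(f.Name())
	if err != nil {
		panic(err)
	}
	rw, err := openForMode(ro, f.Name(), true)
	if err != nil {
		panic(err)
	}
	fmt.Println(rw.Fd() != ro.Fd()) // true: a new FD was opened for writing
}
```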
- log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) + log.Debugf("Open reopening file, mode: %v, %q", mode, l.file.Name()) var err error newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) @@ -339,7 +327,9 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { stat, err := stat(int(newFile.Fd())) if err != nil { - newFile.Close() + if newFile != l.file { + newFile.Close() + } return nil, p9.QID{}, 0, extractErrno(err) } @@ -349,8 +339,13 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { fd = newFDMaybe(newFile) } - // Set fields on success - l.openedFile = newFile + // Close old file in case a new one was created. + if newFile != l.file { + if err := l.file.Close(); err != nil { + log.Warningf("Error closing file %q: %v", l.file.Name(), err) + } + l.file = newFile + } l.mode = mode return fd, l.attachPoint.makeQID(stat), 0, nil } @@ -365,10 +360,9 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, syscall.EBADF } - // Use a single file for both 'controlFile' and 'openedFile'. Mode must - // include read for control and whichever else was requested by caller. Note - // that resulting file might have a wider mode than needed for each particular - // case. + // 'file' may be used for other operations (e.g. Walk), so read access is + // always added to flags. Note that resulting file might have a wider mode + // than needed for each particular case. flags := openFlags | syscall.O_CREAT | syscall.O_EXCL if mode == p9.WriteOnly { flags |= syscall.O_RDWR @@ -376,14 +370,14 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid flags |= mode.OSFlags() } - fd, err := syscall.Openat(l.controlFD(), name, flags, uint32(perm.Permissions())) + fd, err := syscall.Openat(l.fd(), name, flags, uint32(perm.Permissions())) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } cu := specutils.MakeCleanup(func() { syscall.Close(fd) // Best effort attempt to remove the file in case of failure. - if err := syscall.Unlinkat(l.controlFD(), name); err != nil { + if err := syscall.Unlinkat(l.fd(), name); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -402,13 +396,12 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid c := &localFile{ attachPoint: l.attachPoint, hostPath: cPath, - controlFile: f, - openedFile: f, + file: f, mode: mode, } cu.Release() - return newFDMaybe(c.openedFile), c, l.attachPoint.makeQID(stat), 0, nil + return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil } // Mkdir implements p9.File. @@ -421,12 +414,12 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) return p9.QID{}, syscall.EBADF } - if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { + if err := syscall.Mkdirat(l.fd(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the dir in case of failure. 
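Both Create and Mkdir above register a best-effort undo (close and unlink) with specutils.MakeCleanup and disarm it with cu.Release() once every later step has succeeded. The same arm/disarm idiom can be written with a plain closure, independent of the gofer code; the directory path here is only an example:

```go
package main

import (
	"fmt"
	"os"
)

// createDirWithOwner shows the arm/disarm cleanup idiom: if any step after
// the mkdir fails, the partially created directory is removed again.
func createDirWithOwner(path string, uid, gid int) error {
	if err := os.Mkdir(path, 0755); err != nil {
		return err
	}
	armed := true
	defer func() {
		if armed {
			os.Remove(path) // best-effort undo on failure
		}
	}()

	if err := os.Chown(path, uid, gid); err != nil {
		return fmt.Errorf("chown %q: %v", path, err)
	}
	armed = false // success: keep the directory
	return nil
}

func main() {
	fmt.Println(createDirWithOwner("/tmp/example-dir", os.Getuid(), os.Getgid()))
}
```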
- if err := unix.Unlinkat(l.controlFD(), name, unix.AT_REMOVEDIR); err != nil { + if err := unix.Unlinkat(l.fd(), name, unix.AT_REMOVEDIR); err != nil { log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -434,7 +427,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) // Open directory to change ownership and stat it. flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags - fd, err := syscall.Openat(l.controlFD(), name, flags, 0) + fd, err := syscall.Openat(l.fd(), name, flags, 0) if err != nil { return p9.QID{}, extractErrno(err) } @@ -456,7 +449,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - newFd, err := syscall.Dup(l.controlFD()) + newFd, err := syscall.Dup(l.fd()) if err != nil { return nil, nil, extractErrno(err) } @@ -469,7 +462,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { c := &localFile{ attachPoint: l.attachPoint, hostPath: l.hostPath, - controlFile: os.NewFile(uintptr(newFd), l.hostPath), + file: os.NewFile(uintptr(newFd), l.hostPath), mode: invalidMode, } return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil @@ -484,10 +477,12 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { } stat, err := stat(int(f.Fd())) if err != nil { + f.Close() return nil, nil, extractErrno(err) } c, err := newLocalFile(last.attachPoint, f, path, stat) if err != nil { + f.Close() return nil, nil, extractErrno(err) } @@ -500,7 +495,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // StatFS implements p9.File. func (l *localFile) StatFS() (p9.FSStat, error) { var s syscall.Statfs_t - if err := syscall.Fstatfs(l.controlFD(), &s); err != nil { + if err := syscall.Fstatfs(l.fd(), &s); err != nil { return p9.FSStat{}, extractErrno(err) } @@ -519,10 +514,10 @@ func (l *localFile) StatFS() (p9.FSStat, error) { // FSync implements p9.File. func (l *localFile) FSync() error { - if l.openedFile == nil { + if !l.isOpen() { return syscall.EBADF } - if err := l.openedFile.Sync(); err != nil { + if err := l.file.Sync(); err != nil { return extractErrno(err) } return nil @@ -530,7 +525,7 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.controlFD()) + stat, err := stat(l.fd()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } @@ -598,11 +593,11 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // Handle all the sanity checks up front so that the client gets a // consistent result that is not attribute dependent. if !valid.IsSubsetOf(allowed) { - log.Warningf("SetAttr() failed for %q, mask: %v", l.controlFile.Name(), valid) + log.Warningf("SetAttr() failed for %q, mask: %v", l.file.Name(), valid) return syscall.EPERM } - fd := l.controlFD() + fd := l.fd() if l.ft == regular { // Regular files are opened in RO mode, thus it needs to be reopened here // for write. 
@@ -719,7 +714,7 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) } newParent := directory.(*localFile) - if err := renameat(l.controlFD(), oldName, newParent.controlFD(), newName); err != nil { + if err := renameat(l.fd(), oldName, newParent.fd(), newName); err != nil { return extractErrno(err) } return nil @@ -730,11 +725,11 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { return 0, syscall.EBADF } - if l.openedFile == nil { + if !l.isOpen() { return 0, syscall.EBADF } - r, err := l.openedFile.ReadAt(p, int64(offset)) + r, err := l.file.ReadAt(p, int64(offset)) switch err { case nil, io.EOF: return r, nil @@ -748,11 +743,11 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { return 0, syscall.EBADF } - if l.openedFile == nil { + if !l.isOpen() { return 0, syscall.EBADF } - w, err := l.openedFile.WriteAt(p, int64(offset)) + w, err := l.file.WriteAt(p, int64(offset)) if err != nil { return w, extractErrno(err) } @@ -769,19 +764,19 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. return p9.QID{}, syscall.EBADF } - if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { + if err := unix.Symlinkat(target, l.fd(), newName); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the symlink in case of failure. - if err := syscall.Unlinkat(l.controlFD(), newName); err != nil { + if err := syscall.Unlinkat(l.fd(), newName); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) } }) defer cu.Clean() // Open symlink to change ownership and stat it. - fd, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0) + fd, err := syscall.Openat(l.fd(), newName, unix.O_PATH|openFlags, 0) if err != nil { return p9.QID{}, extractErrno(err) } @@ -810,7 +805,7 @@ func (l *localFile) Link(target p9.File, newName string) error { } targetFile := target.(*localFile) - if err := unix.Linkat(targetFile.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH); err != nil { + if err := unix.Linkat(targetFile.fd(), "", l.fd(), newName, linux.AT_EMPTY_PATH); err != nil { return extractErrno(err) } return nil @@ -833,7 +828,7 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error { return syscall.EBADF } - if err := unix.Unlinkat(l.controlFD(), name, int(flags)); err != nil { + if err := unix.Unlinkat(l.fd(), name, int(flags)); err != nil { return extractErrno(err) } return nil @@ -844,7 +839,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { return nil, syscall.EBADF } - if l.openedFile == nil { + if !l.isOpen() { return nil, syscall.EBADF } @@ -852,11 +847,11 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { // reading all directory contents. Take a lock because this operation is // stateful. 
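The Readdir implementation that follows rewinds the directory, lists every name, and then stats entries starting at the requested offset, skipping any entry that disappears between the listing and the stat. A simplified standalone version of that shape (not the gofer code itself, and without its p9.Dirent plumbing or locking):

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// readDirents lists all names in dir, then returns up to 'count' entries
// starting at 'offset', silently skipping entries that vanish before they
// can be stat'ed.
func readDirents(dir string, offset, count int) ([]string, error) {
	f, err := os.Open(dir)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	names, err := f.Readdirnames(-1)
	if err != nil {
		return nil, err
	}

	var out []string
	for i := offset; i < len(names) && len(out) < count; i++ {
		if _, err := os.Lstat(filepath.Join(dir, names[i])); err != nil {
			continue // entry vanished; skip it
		}
		out = append(out, names[i])
	}
	return out, nil
}

func main() {
	ents, err := readDirents("/etc", 0, 5)
	fmt.Println(ents, err)
}
```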
l.readDirMu.Lock() - if _, err := l.openedFile.Seek(0, 0); err != nil { + if _, err := l.file.Seek(0, 0); err != nil { l.readDirMu.Unlock() return nil, extractErrno(err) } - names, err := l.openedFile.Readdirnames(-1) + names, err := l.file.Readdirnames(-1) if err != nil { l.readDirMu.Unlock() return nil, extractErrno(err) @@ -865,7 +860,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { var dirents []p9.Dirent for i := int(offset); i >= 0 && i < len(names); i++ { - stat, err := statAt(l.openedFD(), names[i]) + stat, err := statAt(l.fd(), names[i]) if err != nil { continue } @@ -883,9 +878,10 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { // Readlink implements p9.File. func (l *localFile) Readlink() (string, error) { // Shamelessly stolen from os.Readlink (added upper bound limit to buffer). - for len := 128; len < 1024*1024; len *= 2 { + const limit = 1024 * 1024 + for len := 128; len < limit; len *= 2 { b := make([]byte, len) - n, err := unix.Readlinkat(l.controlFD(), "", b) + n, err := unix.Readlinkat(l.fd(), "", b) if err != nil { return "", extractErrno(err) } @@ -908,20 +904,16 @@ func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { // Close implements p9.File. func (l *localFile) Close() error { - err := l.controlFile.Close() - - // Close only once in case opened and control files point to - // the same os.File struct. - if l.openedFile != nil && l.openedFile != l.controlFile { - err = l.openedFile.Close() - } - - l.openedFile = nil - l.controlFile = nil l.mode = invalidMode + err := l.file.Close() + l.file = nil return err } +func (l *localFile) isOpen() bool { + return l.mode != invalidMode +} + // Renamed implements p9.Renamed. func (l *localFile) Renamed(newDir p9.File, newName string) { l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) -- cgit v1.2.3 From 5d94c893ae38f09f5132ab43d48204ab49121960 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 27 Mar 2019 11:08:33 -0700 Subject: gvisor/runsc: address typos from github Fixes: https://github.com/google/gvisor/issues/143 Fixes #143 PiperOrigin-RevId: 240600719 Change-Id: Id1731b9969f98e32e52e144a6643e12b0b70f168 --- runsc/cmd/list.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index 481066225..1dcea2af0 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -42,7 +42,7 @@ func (*List) Name() string { // Synopsis implements subcommands.Command.Synopsis. func (*List) Synopsis() string { - return "list contaners started by runsc with the given root" + return "list containers started by runsc with the given root" } // Usage implements subcommands.Command.Usage. -- cgit v1.2.3 From 6cb0b1881a6dacf2a85d1e904460a2aaae63e562 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 27 Mar 2019 15:45:46 -0700 Subject: Automated rollback of changelist 240502097 PiperOrigin-RevId: 240657604 Change-Id: Ida15dee83337867c560427eae0b4b9ce1051dbb8 --- runsc/fsgofer/fsgofer.go | 164 +++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 78 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 1e422f30b..9955d0750 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -175,21 +175,23 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { } // localFile implements p9.File wrapping a local file. 
The underlying file -// is opened during Walk() and stored in 'file' to be used with other -// operations. The file is opened as readonly, unless it's a symlink which -// requires O_PATH. 'file' is dup'ed when Walk(nil) is called to clone the file. -// This reduces the number of walks that need to be done by the host file -// system when files are reused. +// is opened during Walk() and stored in 'controlFile' to be used with other +// operations. The control file is opened as readonly, unless it's a symlink +// which requires O_PATH. 'controlFile' is dup'ed when Walk(nil) is called +// to clone the file. This reduces the number of walks that need to be done by +// the host file system when files are reused. // -// The file may be reopened if the requested mode in Open() is not a subset of -// current mode. Consequently, 'file' could have a mode wider than requested and -// must be verified before read/write operations. Before the file is opened and -// after it's closed, 'mode' is set to an invalid value to prevent an unopened -// file from being used. +// 'openedFile' is assigned when Open() is called. If requested open mode is +// a subset of controlFile's mode, it's possible to use the same file. If mode +// is not a subset, then another file is opened. Consequently, 'openedFile' +// could have a mode wider than requested and must be verified before read/write +// operations. Before the file is opened and after it's closed, 'mode' is set to +// an invalid value to prevent an unopened file from being used. // -// The reason that the file is not opened initially as read-write is for better +// The reason that the control file is never opened as read-write is for better // performance with 'overlay2' storage driver. overlay2 eagerly copies the // entire file up when it's opened in write mode, and would perform badly when +// multiple files are being opened for read-only (esp. startup). type localFile struct { p9.DefaultWalkGetAttr @@ -199,9 +201,12 @@ type localFile struct { // hostPath will be safely updated by the Renamed hook. hostPath string - // file is opened when localFile is created and it's never nil. It may be - // reopened... - file *os.File + // controlFile is opened when localFile is created and it's never nil. + controlFile *os.File + + // openedFile is nil until localFile is opened. It may point to controlFile + // or be a new file struct. See struct comment for more details. + openedFile *os.File // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. @@ -223,7 +228,7 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { var err error var fd int for i, mode := range modes { - fd, err = syscall.Openat(parent.fd(), name, openFlags|mode, 0) + fd, err = syscall.Openat(parent.controlFD(), name, openFlags|mode, 0) if err == nil { // openat succeeded, we're done. break @@ -235,11 +240,11 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { } // openat failed. Try again with next mode, preserving 'err' in case this // was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|mode, parent.file.Name(), name, err) + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|mode, parent.controlFile.Name(), name, err) } if err != nil { // All attempts to open file have failed, return the last error. 
- log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.file.Name(), name, err) + log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.controlFile.Name(), name, err) return nil, "", extractErrno(err) } @@ -262,7 +267,7 @@ func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_ return &localFile{ attachPoint: a, hostPath: path, - file: file, + controlFile: file, mode: invalidMode, ft: ft, }, nil @@ -297,26 +302,33 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error { return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } -func (l *localFile) fd() int { - return int(l.file.Fd()) +func (l *localFile) controlFD() int { + return int(l.controlFile.Fd()) +} + +func (l *localFile) openedFD() int { + if l.openedFile == nil { + panic(fmt.Sprintf("trying to use an unopened file: %q", l.controlFile.Name())) + } + return int(l.openedFile.Fd()) } // Open implements p9.File. func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - if l.isOpen() { - panic(fmt.Sprintf("attempting to open already opened file: %q", l.file.Name())) + if l.openedFile != nil { + panic(fmt.Sprintf("attempting to open already opened file: %q", l.controlFile.Name())) } // Check if control file can be used or if a new open must be created. var newFile *os.File if mode == p9.ReadOnly { - log.Debugf("Open reusing control file, mode: %v, %q", mode, l.file.Name()) - newFile = l.file + log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name()) + newFile = l.controlFile } else { // Ideally reopen would call name_to_handle_at (with empty name) and // open_by_handle_at to reopen the file without using 'hostPath'. However, // name_to_handle_at and open_by_handle_at aren't supported by overlay2. - log.Debugf("Open reopening file, mode: %v, %q", mode, l.file.Name()) + log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) var err error newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) @@ -327,9 +339,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { stat, err := stat(int(newFile.Fd())) if err != nil { - if newFile != l.file { - newFile.Close() - } + newFile.Close() return nil, p9.QID{}, 0, extractErrno(err) } @@ -339,13 +349,8 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { fd = newFDMaybe(newFile) } - // Close old file in case a new one was created. - if newFile != l.file { - if err := l.file.Close(); err != nil { - log.Warningf("Error closing file %q: %v", l.file.Name(), err) - } - l.file = newFile - } + // Set fields on success + l.openedFile = newFile l.mode = mode return fd, l.attachPoint.makeQID(stat), 0, nil } @@ -360,9 +365,10 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, syscall.EBADF } - // 'file' may be used for other operations (e.g. Walk), so read access is - // always added to flags. Note that resulting file might have a wider mode - // than needed for each particular case. + // Use a single file for both 'controlFile' and 'openedFile'. Mode must + // include read for control and whichever else was requested by caller. Note + // that resulting file might have a wider mode than needed for each particular + // case. 
flags := openFlags | syscall.O_CREAT | syscall.O_EXCL if mode == p9.WriteOnly { flags |= syscall.O_RDWR @@ -370,14 +376,14 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid flags |= mode.OSFlags() } - fd, err := syscall.Openat(l.fd(), name, flags, uint32(perm.Permissions())) + fd, err := syscall.Openat(l.controlFD(), name, flags, uint32(perm.Permissions())) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } cu := specutils.MakeCleanup(func() { syscall.Close(fd) // Best effort attempt to remove the file in case of failure. - if err := syscall.Unlinkat(l.fd(), name); err != nil { + if err := syscall.Unlinkat(l.controlFD(), name); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -396,12 +402,13 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid c := &localFile{ attachPoint: l.attachPoint, hostPath: cPath, - file: f, + controlFile: f, + openedFile: f, mode: mode, } cu.Release() - return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil + return newFDMaybe(c.openedFile), c, l.attachPoint.makeQID(stat), 0, nil } // Mkdir implements p9.File. @@ -414,12 +421,12 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) return p9.QID{}, syscall.EBADF } - if err := syscall.Mkdirat(l.fd(), name, uint32(perm.Permissions())); err != nil { + if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the dir in case of failure. - if err := unix.Unlinkat(l.fd(), name, unix.AT_REMOVEDIR); err != nil { + if err := unix.Unlinkat(l.controlFD(), name, unix.AT_REMOVEDIR); err != nil { log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -427,7 +434,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) // Open directory to change ownership and stat it. flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags - fd, err := syscall.Openat(l.fd(), name, flags, 0) + fd, err := syscall.Openat(l.controlFD(), name, flags, 0) if err != nil { return p9.QID{}, extractErrno(err) } @@ -449,7 +456,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - newFd, err := syscall.Dup(l.fd()) + newFd, err := syscall.Dup(l.controlFD()) if err != nil { return nil, nil, extractErrno(err) } @@ -462,7 +469,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { c := &localFile{ attachPoint: l.attachPoint, hostPath: l.hostPath, - file: os.NewFile(uintptr(newFd), l.hostPath), + controlFile: os.NewFile(uintptr(newFd), l.hostPath), mode: invalidMode, } return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil @@ -477,12 +484,10 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { } stat, err := stat(int(f.Fd())) if err != nil { - f.Close() return nil, nil, extractErrno(err) } c, err := newLocalFile(last.attachPoint, f, path, stat) if err != nil { - f.Close() return nil, nil, extractErrno(err) } @@ -495,7 +500,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // StatFS implements p9.File. 
func (l *localFile) StatFS() (p9.FSStat, error) { var s syscall.Statfs_t - if err := syscall.Fstatfs(l.fd(), &s); err != nil { + if err := syscall.Fstatfs(l.controlFD(), &s); err != nil { return p9.FSStat{}, extractErrno(err) } @@ -514,10 +519,10 @@ func (l *localFile) StatFS() (p9.FSStat, error) { // FSync implements p9.File. func (l *localFile) FSync() error { - if !l.isOpen() { + if l.openedFile == nil { return syscall.EBADF } - if err := l.file.Sync(); err != nil { + if err := l.openedFile.Sync(); err != nil { return extractErrno(err) } return nil @@ -525,7 +530,7 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.fd()) + stat, err := stat(l.controlFD()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } @@ -593,11 +598,11 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // Handle all the sanity checks up front so that the client gets a // consistent result that is not attribute dependent. if !valid.IsSubsetOf(allowed) { - log.Warningf("SetAttr() failed for %q, mask: %v", l.file.Name(), valid) + log.Warningf("SetAttr() failed for %q, mask: %v", l.controlFile.Name(), valid) return syscall.EPERM } - fd := l.fd() + fd := l.controlFD() if l.ft == regular { // Regular files are opened in RO mode, thus it needs to be reopened here // for write. @@ -714,7 +719,7 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) } newParent := directory.(*localFile) - if err := renameat(l.fd(), oldName, newParent.fd(), newName); err != nil { + if err := renameat(l.controlFD(), oldName, newParent.controlFD(), newName); err != nil { return extractErrno(err) } return nil @@ -725,11 +730,11 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { return 0, syscall.EBADF } - if !l.isOpen() { + if l.openedFile == nil { return 0, syscall.EBADF } - r, err := l.file.ReadAt(p, int64(offset)) + r, err := l.openedFile.ReadAt(p, int64(offset)) switch err { case nil, io.EOF: return r, nil @@ -743,11 +748,11 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { return 0, syscall.EBADF } - if !l.isOpen() { + if l.openedFile == nil { return 0, syscall.EBADF } - w, err := l.file.WriteAt(p, int64(offset)) + w, err := l.openedFile.WriteAt(p, int64(offset)) if err != nil { return w, extractErrno(err) } @@ -764,19 +769,19 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. return p9.QID{}, syscall.EBADF } - if err := unix.Symlinkat(target, l.fd(), newName); err != nil { + if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the symlink in case of failure. - if err := syscall.Unlinkat(l.fd(), newName); err != nil { + if err := syscall.Unlinkat(l.controlFD(), newName); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) } }) defer cu.Clean() // Open symlink to change ownership and stat it. 
- fd, err := syscall.Openat(l.fd(), newName, unix.O_PATH|openFlags, 0) + fd, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0) if err != nil { return p9.QID{}, extractErrno(err) } @@ -805,7 +810,7 @@ func (l *localFile) Link(target p9.File, newName string) error { } targetFile := target.(*localFile) - if err := unix.Linkat(targetFile.fd(), "", l.fd(), newName, linux.AT_EMPTY_PATH); err != nil { + if err := unix.Linkat(targetFile.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH); err != nil { return extractErrno(err) } return nil @@ -828,7 +833,7 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error { return syscall.EBADF } - if err := unix.Unlinkat(l.fd(), name, int(flags)); err != nil { + if err := unix.Unlinkat(l.controlFD(), name, int(flags)); err != nil { return extractErrno(err) } return nil @@ -839,7 +844,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { return nil, syscall.EBADF } - if !l.isOpen() { + if l.openedFile == nil { return nil, syscall.EBADF } @@ -847,11 +852,11 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { // reading all directory contents. Take a lock because this operation is // stateful. l.readDirMu.Lock() - if _, err := l.file.Seek(0, 0); err != nil { + if _, err := l.openedFile.Seek(0, 0); err != nil { l.readDirMu.Unlock() return nil, extractErrno(err) } - names, err := l.file.Readdirnames(-1) + names, err := l.openedFile.Readdirnames(-1) if err != nil { l.readDirMu.Unlock() return nil, extractErrno(err) @@ -860,7 +865,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { var dirents []p9.Dirent for i := int(offset); i >= 0 && i < len(names); i++ { - stat, err := statAt(l.fd(), names[i]) + stat, err := statAt(l.openedFD(), names[i]) if err != nil { continue } @@ -878,10 +883,9 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { // Readlink implements p9.File. func (l *localFile) Readlink() (string, error) { // Shamelessly stolen from os.Readlink (added upper bound limit to buffer). - const limit = 1024 * 1024 - for len := 128; len < limit; len *= 2 { + for len := 128; len < 1024*1024; len *= 2 { b := make([]byte, len) - n, err := unix.Readlinkat(l.fd(), "", b) + n, err := unix.Readlinkat(l.controlFD(), "", b) if err != nil { return "", extractErrno(err) } @@ -904,16 +908,20 @@ func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { // Close implements p9.File. func (l *localFile) Close() error { + err := l.controlFile.Close() + + // Close only once in case opened and control files point to + // the same os.File struct. + if l.openedFile != nil && l.openedFile != l.controlFile { + err = l.openedFile.Close() + } + + l.openedFile = nil + l.controlFile = nil l.mode = invalidMode - err := l.file.Close() - l.file = nil return err } -func (l *localFile) isOpen() bool { - return l.mode != invalidMode -} - // Renamed implements p9.Renamed. 
func (l *localFile) Renamed(newDir p9.File, newName string) { l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) -- cgit v1.2.3 From 1d7e2bc3776f90e1b2b31346e1bec47da6e568ff Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Thu, 28 Mar 2019 11:41:37 -0700 Subject: gofer: some fixs in setupRootFS 1.use root instead of spec.Root.path as mountpoint 2.put remount readonly logic ahead to avoid device busy errors Signed-off-by: Liu Hua Change-Id: I9222b4695f917136a97b0898ac6f75fcff296e5d PiperOrigin-RevId: 240818182 --- runsc/cmd/gofer.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index e712244ef..82487887c 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -285,14 +285,15 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error { // Mount root path followed by submounts. if err := syscall.Mount(spec.Root.Path, root, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { - return fmt.Errorf("mounting root on root (%q) err: %v", spec.Root.Path, err) + return fmt.Errorf("mounting root on root (%q) err: %v", root, err) } + flags := uint32(syscall.MS_SLAVE | syscall.MS_REC) if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation}) } - if err := syscall.Mount("", spec.Root.Path, "", uintptr(flags), ""); err != nil { - return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", spec.Root.Path, flags, err) + if err := syscall.Mount("", root, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err) } // Replace the current spec, with the clean spec with symlinks resolved. @@ -315,10 +316,10 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error { if spec.Root.Readonly { // If root is a mount point but not read-only, we can change mount options // to make it read-only for extra safety. - log.Infof("Remounting root as readonly: %q", spec.Root.Path) + log.Infof("Remounting root as readonly: %q", root) flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) - if err := syscall.Mount(spec.Root.Path, spec.Root.Path, "bind", flags, ""); err != nil { - return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) + if err := syscall.Mount(root, root, "bind", flags, ""); err != nil { + return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err) } } -- cgit v1.2.3 From dcf66133314712b9ba042dbbb289c29d00a2497a Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 29 Mar 2019 14:54:17 -0700 Subject: Set container.CreatedAt in Create(). PiperOrigin-RevId: 241056805 Change-Id: I13ea8f5dbfb01ca02a3b0ab887b8c3bdf4d556a6 --- runsc/container/container.go | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index fdcf8d7b7..687b89935 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -267,6 +267,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo BundleDir: bundleDir, Root: containerRoot, Status: Creating, + CreatedAt: time.Now(), Owner: os.Getenv("USER"), } // The Cleanup object cleans up partially created containers when an error occurs. 
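As an aside on the setupRootFS change above: the gofer relies on the usual two-step Linux pattern in which a bind mount is created first and a second MS_REMOUNT mount makes it read-only, since MS_RDONLY is ignored when the bind itself is created. A minimal standalone sketch of that pattern (hypothetical paths, requires CAP_SYS_ADMIN; the real code additionally applies the spec's propagation flags):

package main

import (
	"fmt"
	"syscall"
)

// bindReadOnly bind-mounts src onto dst and then remounts the bind read-only.
func bindReadOnly(src, dst string) error {
	// Step 1: create the bind mount. MS_RDONLY would be silently ignored here.
	if err := syscall.Mount(src, dst, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("bind mounting %q on %q: %v", src, dst, err)
	}
	// Step 2: remount the bind read-only, using the same flag combination as
	// the setupRootFS diff above.
	flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC)
	if err := syscall.Mount(dst, dst, "bind", flags, ""); err != nil {
		return fmt.Errorf("remounting %q read-only: %v", dst, err)
	}
	return nil
}

func main() {
	// Hypothetical paths for illustration only.
	if err := bindReadOnly("/tmp/ro-src", "/tmp/ro-dst"); err != nil {
		fmt.Println(err)
	}
}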
-- cgit v1.2.3 From a046054ba35e8d8c4882f9311dc964eaa1497d58 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 29 Mar 2019 16:26:36 -0700 Subject: gvisor/runsc: enable generic segmentation offload (GSO) The linux packet socket can handle GSO packets, so we can segment packets to 64K instead of the MTU which is usually 1500. Here are numbers for the nginx-1m test: runsc: 579330.01 [Kbytes/sec] received runsc-gso: 1794121.66 [Kbytes/sec] received runc: 2122139.06 [Kbytes/sec] received and for tcp_benchmark: $ tcp_benchmark --duration 15 --ideal [ 4] 0.0-15.0 sec 86647 MBytes 48456 Mbits/sec $ tcp_benchmark --client --duration 15 --ideal [ 4] 0.0-15.0 sec 2173 MBytes 1214 Mbits/sec $ tcp_benchmark --client --duration 15 --ideal --gso 65536 [ 4] 0.0-15.0 sec 19357 MBytes 10825 Mbits/sec PiperOrigin-RevId: 241072403 Change-Id: I20b03063a1a6649362b43609cbbc9b59be06e6d5 --- WORKSPACE | 2 +- runsc/boot/config.go | 3 +++ runsc/boot/filter/config.go | 8 ++++++ runsc/boot/network.go | 10 +++++--- runsc/main.go | 2 ++ runsc/sandbox/BUILD | 2 ++ runsc/sandbox/network.go | 18 +++++++++++-- runsc/sandbox/network_unsafe.go | 56 +++++++++++++++++++++++++++++++++++++++++ 8 files changed, 94 insertions(+), 7 deletions(-) create mode 100644 runsc/sandbox/network_unsafe.go (limited to 'runsc') diff --git a/WORKSPACE b/WORKSPACE index 34c0a7abb..975ecd84d 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -87,7 +87,7 @@ go_repository( go_repository( name = "com_github_vishvananda_netlink", importpath = "github.com/vishvananda/netlink", - commit = "d35d6b58e1cb692b27b94fc403170bf44058ac3e", + commit = "adb577d4a45e341da53c4d9196ad4222c9a23e69", ) go_repository( diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 626fcabdd..2523077fd 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -175,6 +175,9 @@ type Config struct { // Network indicates what type of network to use. Network NetworkType + // GSO indicates that generic segmentation offload is enabled. + GSO bool + // LogPackets indicates that all network packets should be logged. LogPackets bool diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 1ba5b7257..9c72e3b1a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -256,12 +256,20 @@ var allowedSyscalls = seccomp.SyscallRules{ }, }, syscall.SYS_WRITE: {}, + // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with + // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR + // option is enabled for a packet socket. syscall.SYS_WRITEV: []seccomp.Rule{ { seccomp.AllowAny{}, seccomp.AllowAny{}, seccomp.AllowValue(2), }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(3), + }, }, } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index f025a42f1..77291415b 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -52,10 +52,11 @@ type DefaultRoute struct { // FDBasedLink configures an fd-based link. type FDBasedLink struct { - Name string - MTU int - Addresses []net.IP - Routes []Route + Name string + MTU int + Addresses []net.IP + Routes []Route + GSOMaxSize uint32 } // LoopbackLink configures a loopback li nk. 
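The host-side mechanism behind this GSO change is the PACKET_VNET_HDR socket option, set on the AF_PACKET socket in runsc/sandbox/network.go further below; it makes the kernel prepend a virtio-net header to each packet so segments larger than the MTU can cross the socket. A minimal standalone sketch of just that option (assumes CAP_NET_RAW; binding the socket to a specific interface is omitted):

package main

import (
	"fmt"
	"syscall"

	"golang.org/x/sys/unix"
)

func main() {
	// Raw AF_PACKET socket; protocol 0 so it receives nothing until bound.
	fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, 0)
	if err != nil {
		fmt.Println("socket:", err)
		return
	}
	defer syscall.Close(fd)

	// Ask the kernel to pass a virtio-net header with each packet, which is
	// what allows reading and writing GSO-sized segments.
	if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
		fmt.Println("PACKET_VNET_HDR:", err)
		return
	}
	fmt.Println("PACKET_VNET_HDR enabled")
}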
@@ -140,6 +141,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct EthernetHeader: true, Address: mac, PacketDispatchMode: fdbased.PacketMMap, + GSOMaxSize: link.GSOMaxSize, }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) diff --git a/runsc/main.go b/runsc/main.go index 82c37ec11..4b3f55ad1 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -59,6 +59,7 @@ var ( // Flags that control sandbox runtime behavior. platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + gso = flag.Bool("gso", true, "enable generic segmenation offload") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") @@ -141,6 +142,7 @@ func main() { FileAccess: fsAccess, Overlay: *overlay, Network: netType, + GSO: *gso, LogPackets: *logPackets, Platform: platformType, Strace: *strace, diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 2ed793333..c0de9a28f 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -6,6 +6,7 @@ go_library( name = "sandbox", srcs = [ "network.go", + "network_unsafe.go", "sandbox.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/sandbox", @@ -27,5 +28,6 @@ go_library( "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index ec0a252d1..be924ae25 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" @@ -67,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -137,7 +138,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error { // Join the network namespace that we will be copying. 
restore, err := joinNetNS(nsPath) if err != nil { @@ -246,6 +247,19 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) } + if enableGSO { + gso, err := isGSOEnabled(fd, iface.Name) + if err != nil { + return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + } + if gso { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize + } + } + // Collect the addresses for the interface, enable forwarding, // and remove them from the host. for _, addr := range ip4addrs { diff --git a/runsc/sandbox/network_unsafe.go b/runsc/sandbox/network_unsafe.go new file mode 100644 index 000000000..f7447f002 --- /dev/null +++ b/runsc/sandbox/network_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +type ethtoolValue struct { + cmd uint32 + val uint32 +} + +type ifreq struct { + ifrName [unix.IFNAMSIZ]byte + ifrData *ethtoolValue +} + +const ( + _ETHTOOL_GGSO = 0x00000023 +) + +func isGSOEnabled(fd int, intf string) (bool, error) { + val := ethtoolValue{ + cmd: _ETHTOOL_GGSO, + } + + var name [unix.IFNAMSIZ]byte + copy(name[:], []byte(intf)) + + ifr := ifreq{ + ifrName: name, + ifrData: &val, + } + + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), unix.SIOCETHTOOL, uintptr(unsafe.Pointer(&ifr))); err != 0 { + return false, err + } + + return val.val != 0, nil +} -- cgit v1.2.3 From 33c644bc0b8f60544264dcd5cc5f8c70436cd874 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Mon, 1 Apr 2019 15:00:49 -0700 Subject: gofer: ignore unsupported files 'ls' will hang if there is any FIFO in this path. So return EPERM if unsupported file occurs and add NONBLOCK flag when opening file to avoid blocking on FIFO read. Signed-off-by: Liu Hua Change-Id: I8b9a2a48322118d8ad531dd226395438123eb047 PiperOrigin-RevId: 241406726 --- runsc/fsgofer/fsgofer.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 9955d0750..1d64458e5 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -52,6 +52,7 @@ const ( regular fileType = iota directory symlink + unknown ) // String implements fmt.Stringer. @@ -221,9 +222,11 @@ type localFile struct { func openAnyFile(parent *localFile, name string) (*os.File, string, error) { // Attempt to open file in the following mode in order: - // 1. RDONLY: for all files, works for directories and ro mounts too + // 1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too. + // Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option + // has no effect on regular files. // 2. 
PATH: for symlinks - modes := []int{syscall.O_RDONLY, unix.O_PATH} + modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} var err error var fd int @@ -252,7 +255,7 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { return os.NewFile(uintptr(fd), newPath), newPath, nil } -func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) { +func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { var ft fileType switch stat.Mode & syscall.S_IFMT { case syscall.S_IFREG: @@ -262,8 +265,17 @@ func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_ case syscall.S_IFLNK: ft = symlink default: - return nil, syscall.EINVAL + return unknown, syscall.EPERM } + return ft, nil +} + +func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) { + ft, err := getSupportedFileType(stat) + if err != nil { + return nil, err + } + return &localFile{ attachPoint: a, hostPath: path, @@ -484,10 +496,12 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { } stat, err := stat(int(f.Fd())) if err != nil { + f.Close() return nil, nil, extractErrno(err) } c, err := newLocalFile(last.attachPoint, f, path, stat) if err != nil { + f.Close() return nil, nil, extractErrno(err) } -- cgit v1.2.3 From 7543e9ec2043af7d071373aeec04b92a98051087 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 1 Apr 2019 16:17:40 -0700 Subject: Add release hook and version flag PiperOrigin-RevId: 241421671 Change-Id: Ic0cebfe3efd458dc42c49f7f812c13318705199a --- runsc/BUILD | 6 +++-- runsc/main.go | 15 +++++++++---- runsc/specutils/specutils.go | 3 +++ runsc/version.go | 18 +++++++++++++++ tools/tag_release.sh | 53 ++++++++++++++++++++++++++++++++++++++++++++ tools/workspace_status.sh | 2 +- 6 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 runsc/version.go create mode 100755 tools/tag_release.sh (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index e390b7bae..eb7503502 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -6,12 +6,13 @@ go_binary( name = "runsc", srcs = [ "main.go", + "version.go", ], pure = "on", visibility = [ "//visibility:public", ], - x_defs = {"main.gitRevision": "{GIT_REVISION}"}, + x_defs = {"main.version": "{VERSION}"}, deps = [ "//pkg/log", "//runsc/boot", @@ -36,12 +37,13 @@ go_binary( name = "runsc-race", srcs = [ "main.go", + "version.go", ], static = "on", visibility = [ "//visibility:public", ], - x_defs = {"main.gitRevision": "{GIT_REVISION}"}, + x_defs = {"main.version": "{VERSION}"}, deps = [ "//pkg/log", "//runsc/boot", diff --git a/runsc/main.go b/runsc/main.go index 4b3f55ad1..bbf08228c 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -18,6 +18,7 @@ package main import ( "context" + "fmt" "io" "os" "path/filepath" @@ -40,6 +41,7 @@ var ( logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout") logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s") debug = flag.Bool("debug", false, "enable debug logging") + showVersion = flag.Bool("version", false, "show version and exit") // These flags are unique to runsc, and are used to configure parts of the // system that are not covered by the runtime spec. @@ -69,9 +71,6 @@ var ( testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! 
This skips many security measures that isolate the host from the sandbox.") ) -// gitRevision is set during linking. -var gitRevision = "" - func main() { // Help and flags commands are generated automatically. subcommands.Register(subcommands.HelpCommand(), "") @@ -107,6 +106,14 @@ func main() { // All subcommands must be registered before flag parsing. flag.Parse() + // Are we showing the version? + if *showVersion { + // The format here is the same as runc. + fmt.Fprintf(os.Stdout, "runsc version %s\n", version) + fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version) + os.Exit(0) + } + platformType, err := boot.MakePlatformType(*platform) if err != nil { cmd.Fatalf("%v", err) @@ -215,7 +222,7 @@ func main() { log.Infof("***************************") log.Infof("Args: %s", os.Args) - log.Infof("Git Revision: %s", gitRevision) + log.Infof("Version %s", version) log.Infof("PID: %d", os.Getpid()) log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) log.Infof("Configuration:") diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index cbf099c64..af8d34535 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -38,6 +38,9 @@ import ( // changed in tests that aren't linked in the same binary. var ExePath = "/proc/self/exe" +// Version is the supported spec version. +var Version = specs.Version + // LogSpec logs the spec in a human-friendly way. func LogSpec(spec *specs.Spec) { log.Debugf("Spec: %+v", spec) diff --git a/runsc/version.go b/runsc/version.go new file mode 100644 index 000000000..4894f2de6 --- /dev/null +++ b/runsc/version.go @@ -0,0 +1,18 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +// version is set during linking. +var version = "" diff --git a/tools/tag_release.sh b/tools/tag_release.sh new file mode 100755 index 000000000..6906a952f --- /dev/null +++ b/tools/tag_release.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script will optionally map a PiperOrigin-RevId to a given commit, +# validate a provided release name, create a tag and push it. It must be +# run manually when a release is created. + +set -euxo pipefail + +# Check arguments. +if [ "$#" -ne 2 ]; then + echo "usage: $0 " + exit 1 +fi + +commit=$1 +release=$2 + +# Is the passed identifier a sha commit? +if ! git show "${commit}" &> /dev/null; then + # Extract the commit given a piper ID. 
+ commit=$(git log|grep -E "(^commit |^ PiperOrigin-RevId:)" |grep -B1 "RevId: ${commit}"| head -n1|cut -d" " -f2) +fi +if ! git show "${commit}" &> /dev/null; then + echo "unknown commit: ${commit}" + exit 1 +fi + +# Is the release name sane? Must be a date with patch/rc. +if ! [[ "${release}" =~ ^20[0-9]{6}\.[0-9]+$ ]]; then + expected=$(date +%Y%m%d.0) # Use today's date. + echo "unexpected release format: ${release}" + echo " ... expected like ${expected}" + exit 1 +fi + +# Tag the given commit. +tag="release-${release}" +(git tag "${tag}" "${commit}" && git push origin tag "${tag}") || \ + (git tag -d "${tag}" && false) diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh index 7d44dad37..a0e646e45 100755 --- a/tools/workspace_status.sh +++ b/tools/workspace_status.sh @@ -14,4 +14,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -echo GIT_REVISION $(git describe --always --abbrev=40 --dirty) +echo VERSION $(git describe --always --tags --abbrev=12 --dirty) -- cgit v1.2.3 From 1df3fa69977477092efa65a8de407bd6f0f88db4 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 1 Apr 2019 17:28:54 -0700 Subject: Automated rollback of changelist 240657604 PiperOrigin-RevId: 241434161 Change-Id: I9ec734e50cef5b39203e8bf37de2d91d24943f1e --- runsc/fsgofer/fsgofer.go | 218 +++++++++++++++++++++++------------------- runsc/fsgofer/fsgofer_test.go | 168 ++++++++++++++++++-------------- 2 files changed, 212 insertions(+), 174 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 1d64458e5..45b455430 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -176,23 +176,21 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { } // localFile implements p9.File wrapping a local file. The underlying file -// is opened during Walk() and stored in 'controlFile' to be used with other -// operations. The control file is opened as readonly, unless it's a symlink -// which requires O_PATH. 'controlFile' is dup'ed when Walk(nil) is called -// to clone the file. This reduces the number of walks that need to be done by -// the host file system when files are reused. +// is opened during Walk() and stored in 'file' to be used with other +// operations. The file is opened as readonly, unless it's a symlink or there is +// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is +// called to clone the file. This reduces the number of walks that need to be +// done by the host file system when files are reused. // -// 'openedFile' is assigned when Open() is called. If requested open mode is -// a subset of controlFile's mode, it's possible to use the same file. If mode -// is not a subset, then another file is opened. Consequently, 'openedFile' -// could have a mode wider than requested and must be verified before read/write -// operations. Before the file is opened and after it's closed, 'mode' is set to -// an invalid value to prevent an unopened file from being used. +// The file may be reopened if the requested mode in Open() is not a subset of +// current mode. Consequently, 'file' could have a mode wider than requested and +// must be verified before read/write operations. Before the file is opened and +// after it's closed, 'mode' is set to an invalid value to prevent an unopened +// file from being used. 
// -// The reason that the control file is never opened as read-write is for better +// The reason that the file is not opened initially as read-write is for better // performance with 'overlay2' storage driver. overlay2 eagerly copies the // entire file up when it's opened in write mode, and would perform badly when -// multiple files are being opened for read-only (esp. startup). type localFile struct { p9.DefaultWalkGetAttr @@ -202,12 +200,9 @@ type localFile struct { // hostPath will be safely updated by the Renamed hook. hostPath string - // controlFile is opened when localFile is created and it's never nil. - controlFile *os.File - - // openedFile is nil until localFile is opened. It may point to controlFile - // or be a new file struct. See struct comment for more details. - openedFile *os.File + // file is opened when localFile is created and it's never nil. It may be + // reopened... + file *os.File // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. @@ -220,7 +215,22 @@ type localFile struct { readDirMu sync.Mutex } -func openAnyFile(parent *localFile, name string) (*os.File, string, error) { +func openAnyFileFromParent(parent *localFile, name string) (*os.File, string, error) { + path := path.Join(parent.hostPath, name) + f, err := openAnyFile(path, func(mode int) (*os.File, error) { + fd, err := syscall.Openat(parent.fd(), name, openFlags|mode, 0) + if err != nil { + return nil, err + } + return os.NewFile(uintptr(fd), path), nil + }) + return f, path, err +} + +// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback +// to O_PATH. 'path' is used for logging messages only. 'fn' is what does the +// actual file open and is customizable by the caller. +func openAnyFile(path string, fn func(mode int) (*os.File, error)) (*os.File, error) { // Attempt to open file in the following mode in order: // 1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too. // Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option @@ -229,9 +239,9 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} var err error - var fd int + var file *os.File for i, mode := range modes { - fd, err = syscall.Openat(parent.controlFD(), name, openFlags|mode, 0) + file, err = fn(mode) if err == nil { // openat succeeded, we're done. break @@ -239,20 +249,19 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { switch e := extractErrno(err); e { case syscall.ENOENT: // File doesn't exist, no point in retrying. - return nil, "", e + return nil, e } // openat failed. Try again with next mode, preserving 'err' in case this // was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|mode, parent.controlFile.Name(), name, err) + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err) } if err != nil { // All attempts to open file have failed, return the last error. 
- log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.controlFile.Name(), name, err) - return nil, "", extractErrno(err) + log.Debugf("Failed to open file, path: %q, err: %v", path, err) + return nil, extractErrno(err) } - newPath := path.Join(parent.hostPath, name) - return os.NewFile(uintptr(fd), newPath), newPath, nil + return file, nil } func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { @@ -279,7 +288,7 @@ func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_ return &localFile{ attachPoint: a, hostPath: path, - controlFile: file, + file: file, mode: invalidMode, ft: ft, }, nil @@ -314,33 +323,26 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error { return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } -func (l *localFile) controlFD() int { - return int(l.controlFile.Fd()) -} - -func (l *localFile) openedFD() int { - if l.openedFile == nil { - panic(fmt.Sprintf("trying to use an unopened file: %q", l.controlFile.Name())) - } - return int(l.openedFile.Fd()) +func (l *localFile) fd() int { + return int(l.file.Fd()) } // Open implements p9.File. func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - if l.openedFile != nil { - panic(fmt.Sprintf("attempting to open already opened file: %q", l.controlFile.Name())) + if l.isOpen() { + panic(fmt.Sprintf("attempting to open already opened file: %q", l.file.Name())) } // Check if control file can be used or if a new open must be created. var newFile *os.File if mode == p9.ReadOnly { - log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name()) - newFile = l.controlFile + log.Debugf("Open reusing control file, mode: %v, %q", mode, l.file.Name()) + newFile = l.file } else { // Ideally reopen would call name_to_handle_at (with empty name) and // open_by_handle_at to reopen the file without using 'hostPath'. However, // name_to_handle_at and open_by_handle_at aren't supported by overlay2. - log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) + log.Debugf("Open reopening file, mode: %v, %q", mode, l.file.Name()) var err error newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) @@ -351,7 +353,9 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { stat, err := stat(int(newFile.Fd())) if err != nil { - newFile.Close() + if newFile != l.file { + newFile.Close() + } return nil, p9.QID{}, 0, extractErrno(err) } @@ -361,8 +365,13 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { fd = newFDMaybe(newFile) } - // Set fields on success - l.openedFile = newFile + // Close old file in case a new one was created. + if newFile != l.file { + if err := l.file.Close(); err != nil { + log.Warningf("Error closing file %q: %v", l.file.Name(), err) + } + l.file = newFile + } l.mode = mode return fd, l.attachPoint.makeQID(stat), 0, nil } @@ -377,10 +386,9 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, syscall.EBADF } - // Use a single file for both 'controlFile' and 'openedFile'. Mode must - // include read for control and whichever else was requested by caller. Note - // that resulting file might have a wider mode than needed for each particular - // case. + // 'file' may be used for other operations (e.g. Walk), so read access is + // always added to flags. Note that resulting file might have a wider mode + // than needed for each particular case. 
flags := openFlags | syscall.O_CREAT | syscall.O_EXCL if mode == p9.WriteOnly { flags |= syscall.O_RDWR @@ -388,14 +396,14 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid flags |= mode.OSFlags() } - fd, err := syscall.Openat(l.controlFD(), name, flags, uint32(perm.Permissions())) + fd, err := syscall.Openat(l.fd(), name, flags, uint32(perm.Permissions())) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } cu := specutils.MakeCleanup(func() { syscall.Close(fd) // Best effort attempt to remove the file in case of failure. - if err := syscall.Unlinkat(l.controlFD(), name); err != nil { + if err := syscall.Unlinkat(l.fd(), name); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -414,13 +422,12 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid c := &localFile{ attachPoint: l.attachPoint, hostPath: cPath, - controlFile: f, - openedFile: f, + file: f, mode: mode, } cu.Release() - return newFDMaybe(c.openedFile), c, l.attachPoint.makeQID(stat), 0, nil + return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil } // Mkdir implements p9.File. @@ -433,12 +440,12 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) return p9.QID{}, syscall.EBADF } - if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { + if err := syscall.Mkdirat(l.fd(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the dir in case of failure. - if err := unix.Unlinkat(l.controlFD(), name, unix.AT_REMOVEDIR); err != nil { + if err := unix.Unlinkat(l.fd(), name, unix.AT_REMOVEDIR); err != nil { log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -446,7 +453,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) // Open directory to change ownership and stat it. flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags - fd, err := syscall.Openat(l.controlFD(), name, flags, 0) + fd, err := syscall.Openat(l.fd(), name, flags, 0) if err != nil { return p9.QID{}, extractErrno(err) } @@ -468,20 +475,34 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - newFd, err := syscall.Dup(l.controlFD()) - if err != nil { - return nil, nil, extractErrno(err) + var newFile *os.File + if l.isOpen() { + // File mode may have changed when it was opened, so open a new one. 
+ var err error + newFile, err = openAnyFile(l.hostPath, func(mode int) (*os.File, error) { + return os.OpenFile(l.hostPath, openFlags|mode, 0) + }) + if err != nil { + return nil, nil, extractErrno(err) + } + } else { + newFd, err := syscall.Dup(l.fd()) + if err != nil { + return nil, nil, extractErrno(err) + } + newFile = os.NewFile(uintptr(newFd), l.hostPath) } - stat, err := stat(newFd) + + stat, err := stat(int(newFile.Fd())) if err != nil { - syscall.Close(newFd) + newFile.Close() return nil, nil, extractErrno(err) } c := &localFile{ attachPoint: l.attachPoint, hostPath: l.hostPath, - controlFile: os.NewFile(uintptr(newFd), l.hostPath), + file: newFile, mode: invalidMode, } return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil @@ -490,7 +511,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { var qids []p9.QID last := l for _, name := range names { - f, path, err := openAnyFile(last, name) + f, path, err := openAnyFileFromParent(last, name) if err != nil { return nil, nil, extractErrno(err) } @@ -514,7 +535,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // StatFS implements p9.File. func (l *localFile) StatFS() (p9.FSStat, error) { var s syscall.Statfs_t - if err := syscall.Fstatfs(l.controlFD(), &s); err != nil { + if err := syscall.Fstatfs(l.fd(), &s); err != nil { return p9.FSStat{}, extractErrno(err) } @@ -533,10 +554,10 @@ func (l *localFile) StatFS() (p9.FSStat, error) { // FSync implements p9.File. func (l *localFile) FSync() error { - if l.openedFile == nil { + if !l.isOpen() { return syscall.EBADF } - if err := l.openedFile.Sync(); err != nil { + if err := l.file.Sync(); err != nil { return extractErrno(err) } return nil @@ -544,7 +565,7 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.controlFD()) + stat, err := stat(l.fd()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } @@ -612,14 +633,14 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // Handle all the sanity checks up front so that the client gets a // consistent result that is not attribute dependent. if !valid.IsSubsetOf(allowed) { - log.Warningf("SetAttr() failed for %q, mask: %v", l.controlFile.Name(), valid) + log.Warningf("SetAttr() failed for %q, mask: %v", l.file.Name(), valid) return syscall.EPERM } - fd := l.controlFD() - if l.ft == regular { - // Regular files are opened in RO mode, thus it needs to be reopened here - // for write. + // Check if it's possible to use cached file, or if another one needs to be + // opened for write. 
+ fd := l.fd() + if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { f, err := os.OpenFile(l.hostPath, openFlags|os.O_WRONLY, 0) if err != nil { return extractErrno(err) @@ -733,7 +754,7 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) } newParent := directory.(*localFile) - if err := renameat(l.controlFD(), oldName, newParent.controlFD(), newName); err != nil { + if err := renameat(l.fd(), oldName, newParent.fd(), newName); err != nil { return extractErrno(err) } return nil @@ -744,11 +765,11 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { return 0, syscall.EBADF } - if l.openedFile == nil { + if !l.isOpen() { return 0, syscall.EBADF } - r, err := l.openedFile.ReadAt(p, int64(offset)) + r, err := l.file.ReadAt(p, int64(offset)) switch err { case nil, io.EOF: return r, nil @@ -762,11 +783,11 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { return 0, syscall.EBADF } - if l.openedFile == nil { + if !l.isOpen() { return 0, syscall.EBADF } - w, err := l.openedFile.WriteAt(p, int64(offset)) + w, err := l.file.WriteAt(p, int64(offset)) if err != nil { return w, extractErrno(err) } @@ -783,19 +804,19 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. return p9.QID{}, syscall.EBADF } - if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { + if err := unix.Symlinkat(target, l.fd(), newName); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the symlink in case of failure. - if err := syscall.Unlinkat(l.controlFD(), newName); err != nil { + if err := syscall.Unlinkat(l.fd(), newName); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) } }) defer cu.Clean() // Open symlink to change ownership and stat it. - fd, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0) + fd, err := syscall.Openat(l.fd(), newName, unix.O_PATH|openFlags, 0) if err != nil { return p9.QID{}, extractErrno(err) } @@ -824,7 +845,7 @@ func (l *localFile) Link(target p9.File, newName string) error { } targetFile := target.(*localFile) - if err := unix.Linkat(targetFile.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH); err != nil { + if err := unix.Linkat(targetFile.fd(), "", l.fd(), newName, linux.AT_EMPTY_PATH); err != nil { return extractErrno(err) } return nil @@ -847,7 +868,7 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error { return syscall.EBADF } - if err := unix.Unlinkat(l.controlFD(), name, int(flags)); err != nil { + if err := unix.Unlinkat(l.fd(), name, int(flags)); err != nil { return extractErrno(err) } return nil @@ -858,7 +879,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { return nil, syscall.EBADF } - if l.openedFile == nil { + if !l.isOpen() { return nil, syscall.EBADF } @@ -866,11 +887,11 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { // reading all directory contents. Take a lock because this operation is // stateful. 
l.readDirMu.Lock() - if _, err := l.openedFile.Seek(0, 0); err != nil { + if _, err := l.file.Seek(0, 0); err != nil { l.readDirMu.Unlock() return nil, extractErrno(err) } - names, err := l.openedFile.Readdirnames(-1) + names, err := l.file.Readdirnames(-1) if err != nil { l.readDirMu.Unlock() return nil, extractErrno(err) @@ -879,7 +900,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { var dirents []p9.Dirent for i := int(offset); i >= 0 && i < len(names); i++ { - stat, err := statAt(l.openedFD(), names[i]) + stat, err := statAt(l.fd(), names[i]) if err != nil { continue } @@ -897,9 +918,10 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { // Readlink implements p9.File. func (l *localFile) Readlink() (string, error) { // Shamelessly stolen from os.Readlink (added upper bound limit to buffer). - for len := 128; len < 1024*1024; len *= 2 { + const limit = 1024 * 1024 + for len := 128; len < limit; len *= 2 { b := make([]byte, len) - n, err := unix.Readlinkat(l.controlFD(), "", b) + n, err := unix.Readlinkat(l.fd(), "", b) if err != nil { return "", extractErrno(err) } @@ -922,20 +944,16 @@ func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { // Close implements p9.File. func (l *localFile) Close() error { - err := l.controlFile.Close() - - // Close only once in case opened and control files point to - // the same os.File struct. - if l.openedFile != nil && l.openedFile != l.controlFile { - err = l.openedFile.Close() - } - - l.openedFile = nil - l.controlFile = nil l.mode = invalidMode + err := l.file.Close() + l.file = nil return err } +func (l *localFile) isOpen() bool { + return l.mode != invalidMode +} + // Renamed implements p9.Renamed. func (l *localFile) Renamed(newDir p9.File, newName string) { l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 47b5380dc..e74df7ede 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -42,6 +42,48 @@ func assertPanic(t *testing.T, f func()) { f() } +func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { + want := make([]byte, len(content)) + copy(want, content) + + b := []byte("test-1-2-3") + w, err := f.WriteAt(b, uint64(len(content))) + if flags == p9.WriteOnly || flags == p9.ReadWrite { + if err != nil { + return fmt.Errorf("WriteAt(): %v", err) + } + if w != len(b) { + return fmt.Errorf("WriteAt() was partial, got: %d, want: %d", w, len(b)) + } + want = append(want, b...) 
+ } else { + if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF { + return fmt.Errorf("WriteAt() should have failed, got: %d, want: EBADFD", err) + } + } + + rBuf := make([]byte, len(want)) + r, err := f.ReadAt(rBuf, 0) + if flags == p9.ReadOnly || flags == p9.ReadWrite { + if err != nil { + return fmt.Errorf("ReadAt(): %v", err) + } + if r != len(rBuf) { + return fmt.Errorf("ReadAt() was partial, got: %d, want: %d", r, len(rBuf)) + } + if string(rBuf) != string(want) { + return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want) + } + } else { + if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF { + return fmt.Errorf("ReadAt() should have failed, got: %d, want: EBADFD", err) + } + } + return nil +} + +var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} + var ( allTypes = []fileType{regular, directory, symlink} @@ -160,61 +202,24 @@ func TestReadWrite(t *testing.T) { t.Fatalf("%v: createFile() failed, err: %v", s, err) } defer child.Close() - b := []byte("foobar") - w, err := child.WriteAt(b, 0) + want := []byte("foobar") + w, err := child.WriteAt(want, 0) if err != nil { t.Fatalf("%v: Write() failed, err: %v", s, err) } - if w != len(b) { - t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(b)) - } - for _, test := range []struct { - flags p9.OpenFlags - read bool - write bool - }{ - {flags: p9.ReadOnly, read: true, write: false}, - {flags: p9.WriteOnly, read: false, write: true}, - {flags: p9.ReadWrite, read: true, write: true}, - } { + if w != len(want) { + t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(want)) + } + for _, flags := range allOpenFlags { _, l, err := s.file.Walk([]string{"test"}) if err != nil { t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err) } - if _, _, _, err := l.Open(test.flags); err != nil { - t.Fatalf("%v: Open(%v) failed, err: %v", s, test.flags, err) - } - - w, err = l.WriteAt(b, 0) - if test.write { - if err != nil { - t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) - } - if w != len(b) { - t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b)) - } - } else { - if err == nil { - t.Fatalf("%v, %v: WriteAt() should have failed", s, test.flags) - } + if _, _, _, err := l.Open(flags); err != nil { + t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err) } - - rBuf := make([]byte, len(b)) - r, err := l.ReadAt(rBuf, 0) - if test.read { - if err != nil { - t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err) - } - if r != len(rBuf) { - t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf)) - } - if string(rBuf) != "foobar" { - t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar") - } - } else { - if err == nil { - t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags) - } + if err := testReadWrite(l, flags, want); err != nil { + t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err) } } }) @@ -222,42 +227,57 @@ func TestReadWrite(t *testing.T) { func TestCreate(t *testing.T) { runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { - for i, test := range []struct { - flags p9.OpenFlags - read bool - }{ - {flags: p9.WriteOnly, read: false}, - {flags: p9.ReadWrite, read: true}, - } { - _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), test.flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + for i, flags := range allOpenFlags { + _, l, _, _, err := 
s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { - t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, flags, err) } - b := []byte("foobar") - w, err := l.WriteAt(b, 0) + if err := testReadWrite(l, flags, []byte{}); err != nil { + t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err) + } + } + }) +} + +// TestReadWriteDup tests that a file opened in any mode can be dup'ed and +// reopened in any other mode. +func TestReadWriteDup(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + child, err := createFile(s.file, "test") + if err != nil { + t.Fatalf("%v: createFile() failed, err: %v", s, err) + } + defer child.Close() + want := []byte("foobar") + w, err := child.WriteAt(want, 0) + if err != nil { + t.Fatalf("%v: Write() failed, err: %v", s, err) + } + if w != len(want) { + t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(want)) + } + for _, flags := range allOpenFlags { + _, l, err := s.file.Walk([]string{"test"}) if err != nil { - t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) + t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err) } - if w != len(b) { - t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b)) + defer l.Close() + if _, _, _, err := l.Open(flags); err != nil { + t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err) } - - rBuf := make([]byte, len(b)) - r, err := l.ReadAt(rBuf, 0) - if test.read { + for _, dupFlags := range allOpenFlags { + t.Logf("Original flags: %v, dup flags: %v", flags, dupFlags) + _, dup, err := l.Walk([]string{}) if err != nil { - t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err) - } - if r != len(rBuf) { - t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf)) + t.Fatalf("%v: Walk() failed: %v", s, err) } - if string(rBuf) != "foobar" { - t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar") + defer dup.Close() + if _, _, _, err := dup.Open(dupFlags); err != nil { + t.Fatalf("%v: Open(%v) failed: %v", s, flags, err) } - } else { - if err == nil { - t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags) + if err := testReadWrite(dup, dupFlags, want); err != nil { + t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err) } } } -- cgit v1.2.3 From a40ee4f4b8a6874157759723583d6489bbac7f23 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 2 Apr 2019 11:26:47 -0700 Subject: Change bug number for duplicate bug. PiperOrigin-RevId: 241567897 Change-Id: I580eac04f52bb15f4aab7df9822c4aa92e743021 --- runsc/boot/controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 14e1eba5b..ed78e8070 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -232,7 +232,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { } // Prevent CIDs containing ".." from confusing the sentry when creating // /containers/ directory. - // TODO: Once we have multiple independant roots, this + // TODO: Once we have multiple independent roots, this // check won't be necessary. 
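// As a concrete illustration (example values are mine, not from the change):
// path.Clean collapses "." and ".." elements and redundant separators, so a
// CID such as "a/../../etc" cleans to "../etc" and fails the equality test
// below, while an ordinary ID like "my-container" comes back unchanged and
// passes. A minimal sketch of the same check as a standalone helper
// (hypothetical name, shown only for illustration):
//
//	func isCleanCID(cid string) bool {
//		return path.Clean(cid) == cid
//	}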
if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) -- cgit v1.2.3 From f9431fb20f24834dd1d5c450631bdfa04fe042f4 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 2 Apr 2019 17:26:03 -0700 Subject: Remove obsolete TODO. PiperOrigin-RevId: 241637164 Change-Id: I65476a739cf38f1818dc47f6ce60638dec8b77a8 --- runsc/boot/controller.go | 1 - 1 file changed, 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ed78e8070..2488981f9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -193,7 +193,6 @@ type StartArgs struct { // Spec is the spec of the container to start. Spec *specs.Spec - // TODO: Separate sandbox and container configs. // Config is the runsc-specific configuration for the sandbox. Conf *Config -- cgit v1.2.3 From 88409e983c463b6d9c8085e7fdbe7ff45b3c5184 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 4 Apr 2019 17:42:51 -0700 Subject: gvisor: Add support for the MS_NOEXEC mount option https://github.com/google/gvisor/issues/145 PiperOrigin-RevId: 242044115 Change-Id: I8f140fe05e32ecd438b6be218e224e4b7fe05878 --- pkg/sentry/fs/context.go | 5 +++++ pkg/sentry/fs/filesystems.go | 4 ++++ pkg/sentry/fs/proc/mounts.go | 3 +++ pkg/sentry/syscalls/linux/sys_mount.go | 5 ++++- runsc/boot/fs.go | 2 ++ runsc/specutils/fs.go | 5 ++--- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/mount.cc | 18 ++++++++++++++++++ 8 files changed, 39 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index 1775d3486..c0e6075e4 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -46,6 +46,11 @@ func ContextCanAccessFile(ctx context.Context, inode *Inode, reqPerms PermMask) p = uattr.Perms.Group } + // Do not allow programs to be executed if MS_NOEXEC is set. + if IsFile(inode.StableAttr) && reqPerms.Execute && inode.MountSource.Flags.NoExec { + return false + } + // Are permissions satisfied without capability checks? if p.SupersetOf(reqPerms) { return true diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index aa664b973..a6b27c402 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -140,6 +140,10 @@ type MountSourceFlags struct { // cache, even when the platform supports direct mapped I/O. This // doesn't correspond to any Linux mount options. ForcePageCache bool + + // NoExec corresponds to mount(2)'s "MS_NOEXEC" and indicates that + // binaries from this file system can't be executed. + NoExec bool } // GenericMountSourceOptions splits a string containing comma separated tokens of the diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 7111e5c0f..1e62af8c6 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -129,6 +129,9 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se if m.Flags.NoAtime { opts += ",noatime" } + if m.Flags.NoExec { + opts += ",noexec" + } fmt.Fprintf(&buf, "%s ", opts) // (7) Optional fields: zero or more fields of the form "tag[:value]". 
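// Taken together, the hunks in this change wire MS_NOEXEC end to end: the
// flag is translated into MountSourceFlags.NoExec when the mount is created,
// reported as ",noexec" in mountinfo above, and enforced whenever execute
// permission is requested. A condensed sketch of the enforcement step,
// paraphrasing the fs/context.go hunk earlier in this patch (no new names):
//
//	if IsFile(inode.StableAttr) && reqPerms.Execute && inode.MountSource.Flags.NoExec {
//		return false // execve() of a binary on a noexec mount is denied (EACCES)
//	}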
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 6b8d75d24..e110a553f 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -75,7 +75,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Silently allow MS_NOSUID, since we don't implement set-id bits // anyway. - const unsupportedFlags = linux.MS_NODEV | linux.MS_NOEXEC | + const unsupportedFlags = linux.MS_NODEV | linux.MS_NODIRATIME | linux.MS_STRICTATIME // Linux just allows passing any flags to mount(2) - it won't fail when @@ -100,6 +100,9 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if flags&linux.MS_RDONLY == linux.MS_RDONLY { superFlags.ReadOnly = true } + if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { + superFlags.NoExec = true + } rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil) if err != nil { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 25e23c09b..8dfb6dce6 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -482,6 +482,8 @@ func mountFlags(opts []string) fs.MountSourceFlags { mf.ReadOnly = true case "noatime": mf.NoAtime = true + case "noexec": + mf.NoExec = true default: log.Warningf("ignoring unknown mount option %q", o) } diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go index aa17d4eb9..98c3b19c0 100644 --- a/runsc/specutils/fs.go +++ b/runsc/specutils/fs.go @@ -39,6 +39,7 @@ var optionsMap = map[string]mapping{ "diratime": {set: false, val: syscall.MS_NODIRATIME}, "dirsync": {set: true, val: syscall.MS_DIRSYNC}, "exec": {set: false, val: syscall.MS_NOEXEC}, + "noexec": {set: true, val: syscall.MS_NOEXEC}, "iversion": {set: true, val: syscall.MS_I_VERSION}, "loud": {set: false, val: syscall.MS_SILENT}, "mand": {set: true, val: syscall.MS_MANDLOCK}, @@ -76,9 +77,7 @@ var propOptionsMap = map[string]mapping{ // invalidOptions list options not allowed. // - shared: sandbox must be isolated from the host. Propagating mount changes // from the sandbox to the host breaks the isolation. -// - noexec: not yet supported. Don't ignore it since it could break -// in-sandbox security. -var invalidOptions = []string{"shared", "rshared", "noexec"} +var invalidOptions = []string{"shared", "rshared"} // OptionsToFlags converts mount options to syscall flags. 
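// For example (illustrative, based on the mapping tables above): passing
// "noexec" sets syscall.MS_NOEXEC in the returned mask and passing "exec"
// clears it; with this change "noexec" is no longer listed in invalidOptions,
// so it can now reach this translation at all.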
func OptionsToFlags(opts []string) uint32 { diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 1e386193b..38faba267 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1080,6 +1080,7 @@ cc_binary( "//test/util:file_descriptor", "//test/util:fs_util", "//test/util:mount_util", + "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index 6bb4287a3..201b83e87 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -31,6 +31,7 @@ #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" #include "test/util/mount_util.h" +#include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -277,6 +278,23 @@ TEST(MountTest, MountNoAtime) { EXPECT_EQ(before, after); } +TEST(MountTest, MountNoExec) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", MS_NOEXEC, "mode=0777", 0)); + + std::string const contents = "No no no, don't follow the instructions!"; + auto const file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), contents, 0777)); + + int execve_errno; + ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {}, {}, nullptr, &execve_errno)); + EXPECT_EQ(execve_errno, EACCES); +} + TEST(MountTest, RenameRemoveMountPoint) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); -- cgit v1.2.3 From eaac2806ffadbb3db6317e58c61b855b1350f0aa Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Tue, 9 Apr 2019 11:22:28 -0700 Subject: Add TCP checksum verification. 
PiperOrigin-RevId: 242704699 Change-Id: I87db368ca343b3b4bf4f969b17d3aa4ce2f8bd4f --- pkg/sentry/socket/epsocket/epsocket.go | 1 + pkg/tcpip/header/tcp.go | 68 ++++++++++++++++----------------- pkg/tcpip/link/fdbased/endpoint.go | 18 ++++++--- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/link/muxed/injectable.go | 2 +- pkg/tcpip/link/muxed/injectable_test.go | 4 +- pkg/tcpip/stack/registration.go | 10 ++++- pkg/tcpip/tcpip.go | 3 ++ pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 9 ++++- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/segment.go | 31 +++++++++++++-- pkg/tcpip/transport/tcp/tcp_test.go | 29 +++++++++++++- pkg/tcpip/transport/udp/endpoint.go | 2 +- runsc/boot/network.go | 1 + 16 files changed, 129 insertions(+), 57 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index e170da169..5bcafad98 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -154,6 +154,7 @@ var Metrics = tcpip.Stats{ SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), + ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 6e3ee2e50..e656ebb15 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -22,16 +22,17 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum" ) +// These constants are the offsets of the respective fields in the TCP header. const ( - srcPort = 0 - dstPort = 2 - seqNum = 4 - ackNum = 8 - dataOffset = 12 - tcpFlags = 13 - winSize = 14 - tcpChecksum = 16 - urgentPtr = 18 + TCPSrcPortOffset = 0 + TCPDstPortOffset = 2 + TCPSeqNumOffset = 4 + TCPAckNumOffset = 8 + TCPDataOffset = 12 + TCPFlagsOffset = 13 + TCPWinSizeOffset = 14 + TCPChecksumOffset = 16 + TCPUrgentPtrOffset = 18 ) const ( @@ -179,27 +180,27 @@ const ( // SourcePort returns the "source port" field of the tcp header. func (b TCP) SourcePort() uint16 { - return binary.BigEndian.Uint16(b[srcPort:]) + return binary.BigEndian.Uint16(b[TCPSrcPortOffset:]) } // DestinationPort returns the "destination port" field of the tcp header. func (b TCP) DestinationPort() uint16 { - return binary.BigEndian.Uint16(b[dstPort:]) + return binary.BigEndian.Uint16(b[TCPDstPortOffset:]) } // SequenceNumber returns the "sequence number" field of the tcp header. func (b TCP) SequenceNumber() uint32 { - return binary.BigEndian.Uint32(b[seqNum:]) + return binary.BigEndian.Uint32(b[TCPSeqNumOffset:]) } // AckNumber returns the "ack number" field of the tcp header. func (b TCP) AckNumber() uint32 { - return binary.BigEndian.Uint32(b[ackNum:]) + return binary.BigEndian.Uint32(b[TCPAckNumOffset:]) } // DataOffset returns the "data offset" field of the tcp header. func (b TCP) DataOffset() uint8 { - return (b[dataOffset] >> 4) * 4 + return (b[TCPDataOffset] >> 4) * 4 } // Payload returns the data in the tcp packet. 
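// Making these offsets exported constants (rather than a mix of unexported
// values plus the TCPChecksumOffset() helper removed below) lets code outside
// the header package index into raw TCP headers directly. Two uses that
// appear later in this same patch:
//
//	gso.CsumOffset = header.TCPChecksumOffset
//	tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4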
@@ -209,32 +210,32 @@ func (b TCP) Payload() []byte { // Flags returns the flags field of the tcp header. func (b TCP) Flags() uint8 { - return b[tcpFlags] + return b[TCPFlagsOffset] } // WindowSize returns the "window size" field of the tcp header. func (b TCP) WindowSize() uint16 { - return binary.BigEndian.Uint16(b[winSize:]) + return binary.BigEndian.Uint16(b[TCPWinSizeOffset:]) } // Checksum returns the "checksum" field of the tcp header. func (b TCP) Checksum() uint16 { - return binary.BigEndian.Uint16(b[tcpChecksum:]) + return binary.BigEndian.Uint16(b[TCPChecksumOffset:]) } // SetSourcePort sets the "source port" field of the tcp header. func (b TCP) SetSourcePort(port uint16) { - binary.BigEndian.PutUint16(b[srcPort:], port) + binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], port) } // SetDestinationPort sets the "destination port" field of the tcp header. func (b TCP) SetDestinationPort(port uint16) { - binary.BigEndian.PutUint16(b[dstPort:], port) + binary.BigEndian.PutUint16(b[TCPDstPortOffset:], port) } // SetChecksum sets the checksum field of the tcp header. func (b TCP) SetChecksum(checksum uint16) { - binary.BigEndian.PutUint16(b[tcpChecksum:], checksum) + binary.BigEndian.PutUint16(b[TCPChecksumOffset:], checksum) } // CalculateChecksum calculates the checksum of the tcp segment. @@ -258,20 +259,20 @@ func (b TCP) ParsedOptions() TCPOptions { } func (b TCP) encodeSubset(seq, ack uint32, flags uint8, rcvwnd uint16) { - binary.BigEndian.PutUint32(b[seqNum:], seq) - binary.BigEndian.PutUint32(b[ackNum:], ack) - b[tcpFlags] = flags - binary.BigEndian.PutUint16(b[winSize:], rcvwnd) + binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seq) + binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ack) + b[TCPFlagsOffset] = flags + binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd) } // Encode encodes all the fields of the tcp header. func (b TCP) Encode(t *TCPFields) { b.encodeSubset(t.SeqNum, t.AckNum, t.Flags, t.WindowSize) - binary.BigEndian.PutUint16(b[srcPort:], t.SrcPort) - binary.BigEndian.PutUint16(b[dstPort:], t.DstPort) - b[dataOffset] = (t.DataOffset / 4) << 4 - binary.BigEndian.PutUint16(b[tcpChecksum:], t.Checksum) - binary.BigEndian.PutUint16(b[urgentPtr:], t.UrgentPointer) + binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], t.SrcPort) + binary.BigEndian.PutUint16(b[TCPDstPortOffset:], t.DstPort) + b[TCPDataOffset] = (t.DataOffset / 4) << 4 + binary.BigEndian.PutUint16(b[TCPChecksumOffset:], t.Checksum) + binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], t.UrgentPointer) } // EncodePartial updates a subset of the fields of the tcp header. It is useful @@ -290,18 +291,13 @@ func (b TCP) EncodePartial(partialChecksum, length uint16, seqnum, acknum uint32 b.encodeSubset(seqnum, acknum, flags, rcvwnd) // Add the contributions of the passed-in fields to the checksum. - checksum = Checksum(b[seqNum:seqNum+8], checksum) - checksum = Checksum(b[winSize:winSize+2], checksum) + checksum = Checksum(b[TCPSeqNumOffset:TCPSeqNumOffset+8], checksum) + checksum = Checksum(b[TCPWinSizeOffset:TCPWinSizeOffset+2], checksum) // Encode the checksum. b.SetChecksum(^checksum) } -// TCPChecksumOffset returns offset of the checksum field. -func TCPChecksumOffset() uint16 { - return tcpChecksum -} - // ParseSynOptions parses the options received in a SYN segment and returns the // relevant ones. opts should point to the option part of the TCP Header. 
func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions { diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 20e34c5ee..84439a9ed 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -122,13 +122,14 @@ type Options struct { FD int MTU uint32 EthernetHeader bool - ChecksumOffload bool ClosedFunc func(*tcpip.Error) Address tcpip.LinkAddress SaveRestore bool DisconnectOk bool GSOMaxSize uint32 PacketDispatchMode PacketDispatchMode + TXChecksumOffload bool + RXChecksumOffload bool } // New creates a new fd-based endpoint. @@ -142,8 +143,12 @@ func New(opts *Options) tcpip.LinkEndpointID { } caps := stack.LinkEndpointCapabilities(0) - if opts.ChecksumOffload { - caps |= stack.CapabilityChecksumOffload + if opts.RXChecksumOffload { + caps |= stack.CapabilityRXChecksumOffload + } + + if opts.TXChecksumOffload { + caps |= stack.CapabilityTXChecksumOffload } hdrSize := 0 @@ -527,12 +532,13 @@ func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buf } // NewInjectable creates a new fd-based InjectableEndpoint. -func NewInjectable(fd int, mtu uint32) (tcpip.LinkEndpointID, *InjectableEndpoint) { +func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (tcpip.LinkEndpointID, *InjectableEndpoint) { syscall.SetNonblock(fd, true) e := &InjectableEndpoint{endpoint: endpoint{ - fd: fd, - mtu: mtu, + fd: fd, + mtu: mtu, + caps: capabilities, }} return stack.RegisterLinkEndpoint(e), e diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index d58c0f885..2dc4bcfda 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -56,7 +56,7 @@ func (*endpoint) MTU() uint32 { // Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises // itself as supporting checksum offload, but in reality it's just omitted. func (*endpoint) Capabilities() stack.LinkEndpointCapabilities { - return stack.CapabilityChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback + return stack.CapabilityRXChecksumOffload | stack.CapabilityTXChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that the diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index 99edc232d..b3e71c7fc 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -105,7 +105,7 @@ func (m *InjectableEndpoint) WriteRawPacket(dest tcpip.Address, packet []byte) * } // NewInjectableEndpoint creates a new multi-endpoint injectable endpoint. 
-func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint, mtu uint32) (tcpip.LinkEndpointID, *InjectableEndpoint) { +func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) (tcpip.LinkEndpointID, *InjectableEndpoint) { e := &InjectableEndpoint{ routes: routes, } diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go index 7d25effad..031449a05 100644 --- a/pkg/tcpip/link/muxed/injectable_test.go +++ b/pkg/tcpip/link/muxed/injectable_test.go @@ -87,8 +87,8 @@ func makeTestInjectableEndpoint(t *testing.T) (*InjectableEndpoint, *os.File, tc if err != nil { t.Fatal("Failed to create socket pair:", err) } - _, underlyingEndpoint := fdbased.NewInjectable(pair[1], 6500) + _, underlyingEndpoint := fdbased.NewInjectable(pair[1], 6500, stack.CapabilityNone) routes := map[tcpip.Address]stack.InjectableLinkEndpoint{dstIP: underlyingEndpoint} - _, endpoint := NewInjectableEndpoint(routes, 6500) + _, endpoint := NewInjectableEndpoint(routes) return endpoint, os.NewFile(uintptr(pair[0]), "test route end"), dstIP } diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index f3cc849ec..6e1660051 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -232,7 +232,15 @@ type LinkEndpointCapabilities uint // The following are the supported link endpoint capabilities. const ( - CapabilityChecksumOffload LinkEndpointCapabilities = 1 << iota + CapabilityNone LinkEndpointCapabilities = 0 + // CapabilityTXChecksumOffload indicates that the link endpoint supports + // checksum computation for outgoing packets and the stack can skip + // computing checksums when sending packets. + CapabilityTXChecksumOffload LinkEndpointCapabilities = 1 << iota + // CapabilityRXChecksumOffload indicates that the link endpoint supports + // checksum verification on received packets and that it's safe for the + // stack to skip checksum verification. + CapabilityRXChecksumOffload CapabilityResolutionRequired CapabilitySaveRestore CapabilityDisconnectOk diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index e9f73635f..e898dcbca 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -801,6 +801,9 @@ type TCPStats struct { // Timeouts is the number of times the RTO expired. Timeouts *StatCounter + + // ChecksumErrors is the number of segments dropped due to bad checksums. + ChecksumErrors *StatCounter } // UDPStats collects UDP-specific stats. diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 056e0b09a..6c4a4d95e 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -595,7 +595,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise // TCP header, then the kernel calculate a checksum of the // header and data and get the right sum of the TCP packet. 
tcp.SetChecksum(xsum) - } else if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { + } else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { xsum = header.ChecksumVV(data, xsum) tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 41c87cc7e..b5d05af7d 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1447,6 +1447,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv return } + if !s.csumValid { + e.stack.Stats().MalformedRcvdPackets.Increment() + e.stack.Stats().TCP.ChecksumErrors.Increment() + s.decRef() + return + } + e.stack.Stats().TCP.ValidSegmentsReceived.Increment() if (s.flags & header.TCPFlagRst) != 0 { e.stack.Stats().TCP.ResetsReceived.Increment() @@ -1721,7 +1728,7 @@ func (e *endpoint) initGSO() { panic(fmt.Sprintf("Unknown netProto: %v", e.netProto)) } gso.NeedsCsum = true - gso.CsumOffset = header.TCPChecksumOffset() + gso.CsumOffset = header.TCPChecksumOffset gso.MaxSize = e.route.GSOMaxSize() e.gso = gso } diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 7a6589cfd..6a7efaf1d 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -68,7 +68,7 @@ func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, n defer s.decRef() // We only care about well-formed SYN packets. - if !s.parse() || s.flags != header.TCPFlagSyn { + if !s.parse() || !s.csumValid || s.flags != header.TCPFlagSyn { return false } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 230668b5d..b5fb160bc 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -130,7 +130,7 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo s := newSegment(r, id, vv) defer s.decRef() - if !s.parse() { + if !s.parse() || !s.csumValid { return false } diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index df8402bf9..c603fe713 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -45,6 +45,10 @@ type segment struct { ackNumber seqnum.Value flags uint8 window seqnum.Size + // csum is only populated for received segments. + csum uint16 + // csumValid is true if the csum in the received segment is valid. + csumValid bool // parsedOptions stores the parsed values from the options in the segment. parsedOptions header.TCPOptions @@ -124,7 +128,13 @@ func (s *segment) logicalLen() seqnum.Size { // parse populates the sequence & ack numbers, flags, and window fields of the // segment from the TCP header stored in the data. It then updates the view to -// skip the data. Returns boolean indicating if the parsing was successful. +// skip the header. +// +// Returns boolean indicating if the parsing was successful. +// +// If checksum verification is not offloaded then parse also verifies the +// TCP checksum and stores the checksum and result of checksum verification in +// the csum and csumValid fields of the segment. func (s *segment) parse() bool { h := header.TCP(s.data.First()) @@ -145,12 +155,27 @@ func (s *segment) parse() bool { s.options = []byte(h[header.TCPMinimumSize:offset]) s.parsedOptions = header.ParseTCPOptions(s.options) - s.data.TrimFront(offset) + + // Query the link capabilities to decide if checksum validation is + // required. 
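// (Explanatory note: one's-complement checksum verification folds the
// pseudo-header, the TCP header including its transmitted checksum field,
// and the payload together; for an undamaged segment the running sum folds
// to 0xffff, which is exactly what the csumValid assignment below checks.
// Segments that fail the check are counted in Stats().TCP.ChecksumErrors and
// dropped by HandlePacket.)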
+ verifyChecksum := true + if s.route.Capabilities()&stack.CapabilityRXChecksumOffload != 0 { + s.csumValid = true + verifyChecksum = false + s.data.TrimFront(offset) + } + if verifyChecksum { + s.csum = h.Checksum() + xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size())) + xsum = h.CalculateChecksum(xsum) + s.data.TrimFront(offset) + xsum = header.ChecksumVV(s.data, xsum) + s.csumValid = xsum == 0xffff + } s.sequenceNumber = seqnum.Value(h.SequenceNumber()) s.ackNumber = seqnum.Value(h.AckNumber()) s.flags = h.Flags() s.window = seqnum.Size(h.WindowSize()) - return true } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 7f2615ca9..af50ac8af 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2963,8 +2963,7 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) { RcvWnd: 30000, }) tcpbuf := vv.First()[header.IPv4MinimumSize:] - // 12 is the TCP header data offset. - tcpbuf[12] = ((header.TCPMinimumSize - 1) / 4) << 4 + tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4 c.SendSegment(vv) @@ -2973,6 +2972,32 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) { } } +func TestReceivedIncorrectChecksumIncrement(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + c.CreateConnected(789, 30000, nil) + stats := c.Stack().Stats() + want := stats.TCP.ChecksumErrors.Value() + 1 + vv := c.BuildSegment([]byte{0x1, 0x2, 0x3}, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: seqnum.Value(790), + AckNum: c.IRS.Add(1), + RcvWnd: 30000, + }) + tcpbuf := vv.First()[header.IPv4MinimumSize:] + // Overwrite a byte in the payload which should cause checksum + // verification to fail. + tcpbuf[(tcpbuf[header.TCPDataOffset]>>4)*4] = 0x4 + + c.SendSegment(vv) + + if got := stats.TCP.ChecksumErrors.Value(); got != want { + t.Errorf("got stats.TCP.ChecksumErrors.Value() = %d, want = %d", got, want) + } +} + func TestReceivedSegmentQueuing(t *testing.T) { // This test sends 200 segments containing a few bytes each to an // endpoint and checks that they're all received and acknowledged by diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 19e532180..1f9251de3 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -640,7 +640,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u }) // Only calculate the checksum if offloading isn't supported. 
- if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { + if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) for _, v := range data.Views() { xsum = header.Checksum(v, xsum) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 77291415b..3915a021f 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -142,6 +142,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct Address: mac, PacketDispatchMode: fdbased.PacketMMap, GSOMaxSize: link.GSOMaxSize, + RXChecksumOffload: true, }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) -- cgit v1.2.3 From 93b3c9b76c16104cbb5cc55b6f2339cb43c356b5 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 9 Apr 2019 11:30:35 -0700 Subject: runsc: set UID and GID if gofer is executed in a new user namespace Otherwise, we will not have capabilities in the user namespace. And this patch adds the noexec option for mounts. https://github.com/google/gvisor/issues/145 PiperOrigin-RevId: 242706519 Change-Id: I1b78b77d6969bd18038c71616e8eb7111b71207c --- runsc/container/container.go | 7 +++- runsc/container/container_test.go | 76 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index 687b89935..cc0c1ee25 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -866,8 +866,13 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the // users in the sandbox. - nss = append(nss, specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)...) + userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + nss = append(nss, userNS...) specutils.SetUIDGIDMappings(cmd, spec) + if len(userNS) != 0 { + // We need to set UID and GID to have capabilities in a new user namespace. + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + } // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index f17155175..9fe584aa3 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1250,6 +1250,82 @@ func TestReadonlyRoot(t *testing.T) { } } +func TestUIDMap(t *testing.T) { + for _, conf := range configs(noOverlay...) 
{ + t.Logf("Running test with conf: %+v", conf) + testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(testDir) + testFile := path.Join(testDir, "testfile") + + spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile") + uid := os.Getuid() + gid := os.Getgid() + spec.Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + {Type: specs.UserNamespace}, + {Type: specs.PIDNamespace}, + {Type: specs.MountNamespace}, + }, + UIDMappings: []specs.LinuxIDMapping{ + { + ContainerID: 0, + HostID: uint32(uid), + Size: 1, + }, + }, + GIDMappings: []specs.LinuxIDMapping{ + { + ContainerID: 0, + HostID: uint32(gid), + Size: 1, + }, + }, + } + + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: "/tmp", + Source: testDir, + Type: "bind", + }) + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create, start and wait for the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("container failed, waitStatus: %v", ws) + } + st := syscall.Stat_t{} + if err := syscall.Stat(testFile, &st); err != nil { + t.Fatalf("error stat /testfile: %v", err) + } + + if st.Uid != uint32(uid) || st.Gid != uint32(gid) { + t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid) + } + } +} + func TestReadonlyMount(t *testing.T) { for _, conf := range configs(overlay) { t.Logf("Running test with conf: %+v", conf) -- cgit v1.2.3 From 546a1df7d15fd80f510d4203c5f9255bba4b4211 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 11 Apr 2019 17:53:24 -0700 Subject: Add 'runsc do' command It provides an easy way to run commands to quickly test gVisor. By default it maps the host root as the container root with a writable overlay on top (so the host root is not modified). Example: sudo runsc do ls -lh --color sudo runsc do ~/src/test/my-test.sh PiperOrigin-RevId: 243178711 Change-Id: I05f3d6ce253fe4b5f1362f4a07b5387f6ddb5dd9 --- runsc/cmd/BUILD | 1 + runsc/cmd/do.go | 157 +++++++++++++++++++++++++++++++++++++++++++++++ runsc/main.go | 7 ++- runsc/sandbox/sandbox.go | 2 +- 4 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 runsc/cmd/do.go (limited to 'runsc') diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index dabf18c5f..b7551a5ab 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -13,6 +13,7 @@ go_library( "create.go", "debug.go", "delete.go", + "do.go", "events.go", "exec.go", "gofer.go", diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go new file mode 100644 index 000000000..343461130 --- /dev/null +++ b/runsc/cmd/do.go @@ -0,0 +1,157 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "math/rand" + "os" + "path/filepath" + "syscall" + + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Do implements subcommands.Command for the "do" command. It sets up a simple +// sandbox and executes the command inside it. See Usage() for more details. +type Do struct { + root string + cwd string +} + +// Name implements subcommands.Command.Name. +func (*Do) Name() string { + return "do" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Do) Synopsis() string { + return "Simplistic way to execute a command inside the sandbox. It's to be used for testing only." +} + +// Usage implements subcommands.Command.Usage. +func (*Do) Usage() string { + return `do [flags] - runs a command. + +This command starts a sandbox with host filesystem mounted inside as readonly, +with a writable tmpfs overlay on top of it. The given command is executed inside +the sandbox. It's to be used to quickly test applications without having to +install or run docker. It doesn't give nearly as many options and it's to be +used for testing only. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Do) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) + f.StringVar(&c.cwd, "cwd", ".", `path to the current directory, defaults to the current directory`) +} + +// Execute implements subcommands.Command.Execute. +func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if len(f.Args()) == 0 { + c.Usage() + return subcommands.ExitUsageError + } + + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + // Map the entire host file system, but make it readonly with a writable + // overlay on top (ignore --overlay option). 
+ conf.Overlay = true + + hostname, err := os.Hostname() + if err != nil { + Fatalf("Error to retrieve hostname: %v", err) + } + + absRoot, err := resolvePath(c.root) + if err != nil { + Fatalf("Error resolving root: %v", err) + } + absCwd, err := resolvePath(c.cwd) + if err != nil { + Fatalf("Error resolving current directory: %v", err) + } + + spec := &specs.Spec{ + Root: &specs.Root{ + Path: absRoot, + Readonly: true, + }, + Process: &specs.Process{ + Cwd: absCwd, + Args: f.Args(), + Env: os.Environ(), + Capabilities: specutils.AllCapabilities(), + }, + Hostname: hostname, + } + + specutils.LogSpec(spec) + + out, err := json.Marshal(spec) + if err != nil { + Fatalf("Error to marshal spec: %v", err) + } + tmpDir, err := ioutil.TempDir("", "runsc-do") + if err != nil { + Fatalf("Error to create tmp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + log.Infof("Changing configuration RootDir to %q", tmpDir) + conf.RootDir = tmpDir + + cfgPath := filepath.Join(tmpDir, "config.json") + if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil { + Fatalf("Error write spec: %v", err) + } + + // No network support yet. + conf.Network = boot.NetworkNone + + id := fmt.Sprintf("runcs-do-%06d", rand.Int31n(1000000)) + ws, err := container.Run(id, spec, conf, tmpDir, "", "", "") + if err != nil { + Fatalf("running container: %v", err) + } + + *waitStatus = ws + return subcommands.ExitSuccess +} + +func resolvePath(path string) (string, error) { + var err error + path, err = filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("resolving %q: %v", path, err) + } + path = filepath.Clean(path) + if err := syscall.Access(path, 0); err != nil { + return "", fmt.Errorf("unable to access %q: %v", path, err) + } + return path, nil +} diff --git a/runsc/main.go b/runsc/main.go index bbf08228c..74253a844 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "io" + "io/ioutil" "os" "path/filepath" "strings" @@ -80,6 +81,7 @@ func main() { subcommands.Register(new(cmd.Checkpoint), "") subcommands.Register(new(cmd.Create), "") subcommands.Register(new(cmd.Delete), "") + subcommands.Register(new(cmd.Do), "") subcommands.Register(new(cmd.Events), "") subcommands.Register(new(cmd.Exec), "") subcommands.Register(new(cmd.Gofer), "") @@ -168,6 +170,8 @@ func main() { log.SetLevel(log.Debug) } + subcommand := flag.CommandLine.Arg(0) + var logFile io.Writer = os.Stderr if *logFD > -1 { logFile = os.NewFile(uintptr(*logFD), "log file") @@ -180,11 +184,12 @@ func main() { cmd.Fatalf("error opening log file %q: %v", *logFilename, err) } logFile = f + } else if subcommand == "do" { + logFile = ioutil.Discard } e := newEmitter(*logFormat, logFile) - subcommand := flag.CommandLine.Arg(0) if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ae6375e13..92495c69e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -104,7 +104,7 @@ func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke // Wait until the sandbox has booted. 
b := make([]byte, 1) if l, err := clientSyncFile.Read(b); err != nil || l != 1 { - return nil, fmt.Errorf("reading from the start-sync descriptor: %v", err) + return nil, fmt.Errorf("waiting for sandbox to start: %v", err) } c.Release() -- cgit v1.2.3 From 9f8c89fc7fb7c4588713eb376fa56c4c3026d43c Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 17 Apr 2019 11:14:24 -0700 Subject: Return error from fdbased.New RELNOTES: n/a PiperOrigin-RevId: 244031742 Change-Id: Id0cdb73194018fb5979e67b58510ead19b5a2b81 --- pkg/tcpip/link/fdbased/endpoint.go | 57 +++++++++++++++++--------------- pkg/tcpip/link/fdbased/endpoint_test.go | 6 +++- pkg/tcpip/sample/tun_tcp_connect/main.go | 5 ++- pkg/tcpip/sample/tun_tcp_echo/main.go | 5 ++- runsc/boot/network.go | 5 ++- 5 files changed, 48 insertions(+), 30 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 84439a9ed..6354688e2 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -136,10 +136,9 @@ type Options struct { // // Makes fd non-blocking, but does not take ownership of fd, which must remain // open for the lifetime of the returned endpoint. -func New(opts *Options) tcpip.LinkEndpointID { +func New(opts *Options) (tcpip.LinkEndpointID, error) { if err := syscall.SetNonblock(opts.FD, true); err != nil { - // TODO : replace panic with an error return. - panic(fmt.Sprintf("syscall.SetNonblock(%v) failed: %v", opts.FD, err)) + return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err) } caps := stack.LinkEndpointCapabilities(0) @@ -175,27 +174,34 @@ func New(opts *Options) tcpip.LinkEndpointID { packetDispatchMode: opts.PacketDispatchMode, } - if opts.GSOMaxSize != 0 && isSocketFD(opts.FD) { - e.caps |= stack.CapabilityGSO - e.gsoMaxSize = opts.GSOMaxSize + // For non-socket FDs we read one packet a time (e.g. TAP devices). + msgsPerRecv := 1 + e.inboundDispatcher = e.dispatch + + isSocket, err := isSocketFD(opts.FD) + if err != nil { + return 0, err } - if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap { - if err := e.setupPacketRXRing(); err != nil { - // TODO: replace panic with an error return. - panic(fmt.Sprintf("e.setupPacketRXRing failed: %v", err)) + if isSocket { + if opts.GSOMaxSize != 0 { + e.caps |= stack.CapabilityGSO + e.gsoMaxSize = opts.GSOMaxSize } - e.inboundDispatcher = e.packetMMapDispatch - return stack.RegisterLinkEndpoint(e) - } - // For non-socket FDs we read one packet a time (e.g. TAP devices) - msgsPerRecv := 1 - e.inboundDispatcher = e.dispatch - // If the provided FD is a socket then we optimize packet reads by - // using recvmmsg() instead of read() to read packets in a batch. - if isSocketFD(opts.FD) && e.packetDispatchMode == RecvMMsg { - e.inboundDispatcher = e.recvMMsgDispatch - msgsPerRecv = MaxMsgsPerRecv + switch e.packetDispatchMode { + case PacketMMap: + if err := e.setupPacketRXRing(); err != nil { + return 0, fmt.Errorf("e.setupPacketRXRing failed: %v", err) + } + e.inboundDispatcher = e.packetMMapDispatch + return stack.RegisterLinkEndpoint(e), nil + + case RecvMMsg: + // If the provided FD is a socket then we optimize packet reads by + // using recvmmsg() instead of read() to read packets in a batch. 
+ e.inboundDispatcher = e.recvMMsgDispatch + msgsPerRecv = MaxMsgsPerRecv + } } e.views = make([][]buffer.View, msgsPerRecv) @@ -217,16 +223,15 @@ func New(opts *Options) tcpip.LinkEndpointID { e.msgHdrs[i].Msg.Iovlen = uint64(iovLen) } - return stack.RegisterLinkEndpoint(e) + return stack.RegisterLinkEndpoint(e), nil } -func isSocketFD(fd int) bool { +func isSocketFD(fd int) (bool, error) { var stat syscall.Stat_t if err := syscall.Fstat(fd, &stat); err != nil { - // TODO : replace panic with an error return. - panic(fmt.Sprintf("syscall.Fstat(%v,...) failed: %v", fd, err)) + return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err) } - return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK + return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK, nil } // Attach launches the goroutine that reads packets from the file descriptor and diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index ecc5b73f3..5a06c6387 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -68,7 +68,11 @@ func newContext(t *testing.T, opt *Options) *context { } opt.FD = fds[1] - ep := stack.FindLinkEndpoint(New(opt)).(*endpoint) + epID, err := New(opt) + if err != nil { + t.Fatalf("Failed to create FD endpoint: %v", err) + } + ep := stack.FindLinkEndpoint(epID).(*endpoint) c := &context{ t: t, diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 327a79f48..cf8900c4d 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -137,7 +137,10 @@ func main() { log.Fatal(err) } - linkID := fdbased.New(&fdbased.Options{FD: fd, MTU: mtu}) + linkID, err := fdbased.New(&fdbased.Options{FD: fd, MTU: mtu}) + if err != nil { + log.Fatal(err) + } if err := s.CreateNIC(1, sniffer.New(linkID)); err != nil { log.Fatal(err) } diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index b23dc13e7..da6202f97 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -128,12 +128,15 @@ func main() { log.Fatal(err) } - linkID := fdbased.New(&fdbased.Options{ + linkID, err := fdbased.New(&fdbased.Options{ FD: fd, MTU: mtu, EthernetHeader: *tap, Address: tcpip.LinkAddress(maddr), }) + if err != nil { + log.Fatal(err) + } if err := s.CreateNIC(1, linkID); err != nil { log.Fatal(err) } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 3915a021f..ceb00a858 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -135,7 +135,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } mac := tcpip.LinkAddress(generateRndMac()) - linkEP := fdbased.New(&fdbased.Options{ + linkEP, err := fdbased.New(&fdbased.Options{ FD: newFD, MTU: uint32(link.MTU), EthernetHeader: true, @@ -144,6 +144,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct GSOMaxSize: link.GSOMaxSize, RXChecksumOffload: true, }) + if err != nil { + return err + } log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { -- cgit v1.2.3 From c8cee7108f1a1b37e89961c6dd69ccab97952c86 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 17 Apr 2019 12:56:23 -0700 Subject: Use FD limit and file size limit from host FD limit and file size limit is read from the host, 
instead of using hard-coded defaults, given that they effect the sandbox process. Also limit the direct cache to use no more than half if the available FDs. PiperOrigin-RevId: 244050323 Change-Id: I787ad0fdf07c49d589e51aebfeae477324fe26e6 --- pkg/sentry/fs/BUILD | 1 + pkg/sentry/fs/context.go | 12 +++++ pkg/sentry/fs/dirent_cache.go | 43 ++++++++++++++--- pkg/sentry/fs/dirent_cache_limiter.go | 55 +++++++++++++++++++++ pkg/sentry/fs/dirent_cache_test.go | 90 +++++++++++++++++++++++++++++++++++ pkg/sentry/fs/gofer/session.go | 9 ++++ pkg/sentry/fs/mount.go | 20 ++++++-- pkg/sentry/fs/mount_overlay.go | 11 ++++- pkg/sentry/kernel/kernel.go | 9 ++++ pkg/sentry/kernel/task.go | 2 + runsc/boot/fs.go | 19 +++++++- runsc/boot/limits.go | 77 +++++++++++++++++++++++++++++- runsc/boot/loader.go | 4 ++ test/syscalls/linux/poll.cc | 9 ++++ 14 files changed, 347 insertions(+), 14 deletions(-) create mode 100644 pkg/sentry/fs/dirent_cache_limiter.go (limited to 'runsc') diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 1742d3a65..1fd9e30f6 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -12,6 +12,7 @@ go_library( "dentry.go", "dirent.go", "dirent_cache.go", + "dirent_cache_limiter.go", "dirent_list.go", "dirent_state.go", "event_list.go", diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index c0e6075e4..4869428a8 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -26,6 +26,9 @@ type contextID int const ( // CtxRoot is a Context.Value key for a Dirent. CtxRoot contextID = iota + + // CtxDirentCacheLimiter is a Context.Value key for DirentCacheLimiter. + CtxDirentCacheLimiter ) // ContextCanAccessFile determines whether `file` can be accessed in the requested way @@ -100,3 +103,12 @@ func RootFromContext(ctx context.Context) *Dirent { } return nil } + +// DirentCacheLimiterFromContext returns the DirentCacheLimiter used by ctx, or +// nil if ctx does not have a dirent cache limiter. +func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter { + if v := ctx.Value(CtxDirentCacheLimiter); v != nil { + return v.(*DirentCacheLimiter) + } + return nil +} diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 502b0a09b..d26a06971 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -32,6 +32,10 @@ type DirentCache struct { // when cache is nil. maxSize uint64 + // limit restricts the number of entries in the cache amoung multiple caches. + // It may be nil if there are no global limit for this cache. + limit *DirentCacheLimiter + // mu protects currentSize and direntList. mu sync.Mutex `state:"nosave"` @@ -45,8 +49,7 @@ type DirentCache struct { list direntList `state:"zerovalue"` } -// NewDirentCache returns a new DirentCache with the given maxSize. If maxSize -// is 0, nil is returned. +// NewDirentCache returns a new DirentCache with the given maxSize. func NewDirentCache(maxSize uint64) *DirentCache { return &DirentCache{ maxSize: maxSize, @@ -71,15 +74,24 @@ func (c *DirentCache) Add(d *Dirent) { return } + // First check against the global limit. + for c.limit != nil && !c.limit.tryInc() { + if c.currentSize == 0 { + // If the global limit is reached, but there is nothing more to drop from + // this cache, there is not much else to do. + c.mu.Unlock() + return + } + c.remove(c.list.Back()) + } + // d is not in cache. Add it and take a reference. c.list.PushFront(d) d.IncRef() c.currentSize++ - // Remove the oldest until we are under the size limit. 
- for c.maxSize > 0 && c.currentSize > c.maxSize { - c.remove(c.list.Back()) - } + c.maybeShrink() + c.mu.Unlock() } @@ -92,6 +104,9 @@ func (c *DirentCache) remove(d *Dirent) { d.SetNext(nil) d.DecRef() c.currentSize-- + if c.limit != nil { + c.limit.dec() + } } // Remove removes the element from the cache and decrements its refCount. It @@ -142,3 +157,19 @@ func (c *DirentCache) Invalidate() { } c.mu.Unlock() } + +// setMaxSize sets cache max size. If current size is larger than max size, the +// cache shrinks to acommodate the new max. +func (c *DirentCache) setMaxSize(max uint64) { + c.mu.Lock() + c.maxSize = max + c.maybeShrink() + c.mu.Unlock() +} + +// shrink removes the oldest element until the list is under the size limit. +func (c *DirentCache) maybeShrink() { + for c.maxSize > 0 && c.currentSize > c.maxSize { + c.remove(c.list.Back()) + } +} diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go new file mode 100644 index 000000000..024c7b2d5 --- /dev/null +++ b/pkg/sentry/fs/dirent_cache_limiter.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" +) + +// DirentCacheLimiter acts as a global limit for all dirent caches in the +// process. +// +// +stateify savable +type DirentCacheLimiter struct { + mu sync.Mutex `state:"nosave"` + max uint64 + count uint64 `state:"zerovalue"` +} + +// NewDirentCacheLimiter creates a new DirentCacheLimiter. +func NewDirentCacheLimiter(max uint64) *DirentCacheLimiter { + return &DirentCacheLimiter{max: max} +} + +func (d *DirentCacheLimiter) tryInc() bool { + d.mu.Lock() + if d.count >= d.max { + d.mu.Unlock() + return false + } + d.count++ + d.mu.Unlock() + return true +} + +func (d *DirentCacheLimiter) dec() { + d.mu.Lock() + if d.count == 0 { + panic(fmt.Sprintf("underflowing DirentCacheLimiter count: %+v", d)) + } + d.count-- + d.mu.Unlock() +} diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go index 5d0e9d91c..93e8d415f 100644 --- a/pkg/sentry/fs/dirent_cache_test.go +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -120,6 +120,96 @@ func TestDirentCache(t *testing.T) { } } +func TestDirentCacheLimiter(t *testing.T) { + const ( + globalMaxSize = 5 + maxSize = 3 + ) + + limit := NewDirentCacheLimiter(globalMaxSize) + c1 := NewDirentCache(maxSize) + c1.limit = limit + c2 := NewDirentCache(maxSize) + c2.limit = limit + + // Create a Dirent d. + d := NewNegativeDirent("") + + // Add d to the cache. + c1.Add(d) + if got, want := c1.Size(), uint64(1); got != want { + t.Errorf("c1.Size() got %v, want %v", got, want) + } + + // Add maxSize-1 more elements. d should be oldest element. + for i := 0; i < maxSize-1; i++ { + c1.Add(NewNegativeDirent("")) + } + if got, want := c1.Size(), uint64(maxSize); got != want { + t.Errorf("c1.Size() got %v, want %v", got, want) + } + + // Check that d is still there. 
+ if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Fill up the other cache, it will start dropping old entries from the cache + // when the global limit is reached. + for i := 0; i < maxSize; i++ { + c2.Add(NewNegativeDirent("")) + } + + // Check is what's remaining from global max. + if got, want := c2.Size(), globalMaxSize-maxSize; int(got) != want { + t.Errorf("c2.Size() got %v, want %v", got, want) + } + + // Check that d was not dropped. + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Add an entry that will eventually be dropped. Check is done later... + drop := NewNegativeDirent("") + c1.Add(drop) + + // Check that d is bumped to front even when global limit is reached. + c1.Add(d) + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Add 2 more element and check that: + // - d is still in the list: to verify that d was bumped + // - d2/d3 are in the list: older entries are dropped when global limit is + // reached. + // - drop is not in the list: indeed older elements are dropped. + d2 := NewNegativeDirent("") + c1.Add(d2) + d3 := NewNegativeDirent("") + c1.Add(d3) + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + if got, want := c1.contains(d2), true; got != want { + t.Errorf("c1.contains(d2) got %v want %v", got, want) + } + if got, want := c1.contains(d3), true; got != want { + t.Errorf("c1.contains(d3) got %v want %v", got, want) + } + if got, want := c1.contains(drop), false; got != want { + t.Errorf("c1.contains(drop) got %v want %v", got, want) + } + + // Drop all entries from one cache. The other will be allowed to grow. + c1.Invalidate() + c2.Add(NewNegativeDirent("")) + if got, want := c2.Size(), uint64(maxSize); got != want { + t.Errorf("c2.Size() got %v, want %v", got, want) + } +} + // TestNilDirentCache tests that a nil cache supports all cache operations, but // treats them as noop. func TestNilDirentCache(t *testing.T) { diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index d626b86f5..ed5147c65 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -28,6 +28,10 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) +// DefaultDirentCacheSize is the default dirent cache size for 9P mounts. It can +// be adjusted independentely from the other dirent caches. +var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize + // +stateify savable type endpointMaps struct { // mu protexts the direntMap, the keyMap, and the pathMap below. @@ -249,6 +253,11 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF // Construct the MountSource with the session and superBlockFlags. m := fs.NewMountSource(s, filesystem, superBlockFlags) + // Given that gofer files can consume host FDs, restrict the number + // of files that can be held by the cache. + m.SetDirentCacheMaxSize(DefaultDirentCacheSize) + m.SetDirentCacheLimiter(fs.DirentCacheLimiterFromContext(ctx)) + // Send the Tversion request. 
s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 5cc777bef..1e245ae5f 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -151,9 +151,9 @@ type MountSource struct { children map[*MountSource]struct{} } -// defaultDirentCacheSize is the number of Dirents that the VFS can hold an extra -// reference on. -const defaultDirentCacheSize uint64 = 1000 +// DefaultDirentCacheSize is the number of Dirents that the VFS can hold an +// extra reference on. +const DefaultDirentCacheSize uint64 = 1000 // NewMountSource returns a new MountSource. Filesystem may be nil if there is no // filesystem backing the mount. @@ -162,7 +162,7 @@ func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags Mou MountSourceOperations: mops, Flags: flags, Filesystem: filesystem, - fscache: NewDirentCache(defaultDirentCacheSize), + fscache: NewDirentCache(DefaultDirentCacheSize), children: make(map[*MountSource]struct{}), } } @@ -246,6 +246,18 @@ func (msrc *MountSource) FlushDirentRefs() { msrc.fscache.Invalidate() } +// SetDirentCacheMaxSize sets the max size to the dirent cache associated with +// this mount source. +func (msrc *MountSource) SetDirentCacheMaxSize(max uint64) { + msrc.fscache.setMaxSize(max) +} + +// SetDirentCacheLimiter sets the limiter objcet to the dirent cache associated +// with this mount source. +func (msrc *MountSource) SetDirentCacheLimiter(l *DirentCacheLimiter) { + msrc.fscache.limit = l +} + // NewCachingMountSource returns a generic mount that will cache dirents // aggressively. func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 4c89673b5..fb60a1aec 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -31,10 +31,19 @@ type overlayMountSourceOperations struct { func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *MountSource { upper.IncRef() lower.IncRef() - return NewMountSource(&overlayMountSourceOperations{ + msrc := NewMountSource(&overlayMountSourceOperations{ upper: upper, lower: lower, }, &overlayFilesystem{}, flags) + + // Use the minimum number to keep resource usage under limits. + size := lower.fscache.maxSize + if size > upper.fscache.maxSize { + size = upper.fscache.maxSize + } + msrc.fscache.setMaxSize(size) + + return msrc } // Revalidate implements MountSourceOperations.Revalidate for an overlay by diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index b8953657c..290c4a53c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -188,6 +188,11 @@ type Kernel struct { // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` + + // DirentCacheLimiter controls the number of total dirent entries can be in + // caches. Not all caches use it, only the caches that use host resources use + // the limiter. It may be nil if disabled. + DirentCacheLimiter *fs.DirentCacheLimiter } // InitKernelArgs holds arguments to Init. 
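Taken together: the kernel owns a single DirentCacheLimiter, the contexts below expose it, and any mount whose cached dirents pin host resources registers with it, exactly as gofer.Root() does above. A minimal sketch of that wiring, using only the setters introduced in this change (the helper name and import block are illustrative):

    import (
        "gvisor.googlesource.com/gvisor/pkg/sentry/context"
        "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
    )

    // capHostBackedCache bounds a mount whose dirents hold host FDs.
    func capHostBackedCache(ctx context.Context, msrc *fs.MountSource) {
        // Per-mount ceiling on the number of cached dirents.
        msrc.SetDirentCacheMaxSize(fs.DefaultDirentCacheSize)
        // Process-wide ceiling shared with every other participating mount;
        // the limiter may be nil, in which case only the per-mount cap applies.
        msrc.SetDirentCacheLimiter(fs.DirentCacheLimiterFromContext(ctx))
    }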
@@ -626,6 +631,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k.mounts.Root() } return nil + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: @@ -1170,6 +1177,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return auth.NewRootCredentials(ctx.k.rootUserNamespace) case fs.CtxRoot: return ctx.k.mounts.Root() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 9c365e781..ed2175c37 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -601,6 +601,8 @@ func (t *Task) Value(key interface{}) interface{} { return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.fsc.RootDirectory() + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter case inet.CtxStack: return t.NetworkContext() case ktime.CtxRealtimeClock: diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 8dfb6dce6..761142d98 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -20,10 +20,10 @@ import ( "path/filepath" "strconv" "strings" + "syscall" // Include filesystem types that OCI spec might mount. _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" @@ -38,6 +38,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -81,6 +82,22 @@ func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } +func adjustDirentCache(k *kernel.Kernel) error { + var hl syscall.Rlimit + if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { + return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) + } + if int64(hl.Cur) != syscall.RLIM_INFINITY { + newSize := hl.Cur / 2 + if newSize < gofer.DefaultDirentCacheSize { + log.Infof("Setting gofer dirent cache size to %d", newSize) + gofer.DefaultDirentCacheSize = newSize + k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) + } + } + return nil +} + // setupRootContainerFS creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. // 'setMountNS' is called after namespace is created. 
It must set the mount NS diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index e3e716bf9..32e62cdf7 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -16,8 +16,11 @@ package boot import ( "fmt" + "sync" + "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" ) @@ -41,10 +44,43 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_STACK": limits.Stack, } -func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { +func findName(lt limits.LimitType) string { + for k, v := range fromLinuxResource { + if v == lt { + return k + } + } + return "unknown" +} + +var defaults defs + +type defs struct { + mu sync.Mutex + set *limits.LimitSet + err error +} + +func (d *defs) get() (*limits.LimitSet, error) { + d.mu.Lock() + defer d.mu.Unlock() + + if d.err != nil { + return nil, d.err + } + if d.set == nil { + if err := d.initDefaults(); err != nil { + d.err = err + return nil, err + } + } + return d.set, nil +} + +func (d *defs) initDefaults() error { ls, err := limits.NewLinuxLimitSet() if err != nil { - return nil, err + return err } // Set default limits based on what containers get by default, ex: @@ -66,6 +102,43 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}) + // Read host limits that directly affect the sandbox and adjust the defaults + // based on them. + for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} { + var hl syscall.Rlimit + if err := syscall.Getrlimit(res, &hl); err != nil { + return err + } + + lt, ok := limits.FromLinuxResource[res] + if !ok { + return fmt.Errorf("unknown rlimit type %v", res) + } + hostLimit := limits.Limit{ + Cur: limits.FromLinux(hl.Cur), + Max: limits.FromLinux(hl.Max), + } + + defaultLimit := ls.Get(lt) + if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur { + log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur) + } + if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max { + log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max) + ls.SetUnchecked(lt, hostLimit) + } + } + + d.set = ls + return nil +} + +func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { + ls, err := defaults.get() + if err != nil { + return nil, err + } + // Then apply overwrites on top of defaults. for _, rl := range spec.Process.Rlimits { lt, ok := fromLinuxResource[rl.Type] diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 56cb137f0..88a834aa5 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -274,6 +274,10 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if err := adjustDirentCache(k); err != nil { + return nil, err + } + // Turn on packet logging if enabled. if args.Conf.LogPackets { log.Infof("Packet logging enabled") diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc index 7a6a39444..67a86cc22 100644 --- a/test/syscalls/linux/poll.cc +++ b/test/syscalls/linux/poll.cc @@ -255,7 +255,16 @@ TEST_F(PollTest, Nfds) { // Stash value of RLIMIT_NOFILES. 
struct rlimit rlim; TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0); + + // gVisor caps the number of FDs that epoll can use beyond RLIMIT_NOFILE. + constexpr rlim_t gVisorMax = 1048576; + if (rlim.rlim_cur > gVisorMax) { + rlim.rlim_cur = gVisorMax; + TEST_PCHECK(setrlimit(RLIMIT_NOFILE, &rlim) == 0); + } + rlim_t max_fds = rlim.rlim_cur; + LOG(INFO) << "Using limit: " << max_fds; // Create an eventfd. Since its value is initially zero, it is writable. FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()); -- cgit v1.2.3 From df21460cfdf589299e98171407741e3c253debe4 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 23 Apr 2019 11:32:34 -0700 Subject: Fix container_test flakes. Create, Start, and Destroy were racing to create and destroy the metadata directory of containers. This is a re-upload of https://gvisor-review.googlesource.com/c/gvisor/+/16260, but with the correct account. Change-Id: I16b7a9d0971f0df873e7f4145e6ac8f72730a4f1 PiperOrigin-RevId: 244892991 --- runsc/container/container.go | 78 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 11 deletions(-) (limited to 'runsc') diff --git a/runsc/container/container.go b/runsc/container/container.go index cc0c1ee25..1bed1a97e 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -99,7 +99,9 @@ type Container struct { // BundleDir is the directory containing the container bundle. BundleDir string `json:"bundleDir"` - // Root is the directory containing the container metadata file. + // Root is the directory containing the container metadata file. If this + // container is the root container, Root and RootContainerDir will be the + // same. Root string `json:"root"` // CreatedAt is the time the container was created. @@ -128,6 +130,12 @@ type Container struct { // Sandbox is the sandbox this container is running in. It's set when the // container is created and reset when the sandbox is destroyed. Sandbox *sandbox.Sandbox `json:"sandbox"` + + // RootContainerDir is the root directory containing the metadata file of the + // sandbox root container. It's used to lock in order to serialize creating + // and deleting this Container's metadata directory. If this container is the + // root container, this is the same as Root. + RootContainerDir string } // Load loads a container with the given id from a metadata file. id may be an @@ -243,6 +251,12 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } + unlockRoot, err := maybeLockRootContainer(spec, conf.RootDir) + if err != nil { + return nil, err + } + defer unlockRoot() + // Lock the container metadata file to prevent concurrent creations of // containers with the same id. containerRoot := filepath.Join(conf.RootDir, id) @@ -261,14 +275,15 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } c := &Container{ - ID: id, - Spec: spec, - ConsoleSocket: consoleSocket, - BundleDir: bundleDir, - Root: containerRoot, - Status: Creating, - CreatedAt: time.Now(), - Owner: os.Getenv("USER"), + ID: id, + Spec: spec, + ConsoleSocket: consoleSocket, + BundleDir: bundleDir, + Root: containerRoot, + Status: Creating, + CreatedAt: time.Now(), + Owner: os.Getenv("USER"), + RootContainerDir: conf.RootDir, } // The Cleanup object cleans up partially created containers when an error occurs. // Any errors occuring during cleanup itself are ignored. 
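The race is closed purely by lock ordering: every path that can create or delete a container's metadata directory now takes the sandbox root container's lock before the per-container lock, as Create() does above and Start()/Destroy() do below. A hedged sketch of that shared ordering (the wrapper name is illustrative; it mirrors the Start() changes in the next hunk):

    // withMetadataLocks serializes metadata-directory changes for containers
    // in the same sandbox: root-container lock first, then this container's.
    func withMetadataLocks(c *Container, fn func() error) error {
        unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
        if err != nil {
            return err
        }
        defer unlockRoot()

        unlock, err := c.lock()
        if err != nil {
            return err
        }
        defer unlock()

        return fn()
    }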
@@ -279,7 +294,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // started in an existing sandbox, we must do so. The metadata will // indicate the ID of the sandbox, which is the same as the ID of the // init container in the sandbox. - if specutils.ShouldCreateSandbox(spec) { + if isRoot(spec) { log.Debugf("Creating new sandbox for container %q", id) // Create and join cgroup before processes are created to ensure they are @@ -354,6 +369,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Start starts running the containerized process inside the sandbox. func (c *Container) Start(conf *boot.Config) error { log.Debugf("Start container %q", c.ID) + + unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir) + if err != nil { + return err + } + defer unlockRoot() + unlock, err := c.lock() if err != nil { return err @@ -371,7 +393,7 @@ func (c *Container) Start(conf *boot.Config) error { } } - if specutils.ShouldCreateSandbox(c.Spec) { + if isRoot(c.Spec) { if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil { return err } @@ -418,6 +440,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str return err } defer unlock() + if err := c.requireStatus("restore", Created); err != nil { return err } @@ -644,6 +667,12 @@ func (c *Container) Destroy() error { // of errors return their concatenation. var errs []string + unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir) + if err != nil { + return err + } + defer unlock() + if err := c.stop(); err != nil { err = fmt.Errorf("stopping container: %v", err) log.Warningf("%v", err) @@ -960,6 +989,33 @@ func lockContainerMetadata(containerRootDir string) (func() error, error) { return l.Unlock, nil } +// maybeLockRootContainer locks the sandbox root container. It is used to +// prevent races to create and delete child container sandboxes. +func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) { + if isRoot(spec) { + return func() error { return nil }, nil + } + + sbid, ok := specutils.SandboxID(spec) + if !ok { + return nil, fmt.Errorf("no sandbox ID found when locking root container") + } + sb, err := Load(rootDir, sbid) + if err != nil { + return nil, err + } + + unlock, err := sb.lock() + if err != nil { + return nil, err + } + return unlock, nil +} + +func isRoot(spec *specs.Spec) bool { + return specutils.ShouldCreateSandbox(spec) +} + // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute // it in the current context. func runInCgroup(cg *cgroup.Cgroup, fn func() error) error { -- cgit v1.2.3 From 908edee04f92055a8c53a63d1b8d57ffe56aa682 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 23 Apr 2019 16:10:05 -0700 Subject: Replace os.File with fd.FD in fsgofer os.NewFile() accounts for 38% of CPU time in localFile.Walk(). This change switchs to use fd.FD which is much cheaper to create. Now, fd.New() in localFile.Walk() accounts for only 4%. 
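The os.File wrappers were only ever used as containers for a host FD, so the change below adds fd.Open/fd.OpenAt to pkg/fd and threads *fd.FD through the gofer instead. A rough usage sketch of the new constructors (path and file names are illustrative; assumes the pkg/fd and syscall imports):

    // openChild opens a directory and a child relative to it using the new
    // helpers, keeping plain host FDs instead of allocating *os.File values.
    func openChild() (*fd.FD, error) {
        dir, err := fd.Open("/illustrative/dir", syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
        if err != nil {
            return nil, err
        }
        defer dir.Close()
        return fd.OpenAt(dir, "child.txt", syscall.O_RDONLY, 0)
    }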
PiperOrigin-RevId: 244944983 Change-Id: Ic892df96cf2633e78ad379227a213cb93ee0ca46 --- pkg/fd/fd.go | 18 ++++ runsc/fsgofer/fsgofer.go | 224 +++++++++++++++++++++++++++-------------------- runsc/sandbox/network.go | 2 + 3 files changed, 148 insertions(+), 96 deletions(-) (limited to 'runsc') diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index a2edf2aa6..d40758c22 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -167,6 +167,24 @@ func NewFromFile(file *os.File) (*FD, error) { return New(fd), nil } +// Open is equivallent to open(2). +func Open(path string, openmode int, perm uint32) (*FD, error) { + f, err := syscall.Open(path, openmode|syscall.O_LARGEFILE, perm) + if err != nil { + return nil, err + } + return New(f), nil +} + +// OpenAt is equivallent to openat(2). +func OpenAt(dir *FD, path string, flags int, mode uint32) (*FD, error) { + f, err := syscall.Openat(dir.FD(), path, flags, mode) + if err != nil { + return nil, err + } + return New(f), nil +} + // Close closes the file descriptor contained in the FD. // // Close is safe to call multiple times, but will return an error after the diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 45b455430..60dad642f 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -27,6 +27,7 @@ import ( "os" "path" "path/filepath" + "runtime" "sync" "syscall" @@ -122,13 +123,13 @@ func (a *attachPoint) Attach() (p9.File, error) { if err != nil { return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) } - mode := os.O_RDWR + mode := syscall.O_RDWR if a.conf.ROMount || stat.Mode&syscall.S_IFDIR != 0 { - mode = os.O_RDONLY + mode = syscall.O_RDONLY } // Open the root directory. - f, err := os.OpenFile(a.prefix, mode|openFlags, 0) + f, err := fd.Open(a.prefix, openFlags|mode, 0) if err != nil { return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) } @@ -201,8 +202,9 @@ type localFile struct { hostPath string // file is opened when localFile is created and it's never nil. It may be - // reopened... - file *os.File + // reopened if the Open() mode is wider than the mode the file was originally + // opened with. + file *fd.FD // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. @@ -215,14 +217,10 @@ type localFile struct { readDirMu sync.Mutex } -func openAnyFileFromParent(parent *localFile, name string) (*os.File, string, error) { +func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { path := path.Join(parent.hostPath, name) - f, err := openAnyFile(path, func(mode int) (*os.File, error) { - fd, err := syscall.Openat(parent.fd(), name, openFlags|mode, 0) - if err != nil { - return nil, err - } - return os.NewFile(uintptr(fd), path), nil + f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { + return fd.OpenAt(parent.file, name, openFlags|mode, 0) }) return f, path, err } @@ -230,7 +228,7 @@ func openAnyFileFromParent(parent *localFile, name string) (*os.File, string, er // openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the // actual file open and is customizable by the caller. -func openAnyFile(path string, fn func(mode int) (*os.File, error)) (*os.File, error) { +func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { // Attempt to open file in the following mode in order: // 1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too. 
// Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option @@ -239,7 +237,7 @@ func openAnyFile(path string, fn func(mode int) (*os.File, error)) (*os.File, er modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} var err error - var file *os.File + var file *fd.FD for i, mode := range modes { file, err = fn(mode) if err == nil { @@ -279,7 +277,7 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { return ft, nil } -func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) { +func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { ft, err := getSupportedFileType(stat) if err != nil { return nil, err @@ -297,18 +295,22 @@ func newLocalFile(a *attachPoint, file *os.File, path string, stat syscall.Stat_ // newFDMaybe creates a fd.FD from a file, dup'ing the FD and setting it as // non-blocking. If anything fails, returns nil. It's better to have a file // without host FD, than to fail the operation. -func newFDMaybe(file *os.File) *fd.FD { - fd, err := fd.NewFromFile(file) +func newFDMaybe(file *fd.FD) *fd.FD { + dupFD, err := syscall.Dup(file.FD()) + // Technically, the runtime may call the finalizer on file as soon as + // FD() returns. + runtime.KeepAlive(file) if err != nil { return nil } + dup := fd.New(dupFD) // fd is blocking; non-blocking is required. - if err := syscall.SetNonblock(fd.FD(), true); err != nil { - fd.Close() + if err := syscall.SetNonblock(dup.FD(), true); err != nil { + dup.Close() return nil } - return fd + return dup } func stat(fd int) (syscall.Stat_t, error) { @@ -323,35 +325,30 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error { return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } -func (l *localFile) fd() int { - return int(l.file.Fd()) -} - // Open implements p9.File. func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { if l.isOpen() { - panic(fmt.Sprintf("attempting to open already opened file: %q", l.file.Name())) + panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath)) } // Check if control file can be used or if a new open must be created. - var newFile *os.File + var newFile *fd.FD if mode == p9.ReadOnly { - log.Debugf("Open reusing control file, mode: %v, %q", mode, l.file.Name()) + log.Debugf("Open reusing control file, mode: %v, %q", mode, l.hostPath) newFile = l.file } else { // Ideally reopen would call name_to_handle_at (with empty name) and // open_by_handle_at to reopen the file without using 'hostPath'. However, // name_to_handle_at and open_by_handle_at aren't supported by overlay2. - log.Debugf("Open reopening file, mode: %v, %q", mode, l.file.Name()) + log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath) var err error - - newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) + newFile, err = fd.Open(l.hostPath, openFlags|mode.OSFlags(), 0) if err != nil { return nil, p9.QID{}, 0, extractErrno(err) } } - stat, err := stat(int(newFile.Fd())) + stat, err := stat(newFile.FD()) if err != nil { if newFile != l.file { newFile.Close() @@ -368,7 +365,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Close old file in case a new one was created. 
if newFile != l.file { if err := l.file.Close(); err != nil { - log.Warningf("Error closing file %q: %v", l.file.Name(), err) + log.Warningf("Error closing file %q: %v", l.hostPath, err) } l.file = newFile } @@ -396,33 +393,31 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid flags |= mode.OSFlags() } - fd, err := syscall.Openat(l.fd(), name, flags, uint32(perm.Permissions())) + child, err := fd.OpenAt(l.file, name, flags, uint32(perm.Permissions())) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } cu := specutils.MakeCleanup(func() { - syscall.Close(fd) + child.Close() // Best effort attempt to remove the file in case of failure. - if err := syscall.Unlinkat(l.fd(), name); err != nil { + if err := syscall.Unlinkat(l.file.FD(), name); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) } }) defer cu.Clean() - if err := fchown(fd, uid, gid); err != nil { + if err := fchown(child.FD(), uid, gid); err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } - stat, err := stat(fd) + stat, err := stat(child.FD()) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } - cPath := path.Join(l.hostPath, name) - f := os.NewFile(uintptr(fd), cPath) c := &localFile{ attachPoint: l.attachPoint, - hostPath: cPath, - file: f, + hostPath: path.Join(l.hostPath, name), + file: child, mode: mode, } @@ -440,12 +435,12 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) return p9.QID{}, syscall.EBADF } - if err := syscall.Mkdirat(l.fd(), name, uint32(perm.Permissions())); err != nil { + if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the dir in case of failure. - if err := unix.Unlinkat(l.fd(), name, unix.AT_REMOVEDIR); err != nil { + if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil { log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -453,16 +448,16 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) // Open directory to change ownership and stat it. flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags - fd, err := syscall.Openat(l.fd(), name, flags, 0) + f, err := fd.OpenAt(l.file, name, flags, 0) if err != nil { return p9.QID{}, extractErrno(err) } - defer syscall.Close(fd) + defer f.Close() - if err := fchown(fd, uid, gid); err != nil { + if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(fd) + stat, err := stat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -475,25 +470,25 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - var newFile *os.File + var newFile *fd.FD if l.isOpen() { // File mode may have changed when it was opened, so open a new one. 
var err error - newFile, err = openAnyFile(l.hostPath, func(mode int) (*os.File, error) { - return os.OpenFile(l.hostPath, openFlags|mode, 0) + newFile, err = openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + return fd.Open(l.hostPath, openFlags|mode, 0) }) if err != nil { return nil, nil, extractErrno(err) } } else { - newFd, err := syscall.Dup(l.fd()) + newFd, err := syscall.Dup(l.file.FD()) if err != nil { return nil, nil, extractErrno(err) } - newFile = os.NewFile(uintptr(newFd), l.hostPath) + newFile = fd.New(newFd) } - stat, err := stat(int(newFile.Fd())) + stat, err := stat(int(newFile.FD())) if err != nil { newFile.Close() return nil, nil, extractErrno(err) @@ -515,7 +510,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { if err != nil { return nil, nil, extractErrno(err) } - stat, err := stat(int(f.Fd())) + stat, err := stat(f.FD()) if err != nil { f.Close() return nil, nil, extractErrno(err) @@ -535,7 +530,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // StatFS implements p9.File. func (l *localFile) StatFS() (p9.FSStat, error) { var s syscall.Statfs_t - if err := syscall.Fstatfs(l.fd(), &s); err != nil { + if err := syscall.Fstatfs(l.file.FD(), &s); err != nil { return p9.FSStat{}, extractErrno(err) } @@ -557,7 +552,7 @@ func (l *localFile) FSync() error { if !l.isOpen() { return syscall.EBADF } - if err := l.file.Sync(); err != nil { + if err := syscall.Fsync(l.file.FD()); err != nil { return extractErrno(err) } return nil @@ -565,7 +560,7 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.fd()) + stat, err := stat(l.file.FD()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } @@ -633,20 +628,20 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // Handle all the sanity checks up front so that the client gets a // consistent result that is not attribute dependent. if !valid.IsSubsetOf(allowed) { - log.Warningf("SetAttr() failed for %q, mask: %v", l.file.Name(), valid) + log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid) return syscall.EPERM } // Check if it's possible to use cached file, or if another one needs to be // opened for write. - fd := l.fd() + f := l.file if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { - f, err := os.OpenFile(l.hostPath, openFlags|os.O_WRONLY, 0) + var err error + f, err = fd.Open(l.hostPath, openFlags|syscall.O_WRONLY, 0) if err != nil { return extractErrno(err) } defer f.Close() - fd = int(f.Fd()) } // The semantics are to either return an error if no changes were made, @@ -661,14 +656,14 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // over another. var err error if valid.Permissions { - if cerr := syscall.Fchmod(fd, uint32(attr.Permissions)); cerr != nil { + if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil { log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr) err = extractErrno(cerr) } } if valid.Size { - if terr := syscall.Ftruncate(fd, int64(attr.Size)); terr != nil { + if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil { log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr) err = extractErrno(terr) } @@ -700,20 +695,20 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // utimensat operates different that other syscalls. 
To operate on a // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. - f, err := os.OpenFile(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) + parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) if err != nil { return extractErrno(err) } - defer f.Close() + defer syscall.Close(parent) - if terr := utimensat(int(f.Fd()), path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { + if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) err = extractErrno(terr) } } else { // Directories and regular files can operate directly on the fd // using empty name. - if terr := utimensat(fd, "", utimes, 0); terr != nil { + if terr := utimensat(f.FD(), "", utimes, 0); terr != nil { log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) err = extractErrno(terr) } @@ -729,7 +724,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { if valid.GID { gid = int(attr.GID) } - if oerr := syscall.Fchownat(fd, "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { + if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr) err = extractErrno(oerr) } @@ -754,7 +749,7 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) } newParent := directory.(*localFile) - if err := renameat(l.fd(), oldName, newParent.fd(), newName); err != nil { + if err := renameat(l.file.FD(), oldName, newParent.file.FD(), newName); err != nil { return extractErrno(err) } return nil @@ -804,28 +799,28 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. return p9.QID{}, syscall.EBADF } - if err := unix.Symlinkat(target, l.fd(), newName); err != nil { + if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil { return p9.QID{}, extractErrno(err) } cu := specutils.MakeCleanup(func() { // Best effort attempt to remove the symlink in case of failure. - if err := syscall.Unlinkat(l.fd(), newName); err != nil { + if err := syscall.Unlinkat(l.file.FD(), newName); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) } }) defer cu.Clean() // Open symlink to change ownership and stat it. 
- fd, err := syscall.Openat(l.fd(), newName, unix.O_PATH|openFlags, 0) + f, err := fd.OpenAt(l.file, newName, unix.O_PATH|openFlags, 0) if err != nil { return p9.QID{}, extractErrno(err) } - defer syscall.Close(fd) + defer f.Close() - if err := fchown(fd, uid, gid); err != nil { + if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(fd) + stat, err := stat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -845,7 +840,7 @@ func (l *localFile) Link(target p9.File, newName string) error { } targetFile := target.(*localFile) - if err := unix.Linkat(targetFile.fd(), "", l.fd(), newName, linux.AT_EMPTY_PATH); err != nil { + if err := unix.Linkat(targetFile.file.FD(), "", l.file.FD(), newName, linux.AT_EMPTY_PATH); err != nil { return extractErrno(err) } return nil @@ -868,7 +863,7 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error { return syscall.EBADF } - if err := unix.Unlinkat(l.fd(), name, int(flags)); err != nil { + if err := unix.Unlinkat(l.file.FD(), name, int(flags)); err != nil { return extractErrno(err) } return nil @@ -887,30 +882,67 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { // reading all directory contents. Take a lock because this operation is // stateful. l.readDirMu.Lock() - if _, err := l.file.Seek(0, 0); err != nil { - l.readDirMu.Unlock() + defer l.readDirMu.Unlock() + + if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil { return nil, extractErrno(err) } - names, err := l.file.Readdirnames(-1) - if err != nil { - l.readDirMu.Unlock() - return nil, extractErrno(err) + + return l.readDirent(l.file.FD(), offset, count) +} + +func (l *localFile) readDirent(f int, offset uint64, count uint32) ([]p9.Dirent, error) { + // Limit 'count' to cap the slice size that is returned. + const maxCount = 100000 + if count > maxCount { + count = maxCount } - l.readDirMu.Unlock() - var dirents []p9.Dirent - for i := int(offset); i >= 0 && i < len(names); i++ { - stat, err := statAt(l.fd(), names[i]) + dirents := make([]p9.Dirent, 0, count) + + // Pre-allocate buffers that will be reused to get partial results. + direntsBuf := make([]byte, 8192) + names := make([]string, 0, 100) + + skip := offset // Tracks the number of entries to skip. + end := offset + uint64(count) + for offset < end { + dirSize, err := syscall.ReadDirent(f, direntsBuf) if err != nil { - continue + return dirents, err + } + if dirSize <= 0 { + return dirents, nil // EOF + } + + names := names[:0] + _, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names) + + // Skip over entries that the caller is not interested in. 
+ if skip > 0 { + if skip > uint64(len(names)) { + skip -= uint64(len(names)) + names = names[:0] + } else { + names = names[skip:] + skip = 0 + } + } + for _, name := range names { + stat, err := statAt(l.file.FD(), name) + if err != nil { + log.Warningf("Readdir is skipping file with failed stat %q, err: %v", l.hostPath, err) + continue + } + qid := l.attachPoint.makeQID(stat) + offset++ + dirents = append(dirents, p9.Dirent{ + QID: qid, + Type: qid.Type, + Name: name, + Offset: offset, + }) } - qid := l.attachPoint.makeQID(stat) - dirents = append(dirents, p9.Dirent{ - QID: qid, - Type: qid.Type, - Name: names[i], - Offset: uint64(i + 1), - }) } return dirents, nil } @@ -921,7 +953,7 @@ func (l *localFile) Readlink() (string, error) { const limit = 1024 * 1024 for len := 128; len < limit; len *= 2 { b := make([]byte, len) - n, err := unix.Readlinkat(l.fd(), "", b) + n, err := unix.Readlinkat(l.file.FD(), "", b) if err != nil { return "", extractErrno(err) } diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index be924ae25..e52a51569 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -257,6 +257,8 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) } link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize + } else { + log.Infof("GSO not available in host.") } } -- cgit v1.2.3 From 1b10f52d598e41f9ffe3a851da2da746f3d3a47a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 23 Apr 2019 17:33:50 -0700 Subject: Remember file position during Readdir() The caller must call Readdir() at least twice to detect EOF. The old code was always restarting the directory search and then skipping elements already seen, effectively doubling the cost to read a directory. The code now remembers the last offset and doesn't reposition the cursor if next request comes at the same offset. PiperOrigin-RevId: 244957816 Change-Id: If21a8dc68b76614adbcf4301439adfda40f2643f --- runsc/fsgofer/fsgofer.go | 56 ++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'runsc') diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 60dad642f..c964a2a3b 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -215,6 +215,12 @@ type localFile struct { // readDirMu protects against concurrent Readdir calls. readDirMu sync.Mutex + + // lastDirentOffset is the last offset returned by Readdir(). If another call + // to Readdir is made at the same offset, the file doesn't need to be + // repositioned. This is an important optimization because the caller must + // always make one extra call to detect EOF (empty result, no error). + lastDirentOffset uint64 } func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { @@ -470,25 +476,14 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - var newFile *fd.FD - if l.isOpen() { - // File mode may have changed when it was opened, so open a new one. 
- var err error - newFile, err = openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { - return fd.Open(l.hostPath, openFlags|mode, 0) - }) - if err != nil { - return nil, nil, extractErrno(err) - } - } else { - newFd, err := syscall.Dup(l.file.FD()) - if err != nil { - return nil, nil, extractErrno(err) - } - newFile = fd.New(newFd) + newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + return fd.Open(l.hostPath, openFlags|mode, 0) + }) + if err != nil { + return nil, nil, extractErrno(err) } - stat, err := stat(int(newFile.FD())) + stat, err := stat(newFile.FD()) if err != nil { newFile.Close() return nil, nil, extractErrno(err) @@ -884,14 +879,30 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { l.readDirMu.Lock() defer l.readDirMu.Unlock() - if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil { - return nil, extractErrno(err) + skip := uint64(0) + + // Check if the file is at the correct position already. If not, seek to the + // beginning and read the entire directory again. + if l.lastDirentOffset != offset { + if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil { + return nil, extractErrno(err) + } + skip = offset } - return l.readDirent(l.file.FD(), offset, count) + dirents, err := l.readDirent(l.file.FD(), offset, count, skip) + if err == nil { + // On success, remember the offset that was returned at the current + // position. + l.lastDirentOffset = offset + uint64(len(dirents)) + } else { + // On failure, the state is unknown, force call to seek() next time. + l.lastDirentOffset = math.MaxUint64 + } + return dirents, err } -func (l *localFile) readDirent(f int, offset uint64, count uint32) ([]p9.Dirent, error) { +func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) { // Limit 'count' to cap the slice size that is returned. const maxCount = 100000 if count > maxCount { @@ -904,7 +915,6 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32) ([]p9.Dirent, direntsBuf := make([]byte, 8192) names := make([]string, 0, 100) - skip := offset // Tracks the number of entries to skip. end := offset + uint64(count) for offset < end { dirSize, err := syscall.ReadDirent(f, direntsBuf) @@ -912,7 +922,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32) ([]p9.Dirent, return dirents, err } if dirSize <= 0 { - return dirents, nil // EOF + return dirents, nil } names := names[:0] -- cgit v1.2.3 From 99b877fa1d6fda178fcfb3db0463485a2ab3017b Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Tue, 23 Apr 2019 19:06:09 -0700 Subject: Revert runsc to use RecvMMsg packet dispatcher. PacketMMap mode has issues due to a kernel bug. This change reverts us to using recvmmsg instead of a shared ring buffer to dispatch inbound packets. This will reduce performance but should be more stable under heavy load till PacketMMap is updated to use TPacketv3. See #210 for details. Perf difference between recvmmsg vs packetmmap. 
RecvMMsg : iperf3 -c 172.17.0.2 Connecting to host 172.17.0.2, port 5201 [ 4] local 172.17.0.1 port 43478 connected to 172.17.0.2 port 5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 778 MBytes 6.53 Gbits/sec 4349 188 KBytes [ 4] 1.00-2.00 sec 786 MBytes 6.59 Gbits/sec 4395 212 KBytes [ 4] 2.00-3.00 sec 756 MBytes 6.34 Gbits/sec 3655 161 KBytes [ 4] 3.00-4.00 sec 782 MBytes 6.56 Gbits/sec 4419 175 KBytes [ 4] 4.00-5.00 sec 755 MBytes 6.34 Gbits/sec 4317 187 KBytes [ 4] 5.00-6.00 sec 774 MBytes 6.49 Gbits/sec 4002 173 KBytes [ 4] 6.00-7.00 sec 737 MBytes 6.18 Gbits/sec 3904 191 KBytes [ 4] 7.00-8.00 sec 530 MBytes 4.44 Gbits/sec 3318 189 KBytes [ 4] 8.00-9.00 sec 487 MBytes 4.09 Gbits/sec 2627 188 KBytes [ 4] 9.00-10.00 sec 770 MBytes 6.46 Gbits/sec 4221 170 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-10.00 sec 6.99 GBytes 6.00 Gbits/sec 39207 sender [ 4] 0.00-10.00 sec 6.99 GBytes 6.00 Gbits/sec receiver iperf Done. PacketMMap: bhaskerh@gvisor-bench:~/tensorflow$ iperf3 -c 172.17.0.2 Connecting to host 172.17.0.2, port 5201 [ 4] local 172.17.0.1 port 43496 connected to 172.17.0.2 port 5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 657 MBytes 5.51 Gbits/sec 0 1.01 MBytes [ 4] 1.00-2.00 sec 1021 MBytes 8.56 Gbits/sec 0 1.01 MBytes [ 4] 2.00-3.00 sec 1.21 GBytes 10.4 Gbits/sec 45 1.01 MBytes [ 4] 3.00-4.00 sec 1018 MBytes 8.54 Gbits/sec 15 1.01 MBytes [ 4] 4.00-5.00 sec 1.28 GBytes 11.0 Gbits/sec 45 1.01 MBytes [ 4] 5.00-6.00 sec 1.38 GBytes 11.9 Gbits/sec 0 1.01 MBytes [ 4] 6.00-7.00 sec 1.34 GBytes 11.5 Gbits/sec 45 856 KBytes [ 4] 7.00-8.00 sec 1.23 GBytes 10.5 Gbits/sec 0 901 KBytes [ 4] 8.00-9.00 sec 1010 MBytes 8.48 Gbits/sec 0 923 KBytes [ 4] 9.00-10.00 sec 1.39 GBytes 11.9 Gbits/sec 0 960 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-10.00 sec 11.4 GBytes 9.83 Gbits/sec 150 sender [ 4] 0.00-10.00 sec 11.4 GBytes 9.83 Gbits/sec receiver Updates #210 PiperOrigin-RevId: 244968438 Change-Id: Id461b5cbff2dea6fa55cfc108ea246d8f83da20b --- runsc/boot/network.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/network.go b/runsc/boot/network.go index ceb00a858..35baa36ad 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -140,7 +140,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct MTU: uint32(link.MTU), EthernetHeader: true, Address: mac, - PacketDispatchMode: fdbased.PacketMMap, + PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, RXChecksumOffload: true, }) -- cgit v1.2.3 From 228dc15fd13eb91f03a907f75a3fbcec692a61a3 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Fri, 26 Apr 2019 12:50:38 -0700 Subject: Bump the AF_PACKET socket rcv buf size to 4MB by default. Packet socket receive buffers default to the sysctl value of net.core.rmem_default and are capped by net.core.rmem_max both which are usually set to 208KB on most systems. Since we can't expect every gVisor user to bump these we use SO_RCVBUFFORCE to exceed the limit. This is possible as runsc runs with CAP_NET_ADMIN outside the sandbox and can do this before the FD is passed to the sentry inside the sandbox. Updates #211 iperf output w/ 4MB buffer. 
iperf3 -c 172.17.0.2 -t 100 Connecting to host 172.17.0.2, port 5201 [ 4] local 172.17.0.1 port 40378 connected to 172.17.0.2 port 5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 1.15 GBytes 9.89 Gbits/sec 0 1.02 MBytes [ 4] 1.00-2.00 sec 1.18 GBytes 10.2 Gbits/sec 0 1.02 MBytes [ 4] 2.00-3.00 sec 965 MBytes 8.09 Gbits/sec 0 1.02 MBytes [ 4] 3.00-4.00 sec 942 MBytes 7.90 Gbits/sec 0 1.02 MBytes [ 4] 4.00-5.00 sec 952 MBytes 7.99 Gbits/sec 0 1.02 MBytes [ 4] 5.00-6.00 sec 1.14 GBytes 9.81 Gbits/sec 0 1.02 MBytes [ 4] 6.00-7.00 sec 1.13 GBytes 9.68 Gbits/sec 0 1.02 MBytes [ 4] 7.00-8.00 sec 930 MBytes 7.80 Gbits/sec 0 1.02 MBytes [ 4] 8.00-9.00 sec 1.15 GBytes 9.91 Gbits/sec 0 1.02 MBytes [ 4] 9.00-10.00 sec 938 MBytes 7.87 Gbits/sec 0 1.02 MBytes [ 4] 10.00-11.00 sec 737 MBytes 6.18 Gbits/sec 0 1.02 MBytes [ 4] 11.00-12.00 sec 1.16 GBytes 9.93 Gbits/sec 0 1.02 MBytes [ 4] 12.00-13.00 sec 917 MBytes 7.69 Gbits/sec 0 1.02 MBytes [ 4] 13.00-14.00 sec 1.19 GBytes 10.2 Gbits/sec 0 1.02 MBytes [ 4] 14.00-15.00 sec 1.01 GBytes 8.70 Gbits/sec 0 1.02 MBytes [ 4] 15.00-16.00 sec 1.20 GBytes 10.3 Gbits/sec 0 1.02 MBytes [ 4] 16.00-17.00 sec 1.14 GBytes 9.80 Gbits/sec 0 1.02 MBytes ^C[ 4] 17.00-17.60 sec 718 MBytes 10.1 Gbits/sec 0 1.02 MBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-17.60 sec 18.4 GBytes 8.98 Gbits/sec 0 sender [ 4] 0.00-17.60 sec 0.00 Bytes 0.00 bits/sec receiver PiperOrigin-RevId: 245470590 Change-Id: I1c08c5ee8345de6ac070513656a4703312dc3c00 --- pkg/tcpip/link/fdbased/endpoint.go | 5 +++-- runsc/sandbox/network.go | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 6354688e2..8f4d67074 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -197,8 +197,9 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) { return stack.RegisterLinkEndpoint(e), nil case RecvMMsg: - // If the provided FD is a socket then we optimize packet reads by - // using recvmmsg() instead of read() to read packets in a batch. + // If the provided FD is a socket then we optimize + // packet reads by using recvmmsg() instead of read() to + // read packets in a batch. e.inboundDispatcher = e.recvMMsgDispatch msgsPerRecv = MaxMsgsPerRecv } diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index e52a51569..6c6b665a0 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -262,6 +262,17 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } } + // Use SO_RCVBUFFORCE because on linux the receive buffer for an + // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max + // defaults to a unusually low value of 208KB. This is too low + // for gVisor to be able to receive packets at high throughputs + // without incurring packet drops. + const rcvBufSize = 4 << 20 // 4MB. + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil { + return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err) + } + // Collect the addresses for the interface, enable forwarding, // and remove them from the host. for _, addr := range ip4addrs { -- cgit v1.2.3 From 43dff57b878edb5502daf486cbc13b058780dd56 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 26 Apr 2019 16:50:35 -0700 Subject: Make raw sockets a toggleable feature disabled by default. 
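The toggle has two halves: netstack only hands out raw endpoints when the stack is built with Options.Raw set, and runsc strips CAP_NET_RAW from container capability sets unless --net-raw is passed (exec deliberately keeps it). A hedged sketch of the netstack half, using only APIs touched in the diff below (the function name and protocol choices are illustrative):

    func rawDisabledDemo() {
        s := stack.New(
            []string{ipv4.ProtocolName},
            []string{tcp.ProtocolName},
            stack.Options{HandleLocal: true, Raw: false}, // raw sockets off
        )
        var wq waiter.Queue
        _, err := s.NewRawEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
        if err == tcpip.ErrNotPermitted {
            // Expected: with Raw unset every raw endpoint request is refused,
            // and the sentry reports it to applications as EPERM.
        }
    }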
PiperOrigin-RevId: 245511019 Change-Id: Ia9562a301b46458988a6a1f0bbd5f07cbfcb0615 --- pkg/syserr/netstack.go | 2 ++ pkg/tcpip/stack/stack.go | 12 ++++++++++++ pkg/tcpip/tcpip.go | 1 + pkg/tcpip/transport/tcp/endpoint_state.go | 1 + runsc/boot/config.go | 6 ++++++ runsc/boot/loader.go | 7 +++++-- runsc/cmd/exec.go | 18 +++++++++++++++--- runsc/main.go | 2 ++ runsc/specutils/specutils.go | 22 ++++++++++++++++------ runsc/test/integration/BUILD | 5 ++++- runsc/test/integration/exec_test.go | 26 +++++++++++++++++++++++--- runsc/test/testutil/docker.go | 21 +++++++++++++++++---- 12 files changed, 104 insertions(+), 19 deletions(-) (limited to 'runsc') diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index c5a628c7d..1a23919ef 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -45,6 +45,7 @@ var ( ErrNoSuchFile = New(tcpip.ErrNoSuchFile.String(), linux.ENOENT) ErrInvalidOptionValue = New(tcpip.ErrInvalidOptionValue.String(), linux.EINVAL) ErrBroadcastDisabled = New(tcpip.ErrBroadcastDisabled.String(), linux.EACCES) + ErrNotPermittedNet = New(tcpip.ErrNotPermitted.String(), linux.EPERM) ) var netstackErrorTranslations = map[*tcpip.Error]*Error{ @@ -84,6 +85,7 @@ var netstackErrorTranslations = map[*tcpip.Error]*Error{ tcpip.ErrMessageTooLong: ErrMessageTooLong, tcpip.ErrNoBufferSpace: ErrNoBufferSpace, tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, + tcpip.ErrNotPermitted: ErrNotPermittedNet, } // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index a74c0a7a0..8f7b6f781 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -291,6 +291,10 @@ type Stack struct { linkAddrCache *linkAddrCache + // raw indicates whether raw sockets may be created. It is set during + // Stack creation and is immutable. + raw bool + mu sync.RWMutex nics map[tcpip.NICID]*NIC forwarding bool @@ -327,6 +331,9 @@ type Options struct { // should be handled by the stack internally (true) or outside the // stack (false). HandleLocal bool + + // Raw indicates whether raw sockets may be created. + Raw bool } // New allocates a new networking stack with only the requested networking and @@ -352,6 +359,7 @@ func New(network []string, transport []string, opts Options) *Stack { clock: clock, stats: opts.Stats.FillIn(), handleLocal: opts.HandleLocal, + raw: opts.Raw, } // Add specified network protocols. @@ -512,6 +520,10 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp // protocol. Raw endpoints receive all traffic for a given protocol regardless // of address. 
func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if !s.raw { + return nil, tcpip.ErrNotPermitted + } + t, ok := s.transportProtocols[transport] if !ok { return nil, tcpip.ErrUnknownProtocol diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index e898dcbca..80cd6b4e5 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -102,6 +102,7 @@ var ( ErrMessageTooLong = &Error{msg: "message too long"} ErrNoBufferSpace = &Error{msg: "no buffer space available"} ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} + ErrNotPermitted = &Error{msg: "operation not permitted"} ) // Errors related to Subnet diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index a42e09b8c..7f9dabb4d 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -341,6 +341,7 @@ func loadError(s string) *tcpip.Error { tcpip.ErrMessageTooLong, tcpip.ErrNoBufferSpace, tcpip.ErrBroadcastDisabled, + tcpip.ErrNotPermitted, } messageToError = make(map[string]*tcpip.Error) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 2523077fd..ba47effc1 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -175,6 +175,11 @@ type Config struct { // Network indicates what type of network to use. Network NetworkType + // EnableRaw indicates whether raw sockets should be enabled. Raw + // sockets are disabled by stripping CAP_NET_RAW from the list of + // capabilities. + EnableRaw bool + // GSO indicates that generic segmentation offload is enabled. GSO bool @@ -235,6 +240,7 @@ func (c *Config) ToFlags() []string { "--watchdog-action=" + c.WatchdogAction.String(), "--panic-signal=" + strconv.Itoa(c.PanicSignal), "--profile=" + strconv.FormatBool(c.ProfileEnable), + "--net-raw=" + strconv.FormatBool(c.EnableRaw), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 88a834aa5..48ecb2626 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -227,7 +227,7 @@ func New(args Args) (*Loader, error) { } // Create capabilities. - caps, err := specutils.Capabilities(args.Spec.Process.Capabilities) + caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities) if err != nil { return nil, fmt.Errorf("converting capabilities: %v", err) } @@ -554,7 +554,7 @@ func (l *Loader) createContainer(cid string) error { // this method returns. func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) + caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { return fmt.Errorf("creating capabilities: %v", err) } @@ -800,6 +800,9 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { Clock: clock, Stats: epsocket.Metrics, HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. 
+ Raw: true, })} if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { return nil, fmt.Errorf("failed to enable SACK: %v", err) diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 9e058ad97..718d01067 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -132,7 +132,11 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } if e.Capabilities == nil { - e.Capabilities, err = specutils.Capabilities(c.Spec.Process.Capabilities) + // enableRaw is set to true to prevent the filtering out of + // CAP_NET_RAW. This is the opposite of Create() because exec + // requires the capability to be set explicitly, while 'docker + // run' sets it by default. + e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities) if err != nil { Fatalf("creating capabilities: %v", err) } @@ -351,7 +355,11 @@ func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { var caps *auth.TaskCapabilities if p.Capabilities != nil { var err error - caps, err = specutils.Capabilities(p.Capabilities) + // enableRaw is set to true to prevent the filtering out of + // CAP_NET_RAW. This is the opposite of Create() because exec + // requires the capability to be set explicitly, while 'docker + // run' sets it by default. + caps, err = specutils.Capabilities(true /* enableRaw */, p.Capabilities) if err != nil { return nil, fmt.Errorf("error creating capabilities: %v", err) } @@ -413,7 +421,11 @@ func capabilities(cs []string) (*auth.TaskCapabilities, error) { specCaps.Inheritable = append(specCaps.Inheritable, cap) specCaps.Permitted = append(specCaps.Permitted, cap) } - return specutils.Capabilities(&specCaps) + // enableRaw is set to true to prevent the filtering out of + // CAP_NET_RAW. This is the opposite of Create() because exec requires + // the capability to be set explicitly, while 'docker run' sets it by + // default. + return specutils.Capabilities(true /* enableRaw */, &specCaps) } // stringSlice allows a flag to be used multiple times, where each occurrence diff --git a/runsc/main.go b/runsc/main.go index 74253a844..b35726a74 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -68,6 +68,7 @@ var ( watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! 
This skips many security measures that isolate the host from the sandbox.") ) @@ -159,6 +160,7 @@ func main() { WatchdogAction: wa, PanicSignal: *panicSignal, ProfileEnable: *profile, + EnableRaw: *netRaw, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } if len(*straceSyscalls) != 0 { diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index af8d34535..32f81b8d4 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -198,20 +198,26 @@ func ReadMounts(f *os.File) ([]specs.Mount, error) { // Capabilities takes in spec and returns a TaskCapabilities corresponding to // the spec. -func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { +func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { + // Strip CAP_NET_RAW from all capability sets if necessary. + skipSet := map[linux.Capability]struct{}{} + if !enableRaw { + skipSet[linux.CAP_NET_RAW] = struct{}{} + } + var caps auth.TaskCapabilities if specCaps != nil { var err error - if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding); err != nil { + if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { return nil, err } - if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective); err != nil { + if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { return nil, err } - if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable); err != nil { + if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { return nil, err } - if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted); err != nil { + if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { return nil, err } // TODO: Support ambient capabilities. @@ -275,13 +281,17 @@ var capFromName = map[string]linux.Capability{ "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, } -func capsFromNames(names []string) (auth.CapabilitySet, error) { +func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { var caps []linux.Capability for _, n := range names { c, ok := capFromName[n] if !ok { return 0, fmt.Errorf("unknown capability %q", n) } + // Should we skip this capabilty? 
+ if _, ok := skipSet[c]; ok { + continue + } caps = append(caps, c) } return auth.CapabilitySetOfMany(caps), nil diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 779d30ec9..0c4e4fa80 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -15,7 +15,10 @@ go_test( "manual", "local", ], - deps = ["//runsc/test/testutil"], + deps = [ + "//pkg/abi/linux", + "//runsc/test/testutil", + ], ) go_library( diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index fac8337f4..d87957e2d 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -27,10 +27,13 @@ package integration import ( + "fmt" + "strconv" "syscall" "testing" "time" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -46,11 +49,28 @@ func TestExecCapabilities(t *testing.T) { } defer d.CleanUp() - want, err := d.WaitForOutput("CapEff:\t[0-9a-f]+\n", 5*time.Second) + matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second) if err != nil { - t.Fatalf("WaitForOutput() timeout: %v", err) + t.Fatalf("WaitForOutputSubmatch() timeout: %v", err) } - t.Log("Root capabilities:", want) + if len(matches) != 2 { + t.Fatalf("There should be a match for the whole line and the capability bitmask") + } + capString := matches[1] + t.Log("Root capabilities:", capString) + + // CAP_NET_RAW was in the capability set for the container, but was + // removed. However, `exec` does not remove it. Verify that it's not + // set in the container, then re-add it for comparison. + caps, err := strconv.ParseUint(capString, 16, 64) + if err != nil { + t.Fatalf("failed to convert capabilities %q: %v", capString, err) + } + if caps&(1< Date: Mon, 29 Apr 2019 14:03:04 -0700 Subject: Allow and document bug ids in gVisor codebase. 
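As an editorial illustration of the CAP_NET_RAW change above (not the runsc code itself): a minimal Go sketch of stripping CAP_NET_RAW while turning capability names into a bitmask, plus the kind of bit test the integration test applies to the hex CapEff value it reads from the process status. The tiny name table and the capNetRaw constant (13, the Linux capability number) are assumptions made only for this example.

    package main

    import (
    	"fmt"
    	"strconv"
    )

    // capNetRaw is the Linux capability number for CAP_NET_RAW.
    const capNetRaw = 13

    // A tiny stand-in for the capFromName table in specutils.
    var capFromName = map[string]uint{
    	"CAP_CHOWN":   0,
    	"CAP_NET_RAW": capNetRaw,
    }

    // capsToMask converts capability names into a bitmask, skipping anything in
    // skip -- the same idea as the skipSet added to capsFromNames above.
    func capsToMask(names []string, skip map[uint]struct{}) (uint64, error) {
    	var mask uint64
    	for _, n := range names {
    		c, ok := capFromName[n]
    		if !ok {
    			return 0, fmt.Errorf("unknown capability %q", n)
    		}
    		if _, ok := skip[c]; ok {
    			continue // filtered out, e.g. CAP_NET_RAW when --net-raw is false
    		}
    		mask |= 1 << c
    	}
    	return mask, nil
    }

    func main() {
    	skip := map[uint]struct{}{capNetRaw: {}} // what enableRaw == false amounts to
    	mask, err := capsToMask([]string{"CAP_CHOWN", "CAP_NET_RAW"}, skip)
    	if err != nil {
    		panic(err)
    	}
    	// The integration test reads CapEff as a hex string and checks the bit.
    	capEff := strconv.FormatUint(mask, 16)
    	caps, _ := strconv.ParseUint(capEff, 16, 64)
    	fmt.Println("CAP_NET_RAW set:", caps&(1<<capNetRaw) != 0) // false
    }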
PiperOrigin-RevId: 245818639 Change-Id: I03703ef0fb9b6675955637b9fe2776204c545789 --- CONTRIBUTING.md | 7 +++ pkg/cpuid/cpuid_test.go | 2 +- pkg/dhcp/client.go | 2 +- pkg/log/glog.go | 2 +- pkg/metric/metric.go | 4 +- pkg/segment/set.go | 2 +- pkg/segment/test/set_functions.go | 2 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_amd64.go | 4 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/signal_amd64.go | 6 +-- pkg/sentry/arch/stack.go | 6 +-- pkg/sentry/context/context.go | 2 +- pkg/sentry/control/proc.go | 2 +- pkg/sentry/fs/README.md | 2 +- pkg/sentry/fs/ashmem/area.go | 4 +- pkg/sentry/fs/binder/binder.go | 22 ++++---- pkg/sentry/fs/dentry.go | 2 +- pkg/sentry/fs/dirent.go | 8 +-- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_overlay.go | 4 +- pkg/sentry/fs/fsutil/file.go | 8 +-- pkg/sentry/fs/fsutil/inode_cached.go | 4 +- pkg/sentry/fs/gofer/cache_policy.go | 4 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/handles.go | 2 +- pkg/sentry/fs/gofer/inode.go | 6 +-- pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/host/fs.go | 4 +- pkg/sentry/fs/host/inode.go | 10 ++-- pkg/sentry/fs/inode.go | 6 +-- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 6 +-- pkg/sentry/fs/mount.go | 4 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/proc/README.md | 12 ++--- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 6 +-- pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 2 +- pkg/sentry/fs/proc/stat.go | 12 ++--- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/proc/task.go | 8 +-- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/fs/tty/dir.go | 6 +-- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/master.go | 6 +-- pkg/sentry/fs/tty/slave.go | 6 +-- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/pending_signals.go | 2 +- pkg/sentry/kernel/ptrace.go | 4 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/sched/cpuset.go | 2 +- pkg/sentry/kernel/semaphore/semaphore.go | 6 +-- pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 4 +- pkg/sentry/kernel/task_identity.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_signals.go | 4 +- pkg/sentry/kernel/task_stop.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/loader/vdso.go | 6 +-- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/mm/aio_context.go | 2 +- pkg/sentry/mm/procfs.go | 10 ++-- pkg/sentry/mm/special_mappable.go | 2 +- pkg/sentry/mm/syscalls.go | 6 +-- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 2 +- pkg/sentry/platform/platform.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ring0/x86.go | 4 +- pkg/sentry/sighandling/sighandling.go | 2 +- pkg/sentry/sighandling/sighandling_unsafe.go | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 32 ++++++------ pkg/sentry/socket/epsocket/save_restore.go | 2 +- pkg/sentry/socket/epsocket/stack.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 8 +-- pkg/sentry/socket/netlink/socket.go | 10 ++-- 
pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/notifier/notifier.go | 4 +- pkg/sentry/socket/rpcinet/socket.go | 6 +-- pkg/sentry/socket/rpcinet/syscall_rpc.proto | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/syscalls/linux/linux64.go | 60 +++++++++++----------- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 4 +- pkg/sentry/syscalls/linux/sys_mmap.go | 4 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 4 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_write.go | 4 +- pkg/sentry/time/calibrated_clock.go | 6 +-- pkg/sentry/time/parameters.go | 2 +- pkg/sentry/usermem/usermem.go | 4 +- pkg/sentry/watchdog/watchdog.go | 2 +- pkg/syserr/syserr.go | 10 ++-- pkg/tcpip/network/ipv4/icmp.go | 2 +- pkg/tcpip/network/ipv6/icmp.go | 4 +- pkg/tcpip/stack/nic.go | 6 +-- pkg/tcpip/stack/stack.go | 4 +- pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/transport/raw/raw.go | 2 +- pkg/tcpip/transport/tcp/BUILD | 2 +- pkg/unet/unet.go | 2 +- pkg/unet/unet_test.go | 2 +- runsc/boot/controller.go | 4 +- runsc/boot/fs.go | 6 +-- runsc/boot/loader.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/container/container.go | 2 +- runsc/container/container_test.go | 4 +- runsc/sandbox/sandbox.go | 6 +-- runsc/specutils/specutils.go | 4 +- test/syscalls/BUILD | 6 +-- test/syscalls/build_defs.bzl | 4 +- test/syscalls/linux/32bit.cc | 14 ++--- test/syscalls/linux/aio.cc | 2 +- test/syscalls/linux/chmod.cc | 2 +- test/syscalls/linux/epoll.cc | 2 +- test/syscalls/linux/exec_binary.cc | 12 ++--- test/syscalls/linux/file_base.h | 4 +- test/syscalls/linux/ioctl.cc | 4 +- test/syscalls/linux/ip_socket_test_util.cc | 2 +- test/syscalls/linux/lseek.cc | 2 +- test/syscalls/linux/mkdir.cc | 2 +- test/syscalls/linux/mmap.cc | 18 +++---- test/syscalls/linux/open.cc | 2 +- test/syscalls/linux/partial_bad_buffer.cc | 18 +++---- test/syscalls/linux/pipe.cc | 6 +-- test/syscalls/linux/proc.cc | 32 ++++++------ test/syscalls/linux/proc_pid_smaps.cc | 2 +- test/syscalls/linux/ptrace.cc | 2 +- test/syscalls/linux/pwrite64.cc | 2 +- test/syscalls/linux/readv_socket.cc | 2 +- test/syscalls/linux/rtsignal.cc | 2 +- test/syscalls/linux/socket_inet_loopback.cc | 10 ++-- .../socket_ipv4_udp_unbound_external_networking.cc | 4 +- test/syscalls/linux/socket_netlink_route.cc | 4 +- test/syscalls/linux/socket_stream_blocking.cc | 2 +- test/syscalls/linux/socket_test_util.cc | 2 +- test/syscalls/linux/socket_unix.cc | 16 +++--- test/syscalls/linux/socket_unix_dgram.cc | 2 +- .../linux/socket_unix_dgram_non_blocking.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.cc | 10 ++-- .../linux/socket_unix_unbound_seqpacket.cc | 2 +- test/syscalls/linux/socket_unix_unbound_stream.cc | 4 +- test/syscalls/linux/stat.cc | 2 +- test/syscalls/linux/stat_times.cc | 8 +-- test/syscalls/linux/tcp_socket.cc | 2 +- test/syscalls/linux/tkill.cc | 2 +- test/syscalls/linux/udp_bind.cc | 4 +- test/syscalls/linux/uidgid.cc | 2 +- test/syscalls/linux/utimes.cc | 4 +- test/syscalls/linux/wait.cc | 2 +- test/syscalls/linux/write.cc | 2 +- third_party/gvsync/downgradable_rwmutex_unsafe.go | 2 +- vdso/cycle_clock.h | 2 +- vdso/vdso_amd64.lds | 2 +- vdso/vdso_arm64.lds | 2 +- 176 files changed, 403 insertions(+), 396 deletions(-) (limited to 'runsc') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d6dafc595..238dd6665 
100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -126,6 +126,13 @@ change. When approved, the change will be submitted by a team member and automatically merged into the repository. +### Bug IDs + +Some TODOs and NOTEs sprinkled throughout the code have associated IDs of the +form b/1234. These correspond to bugs in our internal bug tracker. Eventually +these bugs will be moved to the GitHub Issues, but until then they can simply be +ignored. + ### The small print Contributions made by corporations are covered by a different agreement than the diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go index 35e7b8e50..64ade1cbe 100644 --- a/pkg/cpuid/cpuid_test.go +++ b/pkg/cpuid/cpuid_test.go @@ -78,7 +78,7 @@ func TestTakeFeatureIntersection(t *testing.T) { } } -// TODO: Run this test on a very old platform, and make sure more +// TODO(b/73346484): Run this test on a very old platform, and make sure more // bits are enabled than just FPU and PAE. This test currently may not detect // if HostFeatureSet gives back junk bits. func TestHostFeatureSet(t *testing.T) { diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 354205e63..2ba79be32 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -120,7 +120,7 @@ func (c *Client) Config() Config { // If the server sets a lease limit a timer is set to automatically // renew it. func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) (cfg Config, reterr error) { - // TODO: remove calls to {Add,Remove}Address when they're no + // TODO(b/127321246): remove calls to {Add,Remove}Address when they're no // longer required to send and receive broadcast. if err := c.stack.AddAddressWithOptions(c.nicid, ipv4.ProtocolNumber, tcpipHeader.IPv4Any, stack.NeverPrimaryEndpoint); err != nil && err != tcpip.ErrDuplicateAddress { return Config{}, fmt.Errorf("dhcp: AddAddressWithOptions(): %s", err) diff --git a/pkg/log/glog.go b/pkg/log/glog.go index fbb58501b..24d5390d7 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -144,7 +144,7 @@ func (g GoogleEmitter) Emit(level Level, timestamp time.Time, format string, arg b.writeAll(pid) b.write(' ') - // FIXME: The caller, fabricated. This really sucks, but it + // FIXME(b/73383460): The caller, fabricated. This really sucks, but it // is unacceptable to put runtime.Callers() in the hot path. b.writeAll(caller) b.write(']') diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 02af75974..e5eb95f89 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -44,8 +44,8 @@ var ( // // Metrics are not saved across save/restore and thus reset to zero on restore. // -// TODO: Support non-cumulative metrics. -// TODO: Support metric fields. +// TODO(b/67298402): Support non-cumulative metrics. +// TODO(b/67298427): Support metric fields. // type Uint64Metric struct { // value is the actual value of the metric. 
It must be accessed diff --git a/pkg/segment/set.go b/pkg/segment/set.go index a9a3b8875..74a916ea3 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -1270,7 +1270,7 @@ func segmentAfterPosition(n *node, i int) Iterator { } func zeroValueSlice(slice []Value) { - // TODO: check if Go is actually smart enough to optimize a + // TODO(jamieliu): check if Go is actually smart enough to optimize a // ClearValue that assigns nil to a memset here for i := range slice { Functions{}.ClearValue(&slice[i]) diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index 05ba5fbb9..41f649011 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -15,7 +15,7 @@ package segment // Basic numeric constants that we define because the math package doesn't. -// TODO: These should be Math.MaxInt64/MinInt64? +// TODO(nlacasse): These should be Math.MaxInt64/MinInt64? const ( maxInt = int(^uint(0) >> 1) minInt = -maxInt - 1 diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 4cd7a9af5..16d8eb2b2 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -53,7 +53,7 @@ type FloatingPointData byte // Context provides architecture-dependent information for a specific thread. // -// NOTE: Currently we use uintptr here to refer to a generic native +// NOTE(b/34169503): Currently we use uintptr here to refer to a generic native // register value. While this will work for the foreseeable future, it isn't // strictly correct. We may want to create some abstraction that makes this // more clear or enables us to store values of arbitrary widths. This is diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 2507774f7..7ec2f2c84 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -305,7 +305,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil } - // TODO: debug registers + // TODO(b/34088053): debug registers return c.Native(0), nil } @@ -320,6 +320,6 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) return err } - // TODO: debug registers + // TODO(b/34088053): debug registers return nil } diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index c8bf0e7f2..4305fe2cb 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -306,7 +306,7 @@ func (s *State) ptraceGetRegs() syscall.PtraceRegs { // FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the // same in PtraceSetRegs.) // - // TODO: Remove this fixup since newer Linux + // TODO(gvisor.dev/issue/168): Remove this fixup since newer Linux // doesn't have this behavior anymore. if regs.Fs == 0 && regs.Fs_base <= 0xffffffff { regs.Fs = _FS_TLS_SEL diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index c9de36897..7f76eba27 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -319,7 +319,7 @@ func (c *context64) NewSignalStack() NativeSignalStack { // From Linux 'arch/x86/include/uapi/asm/sigcontext.h' the following is the // size of the magic cookie at the end of the xsave frame. // -// NOTE: Currently we don't actually populate the fpstate +// NOTE(b/33003106#comment11): Currently we don't actually populate the fpstate // on the signal stack. 
const _FP_XSTATE_MAGIC2_SIZE = 4 @@ -392,7 +392,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt Sigset: sigset, } - // TODO: Set SignalContext64.Err, Trapno, and Cr2 + // TODO(gvisor.dev/issue/159): Set SignalContext64.Err, Trapno, and Cr2 // based on the fault that caused the signal. For now, leave Err and // Trapno unset and assume CR2 == info.Addr() for SIGSEGVs and // SIGBUSes. @@ -505,7 +505,7 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalSt l := len(c.sigFPState) if l > 0 { c.x86FPState = c.sigFPState[l-1] - // NOTE: State save requires that any slice + // NOTE(cl/133042258): State save requires that any slice // elements from '[len:cap]' to be zero value. c.sigFPState[l-1] = nil c.sigFPState = c.sigFPState[0 : l-1] diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index f2cfb0426..2e33ccdf5 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -97,7 +97,7 @@ func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) { if c < 0 { return 0, fmt.Errorf("bad binary.Size for %T", v) } - // TODO: Use a real context.Context. + // TODO(b/38173783): Use a real context.Context. n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{}) if err != nil || c != n { return 0, err @@ -121,11 +121,11 @@ func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) { var err error if isVaddr { value := s.Arch.Native(uintptr(0)) - // TODO: Use a real context.Context. + // TODO(b/38173783): Use a real context.Context. n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{}) *vaddr = usermem.Addr(s.Arch.Value(value)) } else { - // TODO: Use a real context.Context. + // TODO(b/38173783): Use a real context.Context. n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{}) } if err != nil { diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index 7ed6a5e8a..eefc3e1b4 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -114,7 +114,7 @@ var bgContext = &logContext{Logger: log.Log()} // Background returns an empty context using the default logger. // // Users should be wary of using a Background context. Please tag any use with -// FIXME and a note to remove this use. +// FIXME(b/38173783) and a note to remove this use. // // Generally, one should use the Task as their context when available, or avoid // having to use a context in places where a Task is unavailable. diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index e848def14..aca2267a7 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -261,7 +261,7 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error { } // Process contains information about a single process in a Sandbox. -// TODO: Implement TTY field. +// TODO(b/117881927): Implement TTY field. type Process struct { UID auth.KUID `json:"uid"` PID kernel.ThreadID `json:"pid"` diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index a88a0cd3a..f53ed3eaa 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -59,7 +59,7 @@ two categories: The first is always necessary to save and restore. An application may never have any open file descriptors, but across save and restore it should see a coherent -view of any mount namespace. NOTE: Currently only one "initial" +view of any mount namespace. 
NOTE(b/63601033): Currently only one "initial" mount namespace is supported. The second is so that system calls across save and restore are coherent with diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index 651cbc164..1f61c5711 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -240,7 +240,7 @@ func (a *Area) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume return 0, syserror.EINVAL } - // TODO: If personality flag + // TODO(b/30946773,gvisor.dev/issue/153): If personality flag // READ_IMPLIES_EXEC is set, set PROT_EXEC if PORT_READ is set. a.perms = perms @@ -290,7 +290,7 @@ func (a *Area) pinOperation(pin linux.AshmemPin, op uint32) (uintptr, error) { return linux.AshmemNotPurged, nil case linux.AshmemUnpinIoctl: - // TODO: Implement purge on unpin. + // TODO(b/30946773): Implement purge on unpin. a.pb.UnpinRange(r) return 0, nil diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index a41b5dcae..d9f1559de 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -69,7 +69,7 @@ func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) * // GetFile implements fs.InodeOperations.GetFile. // -// TODO: Add functionality to GetFile: Additional fields will be +// TODO(b/30946773): Add functionality to GetFile: Additional fields will be // needed in the Device structure, initialize them here. Also, Device will need // to keep track of the created Procs in order to implement BINDER_READ_WRITE // ioctl. @@ -133,7 +133,7 @@ func (bp *Proc) Write(ctx context.Context, file *fs.File, src usermem.IOSequence // Flush implements fs.FileOperations.Flush. // -// TODO: Implement. +// TODO(b/30946773): Implement. func (bp *Proc) Flush(ctx context.Context, file *fs.File) error { return nil } @@ -149,7 +149,7 @@ func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.M } opts.MaxPerms.Write = false - // TODO: Binder sets VM_DONTCOPY, preventing the created vma + // TODO(b/30946773): Binder sets VM_DONTCOPY, preventing the created vma // from being copied across fork(), but we don't support this yet. As // a result, MMs containing a Binder mapping cannot be forked (MM.Fork will // fail when AddMapping returns EBUSY). @@ -159,7 +159,7 @@ func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.M // Ioctl implements fs.FileOperations.Ioctl. // -// TODO: Implement. +// TODO(b/30946773): Implement. func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Switch on ioctl request. switch uint32(args[1].Int()) { @@ -173,22 +173,22 @@ func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgum }) return 0, err case linux.BinderWriteReadIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetIdleTimeoutIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetMaxThreadsIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetIdlePriorityIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetContextMgrIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderThreadExitIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. return 0, syserror.ENOSYS default: // Ioctls irrelevant to Binder. 
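A side note on the Ioctl hunk above, where every not-yet-implemented Binder request falls through to a single ENOSYS return: a self-contained sketch of that dispatch shape, using hypothetical request numbers rather than the real Binder constants.

    package main

    import (
    	"fmt"
    	"syscall"
    )

    // Hypothetical ioctl request numbers standing in for the Binder ioctls.
    const (
    	reqVersion uint32 = iota + 1
    	reqWriteRead
    	reqSetMaxThreads
    )

    // dispatch mirrors the switch in the patch: implemented requests are handled,
    // all unimplemented ones share one ENOSYS return, and unknown requests are
    // rejected outright.
    func dispatch(req uint32) (uintptr, error) {
    	switch req {
    	case reqVersion:
    		return 0, nil // implemented
    	case reqWriteRead, reqSetMaxThreads:
    		return 0, syscall.ENOSYS // placeholders until implemented
    	default:
    		return 0, syscall.EINVAL // irrelevant to this device
    	}
    }

    func main() {
    	_, err := dispatch(reqWriteRead)
    	fmt.Println(err) // function not implemented
    }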
@@ -228,7 +228,7 @@ func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, // Translate implements memmap.Mappable.Translate. func (bp *Proc) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { - // TODO: In addition to the page initially allocated and mapped + // TODO(b/30946773): In addition to the page initially allocated and mapped // in AddMapping (Linux: binder_mmap), Binder allocates and maps pages for // each transaction (Linux: binder_ioctl => binder_ioctl_write_read => // binder_thread_write => binder_transaction => binder_alloc_buf => diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index 4879df4d6..29fb155a4 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -83,7 +83,7 @@ type DirCtx struct { attrs map[string]DentAttr // DirCursor is the directory cursor. - // TODO: Once Handles are removed this can just live in the + // TODO(b/67778717): Once Handles are removed this can just live in the // respective FileOperations implementations and not need to get // plumbed everywhere. DirCursor *string diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4bcdf530a..54fc11fe1 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -318,7 +318,7 @@ func (d *Dirent) SyncAll(ctx context.Context) { // There is nothing to sync for a read-only filesystem. if !d.Inode.MountSource.Flags.ReadOnly { - // FIXME: This should be a mount traversal, not a + // FIXME(b/34856369): This should be a mount traversal, not a // Dirent traversal, because some Inodes that need to be synced // may no longer be reachable by name (after sys_unlink). // @@ -1506,7 +1506,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } // Are we frozen? - // TODO: Is this the right errno? + // TODO(jamieliu): Is this the right errno? if oldParent.frozen && !oldParent.Inode.IsVirtual() { return syscall.ENOENT } @@ -1565,7 +1565,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } else { // Check constraints on the dirent being replaced. - // NOTE: We don't want to keep replaced alive + // NOTE(b/111808347): We don't want to keep replaced alive // across the Rename, so must call DecRef manually (no defer). // Check that we can delete replaced. @@ -1606,7 +1606,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Allow the file system to drop extra references on replaced. replaced.dropExtendedReference() - // NOTE: Keeping a dirent + // NOTE(b/31798319,b/31867149,b/31867671): Keeping a dirent // open across renames is currently broken for multiple // reasons, so we flush all references on the replaced node and // its children. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 2c2126f17..5d5026661 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -65,7 +65,7 @@ const FileMaxOffset = math.MaxInt64 // under a single abortable mutex which also synchronizes lseek(2), read(2), // and write(2). // -// FIXME: Split synchronization from cancellation. +// FIXME(b/38451980): Split synchronization from cancellation. 
// // +stateify savable type File struct { diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index e1f02f0f4..6e680f0a4 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -160,7 +160,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See // If this was a seek on a directory, we must update the cursor. if seekDir && whence == SeekSet && offset == 0 { // Currently only seeking to 0 on a directory is supported. - // FIXME: Lift directory seeking limitations. + // FIXME(b/33075855): Lift directory seeking limitations. f.dirCursor = "" } return n, nil @@ -329,7 +329,7 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt if !o.isMappableLocked() { return syserror.ENODEV } - // FIXME: This is a copy/paste of fsutil.GenericConfigureMMap, + // FIXME(jamieliu): This is a copy/paste of fsutil.GenericConfigureMMap, // which we can't use because the overlay implementation is in package fs, // so depending on fs/fsutil would create a circular dependency. Move // overlay to fs/overlay. diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index df34dc788..42afdd11c 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -36,7 +36,7 @@ func (FileNoopRelease) Release() {} // // Currently only seeking to 0 on a directory is supported. // -// FIXME: Lift directory seeking limitations. +// FIXME(b/33075855): Lift directory seeking limitations. func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) { inode := file.Dirent.Inode current := file.Offset() @@ -50,7 +50,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, if fs.IsCharDevice(inode.StableAttr) { // Ignore seek requests. // - // FIXME: This preserves existing + // FIXME(b/34716638): This preserves existing // behavior but is not universally correct. return 0, nil } @@ -104,7 +104,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, return current, syserror.EINVAL } return sz + offset, nil - // FIXME: This is not universally correct. + // FIXME(b/34778850): This is not universally correct. // Remove SpecialDirectory. case fs.SpecialDirectory: if offset != 0 { @@ -112,7 +112,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, } // SEEK_END to 0 moves the directory "cursor" to the end. // - // FIXME: The ensures that after the seek, + // FIXME(b/35442290): The ensures that after the seek, // reading on the directory will get EOF. But it is not // correct in general because the directory can grow in // size; attempting to read those new entries will be diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index b690cfe93..ba33b9912 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -479,7 +479,7 @@ func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst us // common: getting a return value of 0 from a read syscall is the only way // to detect EOF. // - // TODO: Separate out c.attr.Size and use atomics instead of + // TODO(jamieliu): Separate out c.attr.Size and use atomics instead of // c.dataMu. 
c.dataMu.RLock() size := c.attr.Size @@ -776,7 +776,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option var translatedEnd uint64 for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) - // TODO: Make Translations writable even if writability is + // TODO(jamieliu): Make Translations writable even if writability is // not required if already kept-dirty by another writable translation. perms := usermem.AccessType{ Read: true, diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index d7fbb71b7..51c573aef 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -136,7 +136,7 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child // Walk from parent to child again. // - // TODO: If we have a directory FD in the parent + // TODO(b/112031682): If we have a directory FD in the parent // inodeOperations, then we can use fstatat(2) to get the inode // attributes instead of making this RPC. qids, _, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) @@ -171,7 +171,7 @@ func (cp cachePolicy) keep(d *fs.Dirent) bool { return false } sattr := d.Inode.StableAttr - // NOTE: Only cache files, directories, and symlinks. + // NOTE(b/31979197): Only cache files, directories, and symlinks. return fs.IsFile(sattr) || fs.IsDir(sattr) || fs.IsSymlink(sattr) } diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 80d1e08a6..35caa42cd 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -297,7 +297,7 @@ func (f *fileOperations) Flush(ctx context.Context, file *fs.File) error { // We do this because some p9 server implementations of Flush are // over-zealous. // - // FIXME: weaken these implementations and remove this check. + // FIXME(edahlgren): weaken these implementations and remove this check. if !file.Flags().Write { return nil } diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index f770ca4ea..d0c64003c 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -28,7 +28,7 @@ func (f *fileOperations) afterLoad() { // Manually load the open handles. var err error - // TODO: Context is not plumbed to save/restore. + // TODO(b/38173783): Context is not plumbed to save/restore. f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), f.flags) if err != nil { return fmt.Errorf("failed to re-open handle: %v", err) diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index f32e99ce0..0b33e80c3 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -49,7 +49,7 @@ func (h *handles) DecRef() { log.Warningf("error closing host file: %v", err) } } - // FIXME: Context is not plumbed here. + // FIXME(b/38173783): Context is not plumbed here. if err := h.File.close(context.Background()); err != nil { log.Warningf("error closing p9 file: %v", err) } diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 29af1010c..1181a24cc 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -570,13 +570,13 @@ func init() { } // AddLink implements InodeOperations.AddLink, but is currently a noop. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. 
func (*inodeOperations) AddLink() {} // DropLink implements InodeOperations.DropLink, but is currently a noop. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (*inodeOperations) DropLink() {} // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index ad4d3df58..44d76ba9f 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -123,7 +123,7 @@ func (i *inodeFileState) afterLoad() { // beforeSave. return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings)) } - // TODO: Context is not plumbed to save/restore. + // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index ed5147c65..4ed688ce5 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -134,7 +134,7 @@ type session struct { // socket files. This allows unix domain sockets to be used with paths that // belong to a gofer. // - // TODO: there are few possible races with someone stat'ing the + // TODO(b/77154739): there are few possible races with someone stat'ing the // file and another deleting it concurrently, where the file will not be // reported as socket file. endpoints *endpointMaps `state:"wait"` diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 0ad5d63b5..b1f299be5 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -104,7 +104,7 @@ func (s *session) afterLoad() { // If private unix sockets are enabled, create and fill the session's endpoint // maps. if opts.privateunixsocket { - // TODO: Context is not plumbed to save/restore. + // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} if err = s.restoreEndpointMaps(ctx); err != nil { diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 800649211..de349a41a 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -87,7 +87,7 @@ func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFl options := fs.GenericMountSourceOptions(data) // Grab the whitelist if one was specified. - // TODO: require another option "testonly" in order to allow + // TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow // no whitelist. if wl, ok := options[whitelistKey]; ok { f.paths = strings.Split(wl, "|") @@ -320,7 +320,7 @@ func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) { // Keep implements fs.MountSourceOperations.Keep. // -// TODO: It is possible to change the permissions on a +// TODO(b/72455313,b/77596690): It is possible to change the permissions on a // host file while it is in the dirent cache (say from RO to RW), but it is not // possible to re-open the file with more relaxed permissions, since the host // FD is already open and stored in the inode. 
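The host filesystem Mount hunk above pulls an optional '|'-separated whitelist out of the generic mount options. A standalone sketch of that style of parsing follows; the comma-separated data format and option names are assumptions for the example, not the exact behavior of fs.GenericMountSourceOptions.

    package main

    import (
    	"fmt"
    	"strings"
    )

    // parseOptions splits mount data of the form "key=value,key2" into a map,
    // roughly what a generic mount-option parser produces.
    func parseOptions(data string) map[string]string {
    	options := make(map[string]string)
    	for _, opt := range strings.Split(data, ",") {
    		if opt == "" {
    			continue
    		}
    		kv := strings.SplitN(opt, "=", 2)
    		if len(kv) == 2 {
    			options[kv[0]] = kv[1]
    		} else {
    			options[kv[0]] = ""
    		}
    	}
    	return options
    }

    func main() {
    	options := parseOptions("whitelist=/etc/passwd|/usr/bin,ro")
    	// Grab the whitelist if one was specified, as the hunk above does.
    	var paths []string
    	if wl, ok := options["whitelist"]; ok {
    		paths = strings.Split(wl, "|")
    	}
    	fmt.Println(paths) // [/etc/passwd /usr/bin]
    }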
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 2030edcb4..69c648f67 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -95,7 +95,7 @@ type inodeFileState struct { // ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { - // TODO: Using safemem.FromIOReader here is wasteful for two + // TODO(jamieliu): Using safemem.FromIOReader here is wasteful for two // reasons: // // - Using preadv instead of iterated preads saves on host system calls. @@ -325,7 +325,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // canMap returns true if this fs.Inode can be memory mapped. func canMap(inode *fs.Inode) bool { - // FIXME: Some obscure character devices can be mapped. + // FIXME(b/38213152): Some obscure character devices can be mapped. return fs.IsFile(inode.StableAttr) } @@ -428,15 +428,15 @@ func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { } // AddLink implements fs.InodeOperations.AddLink. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) AddLink() {} // DropLink implements fs.InodeOperations.DropLink. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) DropLink() {} // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} // readdirAll returns all of the directory entries in i. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d82f9740e..fe411a766 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -93,10 +93,10 @@ func (i *Inode) DecRef() { // destroy releases the Inode and releases the msrc reference taken. func (i *Inode) destroy() { - // FIXME: Context is not plumbed here. + // FIXME(b/38173783): Context is not plumbed here. ctx := context.Background() if err := i.WriteOut(ctx); err != nil { - // FIXME: Mark as warning again once noatime is + // FIXME(b/65209558): Mark as warning again once noatime is // properly supported. log.Debugf("Inode %+v, failed to sync all metadata: %v", i.StableAttr, err) } @@ -359,7 +359,7 @@ func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) { // AddLink calls i.InodeOperations.AddLink. func (i *Inode) AddLink() { if i.overlay != nil { - // FIXME: Remove this from InodeOperations altogether. + // FIXME(b/63117438): Remove this from InodeOperations altogether. // // This interface is only used by ramfs to update metadata of // children. These filesystems should _never_ have overlay diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index ceacc7659..ff8b75f31 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -118,7 +118,7 @@ type InodeOperations interface { // // The caller must ensure that this operation is permitted. // - // TODO: merge Remove and RemoveDirectory, Remove + // TODO(b/67778723): merge Remove and RemoveDirectory, Remove // just needs a type flag. 
Remove(ctx context.Context, dir *Inode, name string) error diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 254646176..bda3e1861 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -142,7 +142,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } else { // If we have something from the upper, we can only use it if the types // match. - // NOTE: Allow SpecialDirectories and Directories to merge. + // NOTE(b/112312863): Allow SpecialDirectories and Directories to merge. // This is needed to allow submounts in /proc and /sys. if upperInode.StableAttr.Type == child.Inode.StableAttr.Type || (IsDir(upperInode.StableAttr) && IsDir(child.Inode.StableAttr)) { @@ -226,7 +226,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st return nil, err } - // NOTE: Replace the Dirent with a transient Dirent, since + // NOTE(b/71766861): Replace the Dirent with a transient Dirent, since // we are about to create the real Dirent: an overlay Dirent. // // This ensures the *fs.File returned from overlayCreate is in the same @@ -338,7 +338,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // directory will appear empty in the upper fs, which will then // allow the rename to proceed when it should return ENOTEMPTY. // - // NOTE: Ideally, we'd just pass in the replaced + // NOTE(b/111808347): Ideally, we'd just pass in the replaced // Dirent from Rename, but we must drop the reference on // replaced before we make the rename call, so Rename can't // pass the Dirent to the Inode without significantly diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 1e245ae5f..4d1693204 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -42,7 +42,7 @@ type DirentOperations interface { // MountSourceOperations contains filesystem specific operations. type MountSourceOperations interface { - // TODO: Add: + // TODO(b/67778729): Add: // BlockSize() int64 // FS() Filesystem @@ -101,7 +101,7 @@ func (i InodeMappings) String() string { // amalgamation implies that a mount source cannot be shared by multiple mounts // (e.g. cannot be mounted at different locations). // -// TODO: Move mount-specific information out of MountSource. +// TODO(b/63601033): Move mount-specific information out of MountSource. // // +stateify savable type MountSource struct { diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 269d6b9da..d7605b2c9 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -33,7 +33,7 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { } // TestMountSourceOnlyCachedOnce tests that a Dirent that is mounted over only ends -// up in a single Dirent Cache. NOTE: Having a dirent in multiple +// up in a single Dirent Cache. NOTE(b/63848693): Having a dirent in multiple // caches causes major consistency issues. func TestMountSourceOnlyCachedOnce(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 3cc5f197c..5d4ec6c7b 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -91,7 +91,7 @@ CPU.IO utilization in last 10 minutes | Always zero Num currently running processes | Always zero Total num processes | Always zero -TODO: Populate the columns with accurate statistics. +TODO(b/62345059): Populate the columns with accurate statistics. 
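The loadavg table above lists which /proc/loadavg columns the sentry reports (currently all zero). A small sketch of emitting a line in that six-column layout with placeholder values; the formatting verbs are an assumption for the example, not necessarily the sentry's exact output.

    package main

    import (
    	"bytes"
    	"fmt"
    )

    // writeLoadAvg emits the layout documented above: columns 1-3 are the
    // 1/5/10-minute CPU and IO utilization, columns 4-5 are running/total
    // processes, and column 6 is the last PID used. All values are placeholders.
    func writeLoadAvg(buf *bytes.Buffer, lastPID int) {
    	fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.0, 0.0, 0.0, 0, 0, lastPID)
    }

    func main() {
    	var buf bytes.Buffer
    	writeLoadAvg(&buf, 0)
    	fmt.Print(buf.String()) // 0.00 0.00 0.00 0/0 0
    }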
### meminfo @@ -128,12 +128,12 @@ Field name | Notes Buffers | Always zero, no block devices SwapCache | Always zero, no swap Inactive(anon) | Always zero, see SwapCache -Unevictable | Always zero TODO -Mlocked | Always zero TODO +Unevictable | Always zero TODO(b/31823263) +Mlocked | Always zero TODO(b/31823263) SwapTotal | Always zero, no swap SwapFree | Always zero, no swap -Dirty | Always zero TODO -Writeback | Always zero TODO +Dirty | Always zero TODO(b/31823263) +Writeback | Always zero TODO(b/31823263) MemAvailable | Uses the same value as MemFree since there is no swap. Slab | Missing SReclaimable | Missing @@ -185,7 +185,7 @@ softirq 0 0 0 0 0 0 0 0 0 0 0 All fields except for `btime` are always zero. -TODO: Populate with accurate fields. +TODO(b/37226836): Populate with accurate fields. ### sys diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 25da06f5d..f2329e623 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -258,7 +258,7 @@ func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { // Lookup loads an fd in /proc/TID/fdinfo into a Dirent. func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { inode, err := walkDescriptors(fdid.t, p, func(file *fs.File, fdFlags kernel.FDFlags) *fs.Inode { - // TODO: Using a static inode here means that the + // TODO(b/121266871): Using a static inode here means that the // data can be out-of-date if, for instance, the flags on the // FD change before we read this file. We should switch to // generating the data on Read(). Also, we should include pos, diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 78f3a1dc0..3ee0e570a 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -40,7 +40,7 @@ func (d *loadavgData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) var buf bytes.Buffer - // TODO: Include real data in fields. + // TODO(b/62345059): Include real data in fields. // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. // Column 4-5: currently running processes and the total number of processes. // Column 6: the last process ID used. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 620e93ce3..75cbf3e77 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -58,7 +58,7 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "MemTotal: %8d kB\n", totalSize/1024) memFree := (totalSize - totalUsage) / 1024 // We use MemFree as MemAvailable because we don't swap. - // TODO: When reclaim is implemented the value of MemAvailable + // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. 
fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree) fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree) @@ -72,8 +72,8 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "Inactive(anon): 0 kB\n") fmt.Fprintf(&buf, "Active(file): %8d kB\n", activeFile/1024) fmt.Fprintf(&buf, "Inactive(file): %8d kB\n", inactiveFile/1024) - fmt.Fprintf(&buf, "Unevictable: 0 kB\n") // TODO - fmt.Fprintf(&buf, "Mlocked: 0 kB\n") // TODO + fmt.Fprintf(&buf, "Unevictable: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(&buf, "Mlocked: 0 kB\n") // TODO(b/31823263) fmt.Fprintf(&buf, "SwapTotal: 0 kB\n") fmt.Fprintf(&buf, "SwapFree: 0 kB\n") fmt.Fprintf(&buf, "Dirty: 0 kB\n") diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 1e62af8c6..fe62b167b 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -114,7 +114,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se // (4) Root: the pathname of the directory in the filesystem // which forms the root of this mount. // - // NOTE: This will always be "/" until we implement + // NOTE(b/78135857): This will always be "/" until we implement // bind mounts. fmt.Fprintf(&buf, "/ ") diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 55a958f9e..d24b2d370 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -154,7 +154,7 @@ func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se contents[1] = " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n" for _, i := range interfaces { - // TODO: Collect stats from each inet.Stack + // TODO(b/71872867): Collect stats from each inet.Stack // implementation (hostinet, epsocket, and rpcinet). // Implements the same format as diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index f2bbef375..18bd8e9b6 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -83,7 +83,7 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([] var buf bytes.Buffer - // TODO: We currently export only zero CPU stats. We could + // TODO(b/37226836): We currently export only zero CPU stats. We could // at least provide some aggregate stats. var cpu cpuStats fmt.Fprintf(&buf, "cpu %s\n", cpu) @@ -100,7 +100,7 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([] const numInterrupts = 256 // The Kernel doesn't handle real interrupts, so report all zeroes. - // TODO: We could count page faults as #PF. + // TODO(b/37226836): We could count page faults as #PF. fmt.Fprintf(&buf, "intr 0") // total for i := 0; i < numInterrupts; i++ { fmt.Fprintf(&buf, " 0") @@ -108,22 +108,22 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([] fmt.Fprintf(&buf, "\n") // Total number of context switches. - // TODO: Count this. + // TODO(b/37226836): Count this. fmt.Fprintf(&buf, "ctxt 0\n") // CLOCK_REALTIME timestamp from boot, in seconds. fmt.Fprintf(&buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) // Total number of clones. - // TODO: Count this. + // TODO(b/37226836): Count this. fmt.Fprintf(&buf, "processes 0\n") // Number of runnable tasks. - // TODO: Count this. + // TODO(b/37226836): Count this. fmt.Fprintf(&buf, "procs_running 0\n") // Number of tasks waiting on IO. - // TODO: Count this. + // TODO(b/37226836): Count this. 
fmt.Fprintf(&buf, "procs_blocked 0\n") // Number of each softirq handled. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 728a46a74..0ce77f04f 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -39,7 +39,7 @@ const ( // tcpMemInode is used to read/write the size of netstack tcp buffers. // -// TODO: If we have multiple proc mounts, concurrent writes can +// TODO(b/121381035): If we have multiple proc mounts, concurrent writes can // leave netstack and the proc files in an inconsistent state. Since we set the // buffer size from these proc files on restore, we may also race and end up in // an inconsistent state on restore. diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 0edcdfce2..9f65a8337 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -77,7 +77,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "fd": newFdDir(t, msrc), "fdinfo": newFdInfoDir(t, msrc), "gid_map": newGIDMap(t, msrc), - // FIXME: create the correct io file for threads. + // FIXME(b/123511468): create the correct io file for threads. "io": newIO(t, msrc), "maps": newMaps(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), @@ -93,7 +93,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace contents["task"] = newSubtasks(t, msrc, pidns) } - // TODO: Set EUID/EGID based on dumpability. + // TODO(b/31916171): Set EUID/EGID based on dumpability. d := &taskDir{ Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), t: t, @@ -245,7 +245,7 @@ func (e *exe) executable() (d *fs.Dirent, err error) { e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - // TODO: Check shouldn't allow Readlink once the + // TODO(b/34851096): Check shouldn't allow Readlink once the // Task is zombied. err = syserror.EACCES return @@ -297,7 +297,7 @@ type namespaceSymlink struct { } func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { - // TODO: Namespace symlinks should contain the namespace name and the + // TODO(rahat): Namespace symlinks should contain the namespace name and the // inode number for the namespace instance, so for example user:[123456]. We // currently fake the inode number by sticking the symlink inode in its // place. diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index b6d49d5e9..58e0c793c 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -65,7 +65,7 @@ func (v *versionData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) // Since we don't really want to expose build information to // applications, those fields are omitted. // - // FIXME: Using Version from the init task SyscallTable + // FIXME(mpratt): Using Version from the init task SyscallTable // disregards the different version a task may have (e.g., in a uts // namespace). 
ver := init.Leader().SyscallTable().Version diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 159fd2981..c0400b67d 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -358,7 +358,7 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewDir(ctx, dir, perms) }) - // TODO: Support updating status times, as those should be + // TODO(nlacasse): Support updating status times, as those should be // updated by links. return err } diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index d0c93028f..8e44421b6 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -34,7 +34,7 @@ const ( // GID for the root directory. rootGIDKey = "gid" - // TODO: support a tmpfs size limit. + // TODO(edahlgren/mpratt): support a tmpfs size limit. // size = "size" // Permissions that exceed modeMask will be rejected. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 7c80d711b..4450e1363 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -309,7 +309,7 @@ func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst userm // common: getting a return value of 0 from a read syscall is the only way // to detect EOF. // - // TODO: Separate out f.attr.Size and use atomics instead of + // TODO(jamieliu): Separate out f.attr.Size and use atomics instead of // f.dataMu. f.dataMu.RLock() size := f.attr.Size diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 555692505..5bb4922cb 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -32,7 +32,7 @@ import ( var fsInfo = fs.Info{ Type: linux.TMPFS_MAGIC, - // TODO: allow configuring a tmpfs size and enforce it. + // TODO(b/29637826): allow configuring a tmpfs size and enforce it. TotalBlocks: 0, FreeBlocks: 0, } diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 33b4c6438..f8713471a 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -66,7 +66,7 @@ type dirInodeOperations struct { // msrc is the super block this directory is on. // - // TODO: Plumb this through instead of storing it here. + // TODO(chrisko): Plumb this through instead of storing it here. msrc *fs.MountSource // mu protects the fields below. @@ -89,7 +89,7 @@ type dirInodeOperations struct { // next is the next pty index to use. // - // TODO: reuse indices when ptys are closed. + // TODO(b/29356795): reuse indices when ptys are closed. next uint32 } @@ -118,7 +118,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { // N.B. Linux always uses inode id 1 for the directory. See // fs/devpts/inode.c:devpts_fill_super. // - // TODO: Since ptsDevice must be shared between + // TODO(b/75267214): Since ptsDevice must be shared between // different mounts, we must not assign fixed numbers. InodeID: ptsDevice.NextIno(), BlockSize: usermem.PageSize, diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 43e0e2a04..a53448c47 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -43,7 +43,7 @@ func (*filesystem) Name() string { // AllowUserMount allows users to mount(2) this file system. func (*filesystem) AllowUserMount() bool { - // TODO: Users may mount this once the terminals are in a + // TODO(b/29356795): Users may mount this once the terminals are in a // usable state. 
return false } diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 7c256abb0..e2686a074 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -51,7 +51,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn // N.B. Linux always uses inode id 2 for ptmx. See // fs/devpts/inode.c:mknod_ptmx. // - // TODO: Since ptsDevice must be shared between + // TODO(b/75267214): Since ptsDevice must be shared between // different mounts, we must not assign fixed numbers. InodeID: ptsDevice.NextIno(), Type: fs.CharacterDevice, @@ -157,7 +157,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args a // of the slave end. return mf.t.ld.setTermios(ctx, io, args) case linux.TCSETSW: - // TODO: This should drain the output queue first. + // TODO(b/29356795): This should drain the output queue first. return mf.t.ld.setTermios(ctx, io, args) case linux.TIOCGPTN: _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{ @@ -165,7 +165,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args a }) return 0, err case linux.TIOCSPTLCK: - // TODO: Implement pty locking. For now just pretend we do. + // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: return 0, mf.t.ld.windowSize(ctx, io, args) diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index e8368bcdd..ed080ca0f 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -56,7 +56,7 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne // N.B. Linux always uses inode id = tty index + 3. See // fs/devpts/inode.c:devpts_pty_new. // - // TODO: Since ptsDevice must be shared between + // TODO(b/75267214): Since ptsDevice must be shared between // different mounts, we must not assign fixed numbers. InodeID: ptsDevice.NextIno(), Type: fs.CharacterDevice, @@ -137,7 +137,7 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar case linux.TCSETS: return sf.si.t.ld.setTermios(ctx, io, args) case linux.TCSETSW: - // TODO: This should drain the output queue first. + // TODO(b/29356795): This should drain the output queue first. return sf.si.t.ld.setTermios(ctx, io, args) case linux.TIOCGPTN: _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{ @@ -151,7 +151,7 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - // TODO: Implement once we have support for job + // TODO(b/129283598): Implement once we have support for job // control. return 0, nil default: diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index a843b9aab..2055da196 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -125,7 +125,7 @@ func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *T creds.EffectiveCaps = capabilities.EffectiveCaps creds.BoundingCaps = capabilities.BoundingCaps creds.InheritableCaps = capabilities.InheritableCaps - // TODO: Support ambient capabilities. + // TODO(nlacasse): Support ambient capabilities. 
} else { // If no capabilities are specified, grant capabilities consistent with // setresuid + setresgid from NewRootCredentials to the given uid and diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 30957bb9a..159940a69 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -49,7 +49,7 @@ type UserNamespace struct { gidMapFromParent idMapSet gidMapToParent idMapSet - // TODO: Support disabling setgroups(2). + // TODO(b/27454212): Support disabling setgroups(2). } // NewRootUserNamespace returns a UserNamespace that is appropriate for a diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index 373e11772..deff6def9 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -30,7 +30,7 @@ const ( // rtSignalCap is the maximum number of instances of a given realtime // signal that may be pending. // - // TODO: In Linux, the minimum signal queue size is + // TODO(igudger): In Linux, the minimum signal queue size is // RLIMIT_SIGPENDING, which is by default max_threads/2. rtSignalCap = 32 ) diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 8d78b2fb3..15f2e2964 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -162,7 +162,7 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { return false } - // TODO: dumpability check + // TODO(b/31916171): dumpability check if callerCreds.UserNamespace != targetCreds.UserNamespace { return false } @@ -396,7 +396,7 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if target.stop == (*groupStop)(nil) { target.trapStopPending = true target.endInternalStopLocked() - // TODO: Linux blocks ptrace_attach() until the task has + // TODO(jamieliu): Linux blocks ptrace_attach() until the task has // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. } target.tg.signalHandlers.mu.Unlock() diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 0a954bc16..6d3314e81 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -66,7 +66,7 @@ func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error { if rscr.CriticalSection.Contains(rscr.Restart) { return syserror.EINVAL } - // TODO: check that rscr.CriticalSection and rscr.Restart are in + // TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in // the application address range, for consistency with Linux t.tg.rscr.Store(&rscr) return nil diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go index 69aee9127..41ac1067d 100644 --- a/pkg/sentry/kernel/sched/cpuset.go +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -29,7 +29,7 @@ type CPUSet []byte // CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. func CPUSetSize(num uint) uint { - // NOTE: Applications may expect that the size of a CPUSet in + // NOTE(b/68859821): Applications may expect that the size of a CPUSet in // bytes is always a multiple of sizeof(unsigned long), since this is true // in Linux. Thus we always round up. 
bytes := (num + bitsPerByte - 1) / bitsPerByte diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 29a2eb804..2b7c1a9bc 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred return syserror.ERANGE } - // TODO: Clear undo entries in all processes + // TODO(b/29354920): Clear undo entries in all processes sem.value = val sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) @@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for i, val := range vals { sem := &s.sems[i] - // TODO: Clear undo entries in all processes + // TODO(b/29354920): Clear undo entries in all processes sem.value = int16(val) sem.pid = pid sem.wakeWaiters() @@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } // All operations succeeded, apply them. - // TODO: handle undo operations. + // TODO(b/29354920): handle undo operations. for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 349f2a26e..d4812a065 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -427,7 +427,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() - // TODO: RemoveMapping may be called during task exit, when ctx + // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx // is context.Background. Gracefully handle missing clocks. Failing to // update the detach time in these cases is ok, since no one can observe the // omission. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 7eb99718d..293b21249 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -165,7 +165,7 @@ type Stracer interface { // // The returned private data is passed to SyscallExit. // - // TODO: remove kernel imports from the strace + // TODO(gvisor.dev/issue/155): remove kernel imports from the strace // package so that the type can be used directly. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 1b4d4cf2f..ac38dd157 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -60,7 +60,7 @@ func (tc *TaskContext) release() { // Nil out pointers so that if the task is saved after release, it doesn't // follow the pointers to possibly now-invalid objects. if tc.MemoryManager != nil { - // TODO + // TODO(b/38173783) tc.MemoryManager.DecUsers(context.Background()) tc.MemoryManager = nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 9fca90a1c..b49f902a5 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -208,7 +208,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tc = *r.tc t.mu.Unlock() t.unstopVforkParent() - // NOTE: All locks must be dropped prior to calling Activate. + // NOTE(b/30316266): All locks must be dropped prior to calling Activate. 
t.MemoryManager().Activate() t.ptraceExec(oldTID) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 1a0734ab6..a07956208 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -339,7 +339,7 @@ func (t *Task) exitChildren() { }, true /* group */) other.signalHandlers.mu.Unlock() } - // TODO: The init process waits for all processes in the + // TODO(b/37722272): The init process waits for all processes in the // namespace to exit before completing its own exit // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all // other tasks in the namespace are dead, except possibly for this @@ -692,7 +692,7 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.Si info.Code = arch.CLD_EXITED info.SetStatus(int32(t.exitStatus.Code)) } - // TODO: Set utime, stime. + // TODO(b/72102453): Set utime, stime. return info } diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index e105eba13..6c9608f8d 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -421,7 +421,7 @@ func (t *Task) SetKeepCaps(k bool) { // updateCredsForExec updates t.creds to reflect an execve(). // -// NOTE: We currently do not implement privileged executables +// NOTE(b/30815691): We currently do not implement privileged executables // (set-user/group-ID bits and file capabilities). This allows us to make a lot // of simplifying assumptions: // diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 6b5fe7165..7115aa967 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -110,7 +110,7 @@ func (t *Task) doStop() { return } t.Deactivate() - // NOTE: t.Activate() must be called without any locks held, so + // NOTE(b/30316266): t.Activate() must be called without any locks held, so // this defer must precede the defer for unlocking the signal mutex. defer t.Activate() t.accountTaskGoroutineEnter(TaskGoroutineStopped) diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 3a8e61900..7f2e0df72 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -509,7 +509,7 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { if t.stop != nil { return false } - // - TODO: No special case for when t is also the sending task, + // - TODO(b/38173783): No special case for when t is also the sending task, // because the identity of the sender is unknown. // - Do not choose tasks that have already been interrupted, as they may be // busy handling another signal. @@ -895,7 +895,7 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { sigchld.SetPid(int32(t.tg.pidns.tids[target])) sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) sigchld.SetStatus(status) - // TODO: Set utime, stime. + // TODO(b/72102453): Set utime, stime. t.sendSignalLocked(sigchld, true /* group */) } } diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 36846484c..1302cadc1 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -69,7 +69,7 @@ import ( // A TaskStop is a condition visible to the task control flow graph that // prevents a task goroutine from running or exiting, i.e. an internal stop. // -// NOTE: Most TaskStops don't contain any data; they're +// NOTE(b/30793614): Most TaskStops don't contain any data; they're // distinguished by their type. 
The obvious way to implement such a TaskStop // is: // diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 80ad59dde..79051befa 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -70,7 +70,7 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m defer d.DecRef() perms := fs.PermMask{ - // TODO: Linux requires only execute + // TODO(gvisor.dev/issue/160): Linux requires only execute // permission, not read. However, our backing filesystems may // prevent us from reading the file without read permission. // diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 18b7e90d8..8c196df84 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -194,7 +194,7 @@ func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) // VDSO describes a VDSO. // -// NOTE: to support multiple architectures or operating systems, this +// NOTE(mpratt): to support multiple architectures or operating systems, this // would need to contain a VDSO for each. // // +stateify savable @@ -262,7 +262,7 @@ func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { return &VDSO{ ParamPage: mm.NewSpecialMappable("[vvar]", mfp, paramPage), - // TODO: Don't advertise the VDSO, as + // TODO(gvisor.dev/issue/157): Don't advertise the VDSO, as // some applications may not be able to handle multiple [vdso] // hints. vdso: mm.NewSpecialMappable("", mfp, vdso), @@ -279,7 +279,7 @@ func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { // kernel simply directly maps the entire file into process memory, with very // little real ELF parsing. // -// NOTE: This means that userspace can, and unfortunately does, +// NOTE(b/25323870): This means that userspace can, and unfortunately does, // depend on parts of the ELF that would normally not be mapped. To maintain // compatibility with such binaries, we load the VDSO much like Linux. // diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 1ef1f0dd8..3f6f7ebd0 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -356,6 +356,6 @@ type MMapOpts struct { // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // - // TODO: Replace entirely with MappingIdentity? + // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string } diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index f7ff06de0..7075792e0 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -331,7 +331,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint Length: aioRingBufferSize, MappingIdentity: m, Mappable: m, - // TODO: Linux does "do_mmap_pgoff(..., PROT_READ | + // TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ | // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this // mapping read-only? 
Perms: usermem.Read, diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 0c4b8895d..7cdbf6e25 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -69,7 +69,7 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile start = *handle.(*usermem.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME: If we use a usermem.Addr for the handle, we get + // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get // "panic: autosave error: type usermem.Addr is not registered". vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ @@ -88,7 +88,7 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile // // Artifically adjust the seqfile handle so we only output vsyscall entry once. if start != vsyscallEnd { - // FIXME: Can't get a pointer to constant vsyscallEnd. + // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallMapsEntry), @@ -134,7 +134,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI if vma.hint != "" { s = vma.hint } else if vma.id != nil { - // FIXME: We are holding mm.mappingMu here, which is + // FIXME(jamieliu): We are holding mm.mappingMu here, which is // consistent with Linux's holding mmap_sem in // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). // However, it's not clear that fs.File.MappedName() is actually @@ -162,7 +162,7 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil start = *handle.(*usermem.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME: If we use a usermem.Addr for the handle, we get + // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get // "panic: autosave error: type usermem.Addr is not registered". vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ @@ -174,7 +174,7 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil // We always emulate vsyscall, so advertise it here. See // ReadMapsSeqFileData for additional commentary. if start != vsyscallEnd { - // FIXME: Can't get a pointer to constant vsyscallEnd. + // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallSmapsEntry), diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index cfbf7a104..3b5161998 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -136,7 +136,7 @@ func (m *SpecialMappable) Length() uint64 { // NewSharedAnonMappable returns a SpecialMappable that implements the // semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. // -// TODO: The use of SpecialMappable is a lazy code reuse hack. Linux +// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux // uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should // do the same to get non-zero device and inode IDs. 
func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index cc7eb76d2..7b675b9b5 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -137,7 +137,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return 0, err } - // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new + // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => @@ -148,7 +148,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme mm.populateVMAAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: - // NOTE: Get pmas and map eagerly in the hope + // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope // that doing so will save on future page faults. We only do this for // anonymous mappings, since otherwise the cost of // memmap.Mappable.Translate is unknown; and only for small mappings, @@ -698,7 +698,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad return mm.brk.End, syserror.EINVAL } - // TODO: This enforces RLIMIT_DATA, but is + // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is // slightly more permissive than the usual data limit. In particular, // this only limits the size of the heap; a true RLIMIT_DATA limits the // size of heap + data + bss. The segment sizes need to be plumbed from diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index e9c9a80ea..931995254 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -274,7 +274,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End(). vma := vseg.ValuePtr() if addr < vseg.Start() { - // TODO: Implement vma.growsDown here. + // TODO(jamieliu): Implement vma.growsDown here. return vbegin, vgap, syserror.EFAULT } diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index c0a0af92d..d0f6bb225 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -62,7 +62,7 @@ func updateSystemValues(fd int) error { // Calculate whether guestPCID is supported. // - // FIXME: These should go through the much more pleasant + // FIXME(ascannell): These should go through the much more pleasant // cpuid package interfaces, once a way to accept raw kvm CPUID entries // is plumbed (or some rough equivalent). for i := 0; i < int(cpuidSupported.nr); i++ { diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index d1c9458ea..0e48417b9 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -181,7 +181,7 @@ var ( // this signal both to Contexts and to the sentry itself, under the assumption // that they originate from races with Context.Interrupt(). // -// NOTE: The Go runtime only guarantees that a small subset +// NOTE(b/23420492): The Go runtime only guarantees that a small subset // of signals will be always be unblocked on all threads, one of which // is SIGCHLD. 
const SignalInterrupt = linux.SIGCHLD diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 82f125073..2a5d699ec 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -79,7 +79,7 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) // Before creating a new thread, see if we can find a thread // whose system tid has disappeared. // - // TODO: Other parts of this package depend on + // TODO(b/77216482): Other parts of this package depend on // threads never exiting. for origTID, t := range tp.threads { // Signal zero is an easy existence check. diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 7c88010d8..4c6daec22 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -116,7 +116,7 @@ const ( // // Note that sign-extension semantics apply to the highest order bit. // -// FIXME: This should use the cpuid passed to Init. +// FIXME(b/69382326): This should use the cpuid passed to Init. func VirtualAddressBits() uint32 { ax, _, _, _ := cpuid.HostID(0x80000008, 0) return (ax >> 8) & 0xff @@ -124,7 +124,7 @@ func VirtualAddressBits() uint32 { // PhysicalAddressBits returns the number of bits available for physical addresses. // -// FIXME: This should use the cpuid passed to Init. +// FIXME(b/69382326): This should use the cpuid passed to Init. func PhysicalAddressBits() uint32 { ax, _, _, _ := cpuid.HostID(0x80000008, 0) return ax & 0xff diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 6b5d5f993..571245ce5 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -86,7 +86,7 @@ func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, // // Otherwise ignore the signal. // - // TODO: Drop in Go 1.12, which uses tgkill + // TODO(b/114489875): Drop in Go 1.12, which uses tgkill // in runtime.raise. switch signal { case linux.SIGHUP, linux.SIGINT, linux.SIGTERM: diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index 5913d47a8..db6e71487 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -23,7 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) -// TODO: Move to pkg/abi/linux along with definitions in +// TODO(b/34161764): Move to pkg/abi/linux along with definitions in // pkg/sentry/arch. type sigaction struct { handler uintptr diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 23138d874..768fa0dfa 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -608,7 +608,7 @@ func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) { - // TODO: Unlike other socket options, SO_TIMESTAMP is + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for epsocket.SocketOperations rather than // commonEndpoint. 
commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need @@ -658,7 +658,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) { - // TODO: Stop rejecting short optLen values in getsockopt. + // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_TYPE: if outLen < sizeOfInt32 { @@ -789,7 +789,7 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return linux.Linger{}, nil case linux.SO_SNDTIMEO: - // TODO: Linux allows shorter lengths for partial results. + // TODO(igudger): Linux allows shorter lengths for partial results. if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } @@ -797,7 +797,7 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return linux.NsecToTimeval(s.SendTimeout()), nil case linux.SO_RCVTIMEO: - // TODO: Linux allows shorter lengths for partial results. + // TODO(igudger): Linux allows shorter lengths for partial results. if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } @@ -894,7 +894,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return nil, syserr.TranslateNetstackError(err) } - // TODO: Translate fields once they are added to + // TODO(b/64800844): Translate fields once they are added to // tcpip.TCPInfoOption. info := linux.TCPInfo{} @@ -995,7 +995,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { - // TODO: Unlike other socket options, SO_TIMESTAMP is + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for epsocket.SocketOperations rather than // commonEndpoint. commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need @@ -1338,7 +1338,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), - // TODO: Change AddMembership to use the standard + // TODO(igudger): Change AddMembership to use the standard // any address representation. InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), @@ -1352,7 +1352,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), - // TODO: Change DropMembership to use the standard + // TODO(igudger): Change DropMembership to use the standard // any address representation. InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), @@ -1380,7 +1380,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s )) case linux.MCAST_JOIN_GROUP: - // FIXME: Implement MCAST_JOIN_GROUP. + // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. 
t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrInvalidArgument @@ -1695,7 +1695,7 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq // nonBlockingRead issues a non-blocking read. // -// TODO: Support timestamps for stream sockets. +// TODO(b/78348848): Support timestamps for stream sockets. func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() @@ -1762,7 +1762,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe dst = dst.DropFirst(n) num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) { n, _, err := s.Endpoint.Peek(dsts) - // TODO: Handle peek timestamp. + // TODO(b/78348848): Handle peek timestamp. if err != nil { return int64(n), syserr.TranslateNetstackError(err).ToError() } @@ -1963,7 +1963,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // SIOCGSTAMP is implemented by epsocket rather than all commonEndpoint // sockets. - // TODO: Add a commonEndpoint method to support SIOCGSTAMP. + // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. if int(args[1].Int()) == syscall.SIOCGSTAMP { s.readMu.Lock() defer s.readMu.Unlock() @@ -2153,19 +2153,19 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe case syscall.SIOCGIFMAP: // Gets the hardware parameters of the device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFTXQLEN: // Gets the transmit queue length of the device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFDSTADDR: // Gets the destination address of a point-to-point device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFBRDADDR: // Gets the broadcast address of a device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFNETMASK: // Gets the network mask of a device. diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/epsocket/save_restore.go index 34d9a7cf0..f19afb6c0 100644 --- a/pkg/sentry/socket/epsocket/save_restore.go +++ b/pkg/sentry/socket/epsocket/save_restore.go @@ -20,7 +20,7 @@ import ( // afterLoad is invoked by stateify. func (s *Stack) afterLoad() { - s.Stack = stack.StackFromEnv // FIXME + s.Stack = stack.StackFromEnv // FIXME(b/36201077) if s.Stack == nil { panic("can't restore without netstack/tcpip/stack.Stack") } diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index c0081c819..37c48f4bc 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -77,7 +77,7 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { Family: family, PrefixLen: uint8(len(a.Address) * 8), Addr: []byte(a.Address), - // TODO: Other fields. + // TODO(b/68878065): Other fields. 
}) } nicAddrs[int32(id)] = addrs diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c4848b313..49349074f 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -348,7 +348,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // - // FIXME: We can't support MSG_ERRQUEUE because it uses ancillary + // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary // messages that netstack/tcpip/transport/unix doesn't understand. Kill the // Socket interface's dependence on netstack. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 7e70b09b2..e414b829b 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -110,7 +110,7 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader m.PutAttr(linux.IFLA_ADDRESS, mac) m.PutAttr(linux.IFLA_BROADCAST, brd) - // TODO: There are many more attributes. + // TODO(b/68878065): There are many more attributes. } return nil @@ -122,7 +122,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. // - // TODO: Filter output by passed protocol family. + // TODO(b/68878065): Filter output by passed protocol family. // The RTM_GETADDR dump response is a set of RTM_NEWADDR messages each // containing an InterfaceAddrMessage followed by a set of netlink @@ -151,7 +151,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr)) - // TODO: There are many more attributes. + // TODO(b/68878065): There are many more attributes. } } @@ -175,7 +175,7 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH } } - // TODO: Only the dump variant of the types below are + // TODO(b/68878065): Only the dump variant of the types below are // supported. if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { return syserr.ErrNotSupported diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0fe9b39b6..a34f9d3ca 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -168,7 +168,7 @@ func (s *Socket) EventUnregister(e *waiter.Entry) { // Ioctl implements fs.FileOperations.Ioctl. func (s *Socket) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // TODO: no ioctls supported. + // TODO(b/68878065): no ioctls supported. return 0, syserror.ENOTTY } @@ -319,7 +319,7 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (in t.Kernel().EmitUnimplementedEvent(t) } } - // TODO: other sockopts are not supported. + // TODO(b/68878065): other sockopts are not supported. return nil, syserr.ErrProtocolNotAvailable } @@ -369,7 +369,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy } } - // TODO: other sockopts are not supported. + // TODO(b/68878065): other sockopts are not supported. 
return syserr.ErrProtocolNotAvailable } @@ -389,7 +389,7 @@ func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { sa := linux.SockAddrNetlink{ Family: linux.AF_NETLINK, - // TODO: Support non-kernel peers. For now the peer + // TODO(b/68878065): Support non-kernel peers. For now the peer // must be the kernel. PortID: 0, } @@ -540,7 +540,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error continue } - // TODO: ACKs not supported yet. + // TODO(b/68877377): ACKs not supported yet. if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { return syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index 9c749b888..64106c4b5 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -50,7 +50,7 @@ type RPCConnection struct { // NewRPCConnection initializes a RPC connection to a socket gofer. func NewRPCConnection(s *unet.Socket) *RPCConnection { conn := &RPCConnection{socket: s, requests: map[uint64]request{}} - go func() { // S/R-FIXME + go func() { // S/R-FIXME(b/77962828) var nums [16]byte for { for n := 0; n < len(nums); { diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index d9bda78b0..f06d12231 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -64,7 +64,7 @@ func NewRPCNotifier(cn *conn.RPCConnection) (*Notifier, error) { fdMap: make(map[uint32]*fdInfo), } - go w.waitAndNotify() // S/R-FIXME + go w.waitAndNotify() // S/R-FIXME(b/77962828) return w, nil } @@ -166,7 +166,7 @@ func (n *Notifier) waitAndNotify() error { res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollWait).EpollWait.Result if e, ok := res.(*pb.EpollWaitResponse_ErrorNumber); ok { err := syscall.Errno(e.ErrorNumber) - // NOTE: I don't think epoll_wait can return EAGAIN but I'm being + // NOTE(magi): I don't think epoll_wait can return EAGAIN but I'm being // conseratively careful here since exiting the notification thread // would be really bad. if err == syscall.EINTR || err == syscall.EAGAIN { diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 3418a6d75..cf8f69efb 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -288,7 +288,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if blocking && se == syserr.ErrTryAgain { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) - // FIXME: This waiter.EventHUp is a partial + // FIXME(b/119878986): This waiter.EventHUp is a partial // measure, need to figure out how to translate linux events to // internal events. s.EventRegister(&e, waiter.EventIn|waiter.EventHUp) @@ -370,7 +370,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // We save the shutdown state because of strange differences on linux // related to recvs on blocking vs. non-blocking sockets after a SHUT_RD. // We need to emulate that behavior on the blocking side. - // TODO: There is a possible race that can exist with loopback, + // TODO(b/120096741): There is a possible race that can exist with loopback, // where data could possibly be lost. 
s.setShutdownFlags(how) @@ -771,7 +771,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return 0, syserr.FromError(err) } - // TODO: this needs to change to map directly to a SendMsg syscall + // TODO(bgeffon): this needs to change to map directly to a SendMsg syscall // in the RPC. totalWritten := 0 n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto index c056e4c9d..9586f5923 100644 --- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -3,7 +3,7 @@ syntax = "proto3"; // package syscall_rpc is a set of networking related system calls that can be // forwarded to a socket gofer. // -// TODO: Document individual RPCs. +// TODO(b/77963526): Document individual RPCs. package syscall_rpc; message SendmsgRequest { diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index a6d870b44..434a200d9 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -722,7 +722,7 @@ func (s SyscallMap) Name(sysno uintptr) string { // N.B. This is not in an init function because we can't be sure all syscall // tables are registered with the kernel when init runs. // -// TODO: remove kernel package dependencies from this +// TODO(gvisor.dev/issue/155): remove kernel package dependencies from this // package and have the kernel package self-initialize all syscall tables. func Initialize() { for _, table := range kernel.SyscallTables() { diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 8759e5e32..304a12dde 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -89,7 +89,7 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // side is gone. The partial write is returned. EPIPE will be // returned on the next call. // - // TODO: In some cases SIGPIPE should + // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. 
return nil case syserror.ErrWouldBlock: diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index be793ca11..b9b4ccbd1 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -143,10 +143,10 @@ var AMD64 = &kernel.SyscallTable{ 65: Semop, 66: Semctl, 67: Shmdt, - // 68: @Syscall(Msgget), TODO - // 69: @Syscall(Msgsnd), TODO - // 70: @Syscall(Msgrcv), TODO - // 71: @Syscall(Msgctl), TODO + // 68: @Syscall(Msgget), TODO(b/29354921) + // 69: @Syscall(Msgsnd), TODO(b/29354921) + // 70: @Syscall(Msgrcv), TODO(b/29354921) + // 71: @Syscall(Msgctl), TODO(b/29354921) 72: Fcntl, 73: Flock, 74: Fsync, @@ -197,8 +197,8 @@ var AMD64 = &kernel.SyscallTable{ 119: Setresgid, 120: Getresgid, 121: Getpgid, - // 122: @Syscall(Setfsuid), TODO - // 123: @Syscall(Setfsgid), TODO + // 122: @Syscall(Setfsuid), TODO(b/112851702) + // 123: @Syscall(Setfsgid), TODO(b/112851702) 124: Getsid, 125: Capget, 126: Capset, @@ -217,7 +217,7 @@ var AMD64 = &kernel.SyscallTable{ 136: syscalls.ErrorWithEvent(syscall.ENOSYS), 137: Statfs, 138: Fstatfs, - // 139: @Syscall(Sysfs), TODO + // 139: @Syscall(Sysfs), TODO(gvisor.dev/issue/165) 140: Getpriority, 141: Setpriority, // @Syscall(SchedSetparam, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) @@ -291,7 +291,7 @@ var AMD64 = &kernel.SyscallTable{ // @Syscall(Security, note:Not implemented in Linux) 185: syscalls.Error(syscall.ENOSYS), 186: Gettid, - 187: nil, // @Syscall(Readahead), TODO + 187: nil, // @Syscall(Readahead), TODO(b/29351341) // @Syscall(Setxattr, returns:ENOTSUP, note:Requires filesystem support) 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), // @Syscall(Lsetxattr, returns:ENOTSUP, note:Requires filesystem support) @@ -342,7 +342,7 @@ var AMD64 = &kernel.SyscallTable{ 217: Getdents64, 218: SetTidAddress, 219: RestartSyscall, - // 220: @Syscall(Semtimedop), TODO + // 220: @Syscall(Semtimedop), TODO(b/29354920) 221: Fadvise64, 222: TimerCreate, 223: TimerSettime, @@ -360,16 +360,16 @@ var AMD64 = &kernel.SyscallTable{ 235: Utimes, // @Syscall(Vserver, note:Not implemented by Linux) 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux - // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO + // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO(b/117792295) 237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice 238: SetMempolicy, 239: GetMempolicy, - // 240: @Syscall(MqOpen), TODO - // 241: @Syscall(MqUnlink), TODO - // 242: @Syscall(MqTimedsend), TODO - // 243: @Syscall(MqTimedreceive), TODO - // 244: @Syscall(MqNotify), TODO - // 245: @Syscall(MqGetsetattr), TODO + // 240: @Syscall(MqOpen), TODO(b/29354921) + // 241: @Syscall(MqUnlink), TODO(b/29354921) + // 242: @Syscall(MqTimedsend), TODO(b/29354921) + // 243: @Syscall(MqTimedreceive), TODO(b/29354921) + // 244: @Syscall(MqNotify), TODO(b/29354921) + // 245: @Syscall(MqGetsetattr), TODO(b/29354921) 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot 247: Waitid, // @Syscall(AddKey, returns:EACCES, note:Not available to user) @@ -407,22 +407,22 @@ var AMD64 = &kernel.SyscallTable{ 273: syscalls.Error(syscall.ENOSYS), // @Syscall(GetRobustList, note:Obsolete) 274: syscalls.Error(syscall.ENOSYS), - // 275: @Syscall(Splice), TODO - // 276: @Syscall(Tee), TODO + // 
275: @Syscall(Splice), TODO(b/29354098) + // 276: @Syscall(Tee), TODO(b/29354098) 277: SyncFileRange, - // 278: @Syscall(Vmsplice), TODO + // 278: @Syscall(Vmsplice), TODO(b/29354098) // @Syscall(MovePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) 279: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice (mostly) 280: Utimensat, 281: EpollPwait, - // 282: @Syscall(Signalfd), TODO + // 282: @Syscall(Signalfd), TODO(b/19846426) 283: TimerfdCreate, 284: Eventfd, 285: Fallocate, 286: TimerfdSettime, 287: TimerfdGettime, 288: Accept4, - // 289: @Syscall(Signalfd4), TODO + // 289: @Syscall(Signalfd4), TODO(b/19846426) 290: Eventfd2, 291: EpollCreate1, 292: Dup3, @@ -447,17 +447,17 @@ var AMD64 = &kernel.SyscallTable{ 305: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time 306: Syncfs, 307: SendMMsg, - // 308: @Syscall(Setns), TODO + // 308: @Syscall(Setns), TODO(b/29354995) 309: Getcpu, - // 310: @Syscall(ProcessVmReadv), TODO may require cap_sys_ptrace - // 311: @Syscall(ProcessVmWritev), TODO may require cap_sys_ptrace + // 310: @Syscall(ProcessVmReadv), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace + // 311: @Syscall(ProcessVmWritev), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace // @Syscall(Kcmp, returns:EPERM or ENOSYS, note:Requires cap_sys_ptrace) 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // @Syscall(FinitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) 313: syscalls.CapError(linux.CAP_SYS_MODULE), - // 314: @Syscall(SchedSetattr), TODO, we have no scheduler - // 315: @Syscall(SchedGetattr), TODO, we have no scheduler - // 316: @Syscall(Renameat2), TODO + // 314: @Syscall(SchedSetattr), TODO(b/118902272), we have no scheduler + // 315: @Syscall(SchedGetattr), TODO(b/118902272), we have no scheduler + // 316: @Syscall(Renameat2), TODO(b/118902772) 317: Seccomp, 318: GetRandom, 319: MemfdCreate, @@ -465,9 +465,9 @@ var AMD64 = &kernel.SyscallTable{ 320: syscalls.CapError(linux.CAP_SYS_BOOT), // @Syscall(Bpf, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin for all commands - // 322: @Syscall(Execveat), TODO - // 323: @Syscall(Userfaultfd), TODO - // 324: @Syscall(Membarrier), TODO + // 322: @Syscall(Execveat), TODO(b/118901836) + // 323: @Syscall(Userfaultfd), TODO(b/118906345) + // 324: @Syscall(Membarrier), TODO(b/118904897) 325: Mlock2, // Syscalls after 325 are "backports" from versions of Linux after 4.4. // 326: @Syscall(CopyFileRange), diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 355071131..61c2647bf 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -120,7 +120,7 @@ func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Does not exist. return 0, nil, syserror.EINVAL } - // FIXME: Linux blocks until all AIO to the destroyed context is + // FIXME(fvoznika): Linux blocks until all AIO to the destroyed context is // done. 
return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 50151f7b6..967464c85 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -259,7 +259,7 @@ func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileM case linux.ModeCharacterDevice: fallthrough case linux.ModeBlockDevice: - // TODO: We don't support creating block or character + // TODO(b/72101894): We don't support creating block or character // devices at the moment. // // When we start supporting block and character devices, we'll @@ -1532,7 +1532,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { owner.GID = kgid } - // FIXME: This is racy; the inode's owner may have changed in + // FIXME(b/62949101): This is racy; the inode's owner may have changed in // the meantime. (Linux holds i_mutex while calling // fs/attr.c:notify_change() => inode_operations::setattr => // inode_change_ok().) diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8732861e0..805b251b1 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -185,7 +185,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: fallthrough case linux.MADV_DONTDUMP, linux.MADV_DODUMP: - // TODO: Core dumping isn't implemented, so these are + // TODO(b/72045799): Core dumping isn't implemented, so these are // no-ops. fallthrough case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: @@ -223,7 +223,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. nodeFlag := flags&linux.MPOL_F_NODE != 0 addrFlag := flags&linux.MPOL_F_ADDR != 0 - // TODO: Once sysfs is implemented, report a single numa node in + // TODO(rahat): Once sysfs is implemented, report a single numa node in // /sys/devices/system/node. if nodemask != 0 && maxnode < 1 { return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 8105e9b43..50c7d7a74 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -192,7 +192,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Preadv2 implements linux syscall preadv2(2). -// TODO: Implement RWF_HIPRI functionality. +// TODO(b/120162627): Implement RWF_HIPRI functionality. func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the syscall is // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 30ccc3f66..c8748958a 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -317,7 +317,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } if peerRequested { - // NOTE: Linux does not give you an error if it can't + // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. 
if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL { return 0, err @@ -735,7 +735,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i return 0, err } - // FIXME: Pretend we have an empty error queue. + // FIXME(b/63594852): Pretend we have an empty error queue. if flags&linux.MSG_ERRQUEUE != 0 { return 0, syscall.EAGAIN } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 61cafefb9..ddcb5b789 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -350,7 +350,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } si.SetPid(int32(wr.TID)) si.SetUid(int32(wr.UID)) - // TODO: convert kernel.ExitStatus to functions and make + // TODO(b/73541790): convert kernel.ExitStatus to functions and make // WaitResult.Status a linux.WaitStatus s := syscall.WaitStatus(wr.Status) switch { diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index a5ad7efb2..e405608c4 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -192,8 +192,8 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Pwritev2 implements linux syscall pwritev2(2). -// TODO: Implement RWF_HIPRI functionality. -// TODO: Implement O_SYNC and D_SYNC functionality. +// TODO(b/120162627): Implement RWF_HIPRI functionality. +// TODO(b/120161091): Implement O_SYNC and D_SYNC functionality. func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the syscall is // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index c8cf4eca4..a98bcd7de 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -37,7 +37,7 @@ var fallbackMetric = metric.MustCreateNewUint64Metric("/time/fallback", false /* // clock. type CalibratedClock struct { // mu protects the fields below. - // TODO: consider a sequence counter for read locking. + // TODO(mpratt): consider a sequence counter for read locking. mu sync.RWMutex // ref sample the reference clock that this clock is calibrated @@ -140,7 +140,7 @@ func (c *CalibratedClock) updateParams(actual Parameters) { // N.B. logErrorAdjustment will have already logged the error // at warning level. // - // TODO: We could allow Realtime clock jumps here. + // TODO(mpratt): We could allow Realtime clock jumps here. c.resetLocked("Extreme clock error.") return } @@ -229,7 +229,7 @@ func (c *CalibratedClock) GetTime() (int64, error) { // CalibratedClocks contains calibrated monotonic and realtime clocks. // -// TODO: We know that Linux runs the monotonic and realtime clocks at +// TODO(mpratt): We know that Linux runs the monotonic and realtime clocks at // the same rate, so rather than tracking both individually, we could do one // calibration for both clocks. type CalibratedClocks struct { diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index f3ad58454..8568b1193 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -43,7 +43,7 @@ const ( // These statements assume that the host clock does not change. Actual // error will depend upon host clock changes. 
// - // TODO: make error correction more robust to delayed + // TODO(b/68779214): make error correction more robust to delayed // updates. ApproxUpdateInterval = 1 * time.Second diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 99766a803..4c7d5014a 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -28,7 +28,7 @@ import ( // IO provides access to the contents of a virtual memory space. // -// FIXME: Implementations of IO cannot expect ctx to contain any +// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any // meaningful data. type IO interface { // CopyOut copies len(src) bytes from src to the memory mapped at addr. It @@ -85,7 +85,7 @@ type IO interface { // order. CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) - // TODO: The requirement that CopyOutFrom/CopyInTo call src/dst + // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst // at most once, which is unnecessary in most cases, forces implementations // to gather safemem.Blocks into a single slice to pass to src/dst. Add // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index c49b537a5..b4f1e3a4f 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -236,7 +236,7 @@ func (w *Watchdog) runTurn() { if !ok { // New stuck task detected. // - // TODO: Tasks blocked doing IO may be considered stuck in kernel. + // TODO(b/65849403): Tasks blocked doing IO may be considered stuck in kernel. tc = &offender{lastUpdateTime: lastUpdateTime} stuckTasks.Increment() newTaskFound = true diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index dad83e80c..232634dd4 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -49,7 +49,7 @@ func New(message string, linuxTranslation *linux.Errno) *Error { return err } - // TODO: Remove this. + // TODO(b/34162363): Remove this. errno := linuxTranslation.Number() if errno <= 0 || errno >= len(linuxBackwardsTranslations) { panic(fmt.Sprint("invalid errno: ", errno)) @@ -106,12 +106,12 @@ type linuxBackwardsTranslation struct { ok bool } -// TODO: Remove this. +// TODO(b/34162363): Remove this. var linuxBackwardsTranslations [maxErrno]linuxBackwardsTranslation // ToError translates an Error to a corresponding error value. // -// TODO: Remove this. +// TODO(b/34162363): Remove this. func (e *Error) ToError() error { if e == nil { return nil @@ -138,7 +138,7 @@ func (e *Error) ToLinux() *linux.Errno { return e.errno } -// TODO: Remove or replace most of these errors. +// TODO(b/34162363): Remove or replace most of these errors. // // Some of the errors should be replaced with package specific errors and // others should be removed entirely. @@ -278,7 +278,7 @@ var ( // FromError converts a generic error to an *Error. // -// TODO: Remove this function. +// TODO(b/34162363): Remove this function. func FromError(err error) *Error { if err == nil { return nil diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index ed9a4eee5..1c3acda4b 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -64,7 +64,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V } h := header.ICMPv4(v) - // TODO: Meaningfully handle all ICMP types. + // TODO(b/112892170): Meaningfully handle all ICMP types. 
switch h.Type() { case header.ICMPv4Echo: received.Echo.Increment() diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 3210e6fc7..be28be36d 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -73,7 +73,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V } h := header.ICMPv6(v) - // TODO: Meaningfully handle all ICMP types. + // TODO(b/112892170): Meaningfully handle all ICMP types. switch h.Type() { case header.ICMPv6PacketTooBig: received.PacketTooBig.Increment() @@ -247,7 +247,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack. DstAddr: r.RemoteAddress, }) - // TODO: count this in ICMP stats. + // TODO(stijlist): count this in ICMP stats. return linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber) } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 8b6c17a90..c18571b0f 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -176,7 +176,7 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN for e := list.Front(); e != nil; e = e.Next() { r := e.(*referencedNetworkEndpoint) - // TODO: allow broadcast address when SO_BROADCAST is set. + // TODO(crawshaw): allow broadcast address when SO_BROADCAST is set. switch r.ep.ID().LocalAddress { case header.IPv4Broadcast, header.IPv4Any: continue @@ -476,7 +476,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr n.mu.RUnlock() if ok && ref.tryIncRef() { r.RemoteAddress = src - // TODO: Update the source NIC as well. + // TODO(b/123449044): Update the source NIC as well. ref.ep.HandlePacket(&r, vv) ref.decRef() } else { @@ -485,7 +485,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr hdr := buffer.NewPrependableFromView(vv.First()) vv.RemoveFirst() - // TODO: use route.WritePacket. + // TODO(b/128629022): use route.WritePacket. if err := n.linkEP.WritePacket(&r, nil /* gso */, hdr, vv, protocol); err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() } else { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 8f7b6f781..cb9ffe9c2 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -476,7 +476,7 @@ func (s *Stack) Stats() tcpip.Stats { // SetForwarding enables or disables the packet forwarding between NICs. func (s *Stack) SetForwarding(enable bool) { - // TODO: Expose via /proc/sys/net/ipv4/ip_forward. + // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. s.mu.Lock() s.forwarding = enable s.mu.Unlock() @@ -484,7 +484,7 @@ func (s *Stack) SetForwarding(enable bool) { // Forwarding returns if the packet forwarding between NICs is enabled. func (s *Stack) Forwarding() bool { - // TODO: Expose via /proc/sys/net/ipv4/ip_forward. + // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. s.mu.RLock() defer s.mu.RUnlock() return s.forwarding diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index f2c6c9a8d..3d7e4b719 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -15,5 +15,5 @@ package stack // StackFromEnv is the global stack created in restore run. 
-// FIXME +// FIXME(b/36201077) var StackFromEnv *Stack diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 0c2589083..2df974bf2 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -453,7 +453,7 @@ func TestTransportForwarding(t *testing.T) { s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}, stack.Options{}) s.SetForwarding(true) - // TODO: Change this to a channel NIC. + // TODO(b/123449044): Change this to a channel NIC. id1 := loopback.New() if err := s.CreateNIC(1, id1); err != nil { t.Fatalf("CreateNIC #1 failed: %v", err) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 80cd6b4e5..b09137f08 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -444,7 +444,7 @@ type PasscredOption int // TCPInfoOption is used by GetSockOpt to expose TCP statistics. // -// TODO: Add and populate stat fields. +// TODO(b/64800844): Add and populate stat fields. type TCPInfoOption struct { RTT time.Duration RTTVar time.Duration diff --git a/pkg/tcpip/transport/raw/raw.go b/pkg/tcpip/transport/raw/raw.go index 8dada2e4f..f0f60ce91 100644 --- a/pkg/tcpip/transport/raw/raw.go +++ b/pkg/tcpip/transport/raw/raw.go @@ -100,7 +100,7 @@ type endpoint struct { } // NewEndpoint returns a raw endpoint for the given protocols. -// TODO: IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. +// TODO(b/129292371): IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { if netProto != header.IPv4ProtocolNumber { return nil, tcpip.ErrUnknownProtocol diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index e5c05f8c0..d44d63e95 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -73,7 +73,7 @@ go_test( "tcp_test.go", "tcp_timestamp_test.go", ], - # FIXME + # FIXME(b/68809571) tags = ["flaky"], deps = [ ":tcp", diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index deeea078d..114fb8c5b 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -211,7 +211,7 @@ func SocketPair(packet bool) (*Socket, *Socket, error) { // variable between our two sockets. We only use SocketPair in tests // anyway. // - // NOTE: This is purely due to the fact that the raw + // NOTE(b/27107811): This is purely due to the fact that the raw // syscall does not serve as a boundary for the sanitizer. var race int32 a, err := NewSocket(fds[0]) diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index ecc670925..db5485539 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -40,7 +40,7 @@ func randomFilename() (string, error) { return "", err } - // NOTE: We try to use relative path if possible. This is + // NOTE(b/26918832): We try to use relative path if possible. This is // to help conforming to the unix path length limit. if rel, err := filepath.Rel(cwd, file); err == nil { return rel, nil diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 2488981f9..712c50ee9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -231,7 +231,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { } // Prevent CIDs containing ".." from confusing the sentry when creating // /containers/ directory. - // TODO: Once we have multiple independent roots, this + // TODO(b/129293409): Once we have multiple independent roots, this // check won't be necessary. 
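The check that follows can be read in isolation as below; this is a hedged, standalone sketch (validCID is an invented helper, not gVisor API) of why comparing a container ID against its path.Clean form rejects traversal-style IDs.

package main

import (
	"fmt"
	"path"
)

// validCID reports whether a container ID survives path.Clean unchanged,
// i.e. contains no collapsible "." or ".." elements and no trailing slash.
func validCID(cid string) bool {
	return cid != "" && path.Clean(cid) == cid
}

func main() {
	fmt.Println(validCID("abc123"))     // true
	fmt.Println(validCID("foo/../bar")) // false: cleans to "bar"
	fmt.Println(validCID("foo/"))       // false: cleans to "foo"
}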
if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) @@ -352,7 +352,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("creating network: %v", err) } if eps, ok := networkStack.(*epsocket.Stack); ok { - stack.StackFromEnv = eps.Stack // FIXME + stack.StackFromEnv = eps.Stack // FIXME(b/36201077) } info, err := o.FilePayload.Files[0].Stat() if err != nil { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 761142d98..07061b9b3 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -274,7 +274,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: - // TODO: Support all the mount types and make this a + // TODO(nlacasse): Support all the mount types and make this a // fatal error. Most applications will "just work" without // them, so this is a warning for now. // we do not support. @@ -425,7 +425,7 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f if err != nil { return err } - // TODO: Fix this when we support all the mount types and + // TODO(nlacasse): Fix this when we support all the mount types and // make this a fatal error. if fsName == "" { return nil @@ -475,7 +475,7 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) } } - // TODO: handle '/tmp' properly (see mountTmp()). + // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). if !tmpMounted { tmpMount := specs.Mount{ Type: tmpfs, diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 48ecb2626..75ec19c32 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -577,7 +577,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // sentry currently supports only 1 mount namespace, which is tied to a // single user namespace. Thus we must run in the same user namespace // to access mounts. - // TODO: Create a new mount namespace for the container. + // TODO(b/63601033): Create a new mount namespace for the container. creds := auth.NewUserCredentials( auth.KUID(spec.Process.User.UID), auth.KGID(spec.Process.User.GID), diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index d8f748aa0..f722df055 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -105,7 +105,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa return subcommands.ExitSuccess } - // TODO: Make it possible to restore into same container. + // TODO(b/110843694): Make it possible to restore into same container. // For now, we can fake it by destroying the container and making a // new container with the same ID. This hack does not work with docker // which uses the container pid to ensure that the restore-container is diff --git a/runsc/container/container.go b/runsc/container/container.go index 1bed1a97e..a30c217f7 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -529,7 +529,7 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // SignalContainer sends the signal to the container. If all is true and signal // is SIGKILL, then waits for all processes to exit before returning. // SignalContainer returns an error if the container is already stopped. -// TODO: Distinguish different error types. +// TODO(b/113680494): Distinguish different error types. 
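A hedged sketch of how the TODO above might eventually be addressed (ErrStopped and signalContainer are invented names, not gVisor API): wrapping a sentinel error lets callers distinguish the already-stopped case from other failures with errors.Is.

package main

import (
	"errors"
	"fmt"
)

var ErrStopped = errors.New("container is stopped")

func signalContainer(running bool) error {
	if !running {
		return fmt.Errorf("cannot signal: %w", ErrStopped)
	}
	return nil
}

func main() {
	err := signalContainer(false)
	fmt.Println(errors.Is(err, ErrStopped)) // true
}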
func (c *Container) SignalContainer(sig syscall.Signal, all bool) error { log.Debugf("Signal container %q: %v", c.ID, sig) // Signaling container in Stopped state is allowed. When all=false, diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9fe584aa3..603c4d929 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -242,10 +242,10 @@ func configs(opts ...configOption) []*boot.Config { case overlay: c.Overlay = true case kvm: - // TODO: KVM tests are flaky. Disable until fixed. + // TODO(b/112165693): KVM tests are flaky. Disable until fixed. continue - // TODO: KVM doesn't work with --race. + // TODO(b/68787993): KVM doesn't work with --race. if testutil.RaceEnabled { continue } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 92495c69e..48a0dafe2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -267,7 +267,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { defer conn.Close() var e boot.Event - // TODO: Pass in the container id (cid) here. The sandbox + // TODO(b/129292330): Pass in the container id (cid) here. The sandbox // should return events only for that container. if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil { return nil, fmt.Errorf("retrieving event data from sandbox: %v", err) @@ -457,7 +457,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } if conf.Platform == boot.PlatformPtrace { - // TODO: Also set a new PID namespace so that we limit + // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. log.Infof("Sandbox will be started in the current PID namespace") } else { @@ -520,7 +520,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // root for itself, so it has to have the CAP_SYS_ADMIN // capability. // - // FIXME: The current implementations of + // FIXME(b/122554829): The current implementations of // os/exec doesn't allow to set ambient capabilities if // a process is started in a new user namespace. As a // workaround, we start the sandbox process with the 0 diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 32f81b8d4..ac85bec71 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -90,7 +90,7 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) } - // TODO: Apply seccomp to application inside sandbox. + // TODO(b/72226747): Apply seccomp to application inside sandbox. if spec.Linux != nil && spec.Linux.Seccomp != nil { log.Warningf("Seccomp spec is being ignored") } @@ -220,7 +220,7 @@ func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.Task if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { return nil, err } - // TODO: Support ambient capabilities. + // TODO(nlacasse): Support ambient capabilities. } return &caps, nil } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 94e0f24e0..d35f59433 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -277,7 +277,7 @@ syscall_test(test = "//test/syscalls/linux:sendfile_test") syscall_test(test = "//test/syscalls/linux:sigaction_test") -# TODO: Enable once the test passes in runsc. +# TODO(b/119826902): Enable once the test passes in runsc. 
# syscall_test(test = "//test/syscalls/linux:sigaltstack_test") syscall_test(test = "//test/syscalls/linux:sigiret_test") @@ -414,7 +414,7 @@ syscall_test( ) syscall_test( - # NOTE: Large sendmsg may stall a long time. + # NOTE(b/116636318): Large sendmsg may stall a long time. size = "enormous", test = "//test/syscalls/linux:socket_unix_dgram_local_test", ) @@ -437,7 +437,7 @@ syscall_test( ) syscall_test( - # NOTE: Large sendmsg may stall a long time. + # NOTE(b/116636318): Large sendmsg may stall a long time. size = "enormous", test = "//test/syscalls/linux:socket_unix_seqpacket_local_test", ) diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl index 610b030b2..cd74a769d 100644 --- a/test/syscalls/build_defs.bzl +++ b/test/syscalls/build_defs.bzl @@ -78,10 +78,10 @@ def _syscall_test( tags += [full_platform, "file_" + file_access] # Add tag to prevent the tests from running in a Bazel sandbox. - # TODO: Make the tests run without this tag. + # TODO(b/120560048): Make the tests run without this tag. tags.append("no-sandbox") - # TODO: KVM tests are tagged "manual" to until the platform is + # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is # more stable. if platform == "kvm": tags += ["manual"] diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc index 230648c9b..78baf548e 100644 --- a/test/syscalls/linux/32bit.cc +++ b/test/syscalls/linux/32bit.cc @@ -80,11 +80,11 @@ constexpr int kExitCode = 42; TEST(Syscall32Bit, Int80) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: 32-bit segments are broken (but not explictly + // TODO(b/111805002): 32-bit segments are broken (but not explictly // disabled). return; case Platform::kPtrace: - // TODO: The ptrace platform does not have a + // TODO(gvisor.dev/issue/167): The ptrace platform does not have a // consistent story here. return; case Platform::kNative: @@ -99,10 +99,10 @@ TEST(Syscall32Bit, Int80) { TEST(Syscall32Bit, Sysenter) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: See above. + // TODO(b/111805002): See above. return; case Platform::kPtrace: - // TODO: See above. + // TODO(gvisor.dev/issue/167): See above. return; case Platform::kNative: break; @@ -123,10 +123,10 @@ TEST(Syscall32Bit, Sysenter) { TEST(Syscall32Bit, Syscall) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: See above. + // TODO(b/111805002): See above. return; case Platform::kPtrace: - // TODO: See above. + // TODO(gvisor.dev/issue/167): See above. return; case Platform::kNative: break; @@ -207,7 +207,7 @@ void FarCall32() { TEST(Call32Bit, Disallowed) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: See above. + // TODO(b/111805002): See above. return; case Platform::kPtrace: // The ptrace platform cannot prevent switching to compatibility mode. diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc index 06643ccb8..b96aab9b9 100644 --- a/test/syscalls/linux/aio.cc +++ b/test/syscalls/linux/aio.cc @@ -103,7 +103,7 @@ TEST_F(AIOTest, BasicWrite) { // aio implementation uses aio_ring. gVisor doesn't and returns all zeroes. // Linux implements aio_ring, so skip the zeroes check. // - // TODO: Remove when gVisor implements aio_ring. + // TODO(b/65486370): Remove when gVisor implements aio_ring. auto ring = reinterpret_cast(ctx_); auto magic = IsRunningOnGvisor() ? 
0 : AIO_RING_MAGIC; EXPECT_EQ(ring->magic, magic); diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc index 2f2ff3b7d..2f42fe326 100644 --- a/test/syscalls/linux/chmod.cc +++ b/test/syscalls/linux/chmod.cc @@ -235,7 +235,7 @@ TEST(ChmodTest, FchmodFileToNoPermissionsSucceeds_NoRandomSave) { // Verify that we can get a RW FD after chmod, even if a RO fd is left open. TEST(ChmodTest, ChmodWritableWithOpenFD) { - // FIXME: broken on hostfs. + // FIXME(b/72455313): broken on hostfs. if (IsRunningOnGvisor()) { return; } diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index 7b1d83ad8..b4a3bfcba 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -56,7 +56,7 @@ TEST(EpollTest, AllWritable) { struct epoll_event result[kFDsPerEpoll]; ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), SyscallSucceedsWithValue(kFDsPerEpoll)); - // TODO: Why do some tests check epoll_event::data, and others + // TODO(edahlgren): Why do some tests check epoll_event::data, and others // don't? Does Linux actually guarantee that, in any of these test cases, // epoll_wait will necessarily write out the epoll_events in the order that // they were registered? diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc index 187696ed9..c10d85398 100644 --- a/test/syscalls/linux/exec_binary.cc +++ b/test/syscalls/linux/exec_binary.cc @@ -285,7 +285,7 @@ ElfBinary<64> StandardElf() { elf.header.e_phoff = sizeof(elf.header); elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr); - // TODO: Always include a PT_GNU_STACK segment to + // TODO(gvisor.dev/issue/153): Always include a PT_GNU_STACK segment to // disable executable stacks. With this omitted the stack (and all PROT_READ) // mappings should be executable, but gVisor doesn't support that. decltype(elf)::ElfPhdr phdr = {}; @@ -403,7 +403,7 @@ TEST(ElfTest, DataSegment) { // Linux will allow PT_LOAD segments to overlap. TEST(ElfTest, DirectlyOverlappingSegments) { - // NOTE: see PIEOutOfOrderSegments. + // NOTE(b/37289926): see PIEOutOfOrderSegments. SKIP_IF(IsRunningOnGvisor()); ElfBinary<64> elf = StandardElf(); @@ -439,7 +439,7 @@ TEST(ElfTest, DirectlyOverlappingSegments) { // Linux allows out-of-order PT_LOAD segments. TEST(ElfTest, OutOfOrderSegments) { - // NOTE: see PIEOutOfOrderSegments. + // NOTE(b/37289926): see PIEOutOfOrderSegments. SKIP_IF(IsRunningOnGvisor()); ElfBinary<64> elf = StandardElf(); @@ -670,7 +670,7 @@ TEST(ElfTest, PIENonZeroStart) { } TEST(ElfTest, PIEOutOfOrderSegments) { - // TODO: This triggers a bug in Linux where it computes the size + // TODO(b/37289926): This triggers a bug in Linux where it computes the size // of the binary as 0x20000 - 0x40000 = 0xfffffffffffe0000, which obviously // fails to map. // @@ -1005,7 +1005,7 @@ TEST(ElfTest, NoExecute) { // Execute, but no read permissions on the binary works just fine. TEST(ElfTest, NoRead) { - // TODO: gVisor's backing filesystem may prevent the + // TODO(gvisor.dev/issue/160): gVisor's backing filesystem may prevent the // sentry from reading the executable. SKIP_IF(IsRunningOnGvisor()); @@ -1024,7 +1024,7 @@ TEST(ElfTest, NoRead) { ASSERT_NO_ERRNO(WaitStopped(child)); - // TODO: A task with a non-readable executable is marked + // TODO(gvisor.dev/issue/160): A task with a non-readable executable is marked // non-dumpable, preventing access to proc files. gVisor does not implement // this behavior. 
} diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 19c9a5053..43f568111 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -52,7 +52,7 @@ class FileTest : public ::testing::Test { test_file_fd_ = ASSERT_NO_ERRNO_AND_VALUE( Open(test_file_name_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR)); - // FIXME: enable when mknod syscall is supported. + // FIXME(edahlgren): enable when mknod syscall is supported. // test_fifo_name_ = NewTempAbsPath(); // ASSERT_THAT(mknod(test_fifo_name_.c_str()), S_IFIFO|0644, 0, // SyscallSucceeds()); @@ -97,7 +97,7 @@ class FileTest : public ::testing::Test { UnlinkFile(); ClosePipes(); - // FIXME: enable when mknod syscall is supported. + // FIXME(edahlgren): enable when mknod syscall is supported. // close(test_fifo_[0]); // close(test_fifo_[1]); // unlink(test_fifo_name_.c_str()); diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc index de29047e0..c7741a177 100644 --- a/test/syscalls/linux/ioctl.cc +++ b/test/syscalls/linux/ioctl.cc @@ -158,7 +158,7 @@ TEST_F(IoctlTest, FIOASYNCNoTarget) { } TEST_F(IoctlTest, FIOASYNCSelfTarget) { - // FIXME: gVisor erroneously sends SIGIO on close(2), which would + // FIXME(b/120624367): gVisor erroneously sends SIGIO on close(2), which would // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that // that the close signal is ignored. struct sigaction sa; @@ -195,7 +195,7 @@ TEST_F(IoctlTest, FIOASYNCSelfTarget) { // Equivalent to FIOASYNCSelfTarget except that FIOSETOWN is called before // FIOASYNC. TEST_F(IoctlTest, FIOASYNCSelfTarget2) { - // FIXME: gVisor erroneously sends SIGIO on close(2), which would + // FIXME(b/120624367): gVisor erroneously sends SIGIO on close(2), which would // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that // that the close signal is ignored. struct sigaction sa; diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index 4ad787cc0..0a149c2e5 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -24,7 +24,7 @@ namespace gvisor { namespace testing { PosixErrorOr InterfaceIndex(std::string name) { - // TODO: Consider using netlink. + // TODO(igudger): Consider using netlink. ifreq req = {}; memcpy(req.ifr_name, name.c_str(), name.size()); ASSIGN_OR_RETURN_ERRNO(auto sock, Socket(AF_INET, SOCK_DGRAM, 0)); diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc index fb6a1546e..6a4f1423c 100644 --- a/test/syscalls/linux/lseek.cc +++ b/test/syscalls/linux/lseek.cc @@ -194,7 +194,7 @@ TEST(LseekTest, EtcPasswdDup) { ASSERT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(1000)); } -// TODO: Add tests where we have donated in sockets. +// TODO(magi): Add tests where we have donated in sockets. } // namespace diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc index 84db45eb3..50807b68f 100644 --- a/test/syscalls/linux/mkdir.cc +++ b/test/syscalls/linux/mkdir.cc @@ -36,7 +36,7 @@ class MkdirTest : public ::testing::Test { // TearDown unlinks created files. void TearDown() override { - // FIXME: We don't currently implement rmdir. + // FIXME(edahlgren): We don't currently implement rmdir. // We do this unconditionally because there's no harm in trying. 
rmdir(dirname_.c_str()); } diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index b500e79a4..a4fb9d1e0 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -816,7 +816,7 @@ class MMapFileTest : public MMapTest { // MAP_POPULATE allowed. // There isn't a good way to verify it actually did anything. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, MapPopulate) { ASSERT_THAT( Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0), @@ -825,7 +825,7 @@ TEST_F(MMapFileTest, MapPopulate) { // MAP_POPULATE on a short file. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, MapPopulateShort) { ASSERT_THAT(Map(0, 2 * kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0), @@ -923,7 +923,7 @@ TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) { // MAP_SHARED PROT_READ not allowed on write-only FDs. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); @@ -936,7 +936,7 @@ TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) { // MAP_SHARED PROT_WRITE not allowed on write-only FDs. // The FD must always be readable. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, WriteSharedOnWriteOnlyFd) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); @@ -1371,7 +1371,7 @@ TEST_F(MMapFileTest, WritePrivate) { // SIGBUS raised when writing past end of file to a private mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, SigBusDeathWritePrivate) { SetupGvisorDeathTest(); @@ -1390,7 +1390,7 @@ TEST_F(MMapFileTest, SigBusDeathWritePrivate) { // SIGBUS raised when reading past end of file on a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, SigBusDeathReadShared) { SetupGvisorDeathTest(); @@ -1410,7 +1410,7 @@ TEST_F(MMapFileTest, SigBusDeathReadShared) { // SIGBUS raised when reading past end of file on a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, SigBusDeathWriteShared) { SetupGvisorDeathTest(); @@ -1459,7 +1459,7 @@ TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWritePrivate) { // Tests that SIGBUS is not raised when reading from a file-mapped page // containing EOF, *after* the EOF for a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) { uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), @@ -1476,7 +1476,7 @@ TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) { // Tests that SIGBUS is not raised when writing to a file-mapped page containing // EOF, *after* the EOF for a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. 
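The EOF-page rule these MMapFileTest cases exercise, restated as a standalone Go sketch (hypothetical example, not part of the test suite): on Linux, touching bytes past EOF but within the page that contains EOF does not raise SIGBUS; only pages entirely beyond EOF do.

package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	f, err := os.CreateTemp("", "eofpage")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// 10 bytes of data: EOF falls inside the first page of the mapping.
	if _, err := f.Write(make([]byte, 10)); err != nil {
		panic(err)
	}

	page := syscall.Getpagesize()
	m, err := syscall.Mmap(int(f.Fd()), 0, page,
		syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
	if err != nil {
		panic(err)
	}
	defer syscall.Munmap(m)

	// Touching bytes after EOF but within the EOF page does not fault;
	// the write is simply not carried back to the file.
	m[page-1] = 1
	fmt.Println("wrote past EOF within the EOF page:", m[page-1])
}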
TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWriteShared) { uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index cdc226300..22e4666c2 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -279,7 +279,7 @@ TEST_F(OpenTest, Null) { ASSERT_THAT(open(&c, O_RDONLY), SyscallFailsWithErrno(ENOENT)); } -// NOTE: While the man pages specify that this behavior should be +// NOTE(b/119785738): While the man pages specify that this behavior should be // undefined, Linux truncates the file on opening read only if we have write // permission, so we will too. TEST_F(OpenTest, CanTruncateReadOnly) { diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc index 073a6b8c1..71288ebc4 100644 --- a/test/syscalls/linux/partial_bad_buffer.cc +++ b/test/syscalls/linux/partial_bad_buffer.cc @@ -158,7 +158,7 @@ TEST_F(PartialBadBufferTest, PreadvSmall) { } TEST_F(PartialBadBufferTest, WriteBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -168,7 +168,7 @@ TEST_F(PartialBadBufferTest, WriteBig) { } TEST_F(PartialBadBufferTest, WriteSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -178,7 +178,7 @@ TEST_F(PartialBadBufferTest, WriteSmall) { } TEST_F(PartialBadBufferTest, PwriteBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -188,7 +188,7 @@ TEST_F(PartialBadBufferTest, PwriteBig) { } TEST_F(PartialBadBufferTest, PwriteSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -198,7 +198,7 @@ TEST_F(PartialBadBufferTest, PwriteSmall) { } TEST_F(PartialBadBufferTest, WritevBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -211,7 +211,7 @@ TEST_F(PartialBadBufferTest, WritevBig) { } TEST_F(PartialBadBufferTest, WritevSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. 
SKIP_IF(IsRunningOnGvisor()); @@ -224,7 +224,7 @@ TEST_F(PartialBadBufferTest, WritevSmall) { } TEST_F(PartialBadBufferTest, PwritevBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -238,7 +238,7 @@ TEST_F(PartialBadBufferTest, PwritevBig) { } TEST_F(PartialBadBufferTest, PwritevSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -279,7 +279,7 @@ TEST_F(PartialBadBufferTest, GetdentsOneEntry) { // Verify that when write returns EFAULT the kernel hasn't silently written // the initial valid bytes. TEST_F(PartialBadBufferTest, WriteEfaultIsntPartial) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index c49ec9f09..abd10b11b 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -36,7 +36,7 @@ namespace { // Buffer size of a pipe. // -// TODO: Get this from F_GETPIPE_SZ. +// TODO(b/35762278): Get this from F_GETPIPE_SZ. constexpr int kPipeSize = 65536; class PipeTest : public ::testing::Test { @@ -316,7 +316,7 @@ TEST_F(PipeTest, BlockWriteClosed) { // Blocking write returns EPIPE when read end is closed even if something has // been written. // -// FIXME: Pipe writes blocking early allows S/R to interrupt the +// FIXME(b/35924046): Pipe writes blocking early allows S/R to interrupt the // write(2) call before the buffer is full. Then the next call will will return // non-zero instead of EPIPE. TEST_F(PipeTest, BlockPartialWriteClosed_NoRandomSave) { @@ -329,7 +329,7 @@ TEST_F(PipeTest, BlockPartialWriteClosed_NoRandomSave) { // Write more than fits in the buffer. Blocks then returns partial write // when the other end is closed. The next call returns EPIPE. if (IsRunningOnGvisor()) { - // FIXME: Pipe writes block early on gVisor, resulting in a + // FIXME(b/35924046): Pipe writes block early on gVisor, resulting in a // shorter than expected partial write. ASSERT_THAT(write(wfd, buf.data(), buf.size()), SyscallSucceedsWithValue(::testing::Gt(0))); diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 3ec31ae8b..7ba274226 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -61,7 +61,7 @@ #include "test/util/thread_util.h" #include "test/util/timer_util.h" -// NOTE: No, this isn't really a syscall but this is a really simple +// NOTE(magi): No, this isn't really a syscall but this is a really simple // way to get it tested on both gVisor, PTrace and Linux. using ::testing::AllOf; @@ -489,7 +489,7 @@ TEST(ProcSelfMaps, Map1) { } TEST(ProcSelfMaps, Map2) { - // NOTE: The permissions must be different or the pages will get merged. + // NOTE(magi): The permissions must be different or the pages will get merged. 
Mapping map1 = ASSERT_NO_ERRNO_AND_VALUE( MmapAnon(kPageSize, PROT_READ | PROT_EXEC, MAP_PRIVATE)); Mapping map2 = @@ -564,7 +564,7 @@ TEST(ProcSelfMaps, MapUnmap) { } TEST(ProcSelfMaps, Mprotect) { - // FIXME: Linux's mprotect() sometimes fails to merge VMAs in this + // FIXME(jamieliu): Linux's mprotect() sometimes fails to merge VMAs in this // case. SKIP_IF(!IsRunningOnGvisor()); @@ -977,7 +977,7 @@ void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) { *after = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS()); } -// TODO: Test for PROT_READ + MAP_POPULATE anonymous mappings. Their +// TODO(b/73896574): Test for PROT_READ + MAP_POPULATE anonymous mappings. Their // semantics are more subtle: // // Small pages -> Zero page mapped, not counted in RSS @@ -1140,7 +1140,7 @@ TEST(ProcPidStatusTest, ValuesAreTabDelimited) { // Threads properly counts running threads. // -// TODO: Test zombied threads while the thread group leader is still +// TODO(mpratt): Test zombied threads while the thread group leader is still // running with generalized fork and clone children from the wait test. TEST(ProcPidStatusTest, Threads) { char buf[4096] = {}; @@ -1274,7 +1274,7 @@ TEST(ProcPidSymlink, SubprocessRunning) { SyscallSucceedsWithValue(sizeof(buf))); } -// FIXME: Inconsistent behavior between gVisor and linux +// FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. TEST(ProcPidSymlink, SubprocessZombied) { ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); @@ -1298,13 +1298,13 @@ TEST(ProcPidSymlink, SubprocessZombied) { SyscallFailsWithErrno(want)); } - // FIXME: Inconsistent behavior between gVisor and linux + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // 4.17 & gVisor: Syscall succeeds and returns 1 // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); - // FIXME: Inconsistent behavior between gVisor and linux + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // 4.17 & gVisor: Syscall succeeds and returns 1. // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)), @@ -1313,7 +1313,7 @@ TEST(ProcPidSymlink, SubprocessZombied) { // Test whether /proc/PID/ symlinks can be read for an exited process. TEST(ProcPidSymlink, SubprocessExited) { - // FIXME: These all succeed on gVisor. + // FIXME(gvisor.dev/issue/164): These all succeed on gVisor. SKIP_IF(IsRunningOnGvisor()); char buf[1]; @@ -1404,7 +1404,7 @@ TEST(ProcPidFile, SubprocessZombie) { EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); - // FIXME: Inconsistent behavior between gVisor and linux + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // gVisor & 4.17: Succeeds and returns 1. // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)), @@ -1415,7 +1415,7 @@ TEST(ProcPidFile, SubprocessZombie) { TEST(ProcPidFile, SubprocessExited) { char buf[1]; - // FIXME: Inconsistent behavior between kernels + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels // gVisor: Fails with ESRCH. // 4.17: Succeeds and returns 1. // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)), @@ -1425,7 +1425,7 @@ TEST(ProcPidFile, SubprocessExited) { SyscallFailsWithErrno(ESRCH)); if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. 
EXPECT_THAT(ReadWhileExited("comm", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } @@ -1434,25 +1434,25 @@ TEST(ProcPidFile, SubprocessExited) { SyscallSucceedsWithValue(sizeof(buf))); if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. EXPECT_THAT(ReadWhileExited("io", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } if (!IsRunningOnGvisor()) { - // FIXME: Returns EOF on gVisor. + // FIXME(gvisor.dev/issue/164): Returns EOF on gVisor. EXPECT_THAT(ReadWhileExited("maps", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. EXPECT_THAT(ReadWhileExited("stat", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. EXPECT_THAT(ReadWhileExited("status", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc index 5f9c42ce5..cf5c462f3 100644 --- a/test/syscalls/linux/proc_pid_smaps.cc +++ b/test/syscalls/linux/proc_pid_smaps.cc @@ -82,7 +82,7 @@ struct ProcPidSmapsEntry { // Given the value part of a /proc/[pid]/smaps field containing a value in kB // (for example, " 4 kB", returns the value in kB (in this example, 4). PosixErrorOr SmapsValueKb(absl::string_view value) { - // TODO: let us use RE2 or + // TODO(jamieliu): let us use RE2 or std::pair parts = absl::StrSplit(value, ' ', absl::SkipEmpty()); if (parts.second != "kB") { diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index 1c9d7d4f4..e0c56f1fc 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -823,7 +823,7 @@ TEST(PtraceTest, TEST(PtraceTest, Int3) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: int3 isn't handled properly. + // TODO(b/124248694): int3 isn't handled properly. return; default: break; diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc index 60ae6de1f..485b1e48d 100644 --- a/test/syscalls/linux/pwrite64.cc +++ b/test/syscalls/linux/pwrite64.cc @@ -30,7 +30,7 @@ namespace { // This test is currently very rudimentary. // -// TODO: +// TODO(edahlgren): // * bad buffer states (EFAULT). // * bad fds (wrong permission, wrong type of file, EBADF). // * check offset is not incremented. diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc index 2c129b7e8..cf22c395e 100644 --- a/test/syscalls/linux/readv_socket.cc +++ b/test/syscalls/linux/readv_socket.cc @@ -41,7 +41,7 @@ class ReadvSocketTest : public SocketTest { ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData, kReadvTestDataSize), SyscallSucceedsWithValue(kReadvTestDataSize)); - // FIXME: Enable when possible. + // FIXME(b/69821513): Enable when possible. // ASSERT_THAT(write(test_tcp_socket_[1], kReadvTestData, // kReadvTestDataSize), // SyscallSucceedsWithValue(kReadvTestDataSize)); diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc index 1f2fed7cc..ff948f9d5 100644 --- a/test/syscalls/linux/rtsignal.cc +++ b/test/syscalls/linux/rtsignal.cc @@ -75,7 +75,7 @@ class RtSignalTest : public ::testing::Test { static int rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t* uinfo) { int ret; do { - // NOTE: rt_sigqueueinfo(2) could return EAGAIN for RT signals. + // NOTE(b/25434735): rt_sigqueueinfo(2) could return EAGAIN for RT signals. 
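For comparison, the same retry-on-EAGAIN idiom in standalone Go (signalWithRetry is an invented helper, and syscall.Kill stands in for the rt_sigqueueinfo/tkill wrappers used by these tests):

package main

import (
	"fmt"
	"os"
	"syscall"
)

// signalWithRetry retries while the kernel transiently refuses to queue
// the signal with EAGAIN, mirroring the C helper above.
func signalWithRetry(pid int, sig syscall.Signal) error {
	for {
		err := syscall.Kill(pid, sig)
		if err != syscall.EAGAIN {
			return err
		}
	}
}

func main() {
	// Send ourselves SIGCONT, which is harmless.
	fmt.Println(signalWithRetry(os.Getpid(), syscall.SIGCONT))
}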
ret = syscall(SYS_rt_sigqueueinfo, tgid, sig, uinfo); } while (ret == -1 && errno == EAGAIN); return ret; diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index cdc5c0ce8..14d7827c2 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -221,7 +221,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { std::atomic connects_received = ATOMIC_VAR_INIT(0); std::unique_ptr listen_thread[kThreadCount]; int accept_counts[kThreadCount] = {}; - // TODO: figure how to not disable S/R for the whole test. + // TODO(avagin): figure how to not disable S/R for the whole test. // We need to take into account that this test executes a lot of system // calls from many threads. DisableSave ds; @@ -325,7 +325,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) { std::atomic packets_received = ATOMIC_VAR_INIT(0); std::unique_ptr receiver_thread[kThreadCount]; int packets_per_socket[kThreadCount] = {}; - // TODO: figure how to not disable S/R for the whole test. + // TODO(avagin): figure how to not disable S/R for the whole test. DisableSave ds; // Too expensive. for (int i = 0; i < kThreadCount; i++) { @@ -642,7 +642,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) { TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) { auto const& param = GetParam(); - // FIXME + // FIXME(b/114268588) SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); for (int i = 0; true; i++) { @@ -743,7 +743,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) { TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) { auto const& param = GetParam(); - // FIXME + // FIXME(b/114268588) SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); for (int i = 0; true; i++) { @@ -867,7 +867,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) { TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) { auto const& param = GetParam(); - // FIXME + // FIXME(b/114268588) SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); for (int i = 0; true; i++) { diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc index 8b4fc57b6..9dd9e1bd6 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -244,7 +244,7 @@ TestAddress V4Multicast() { // set interface or group membership. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelfNoGroup) { - // FIXME: A group membership is not required for external + // FIXME(b/125485338): A group membership is not required for external // multicast on gVisor. SKIP_IF(IsRunningOnGvisor()); @@ -371,7 +371,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Check that multicast packets won't be delivered to another socket with no // set interface or group membership. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastNoGroup) { - // FIXME: A group membership is not required for external + // FIXME(b/125485338): A group membership is not required for external // multicast on gVisor. 
SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index 8d2e7d333..ed4ae1c71 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -180,7 +180,7 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) { // RTM_NEWLINK contains at least the header and ifinfomsg. EXPECT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg))); - // TODO: Check ifinfomsg contents and following attrs. + // TODO(mpratt): Check ifinfomsg contents and following attrs. } TEST(NetlinkRouteTest, GetLinkDump) { @@ -370,7 +370,7 @@ TEST(NetlinkRouteTest, GetAddrDump) { // RTM_NEWADDR contains at least the header and ifaddrmsg. EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg)); - // TODO: Check ifaddrmsg contents and following attrs. + // TODO(mpratt): Check ifaddrmsg contents and following attrs. })); } diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc index 8b3f6a647..f0f86c01c 100644 --- a/test/syscalls/linux/socket_stream_blocking.cc +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -33,7 +33,7 @@ namespace gvisor { namespace testing { TEST_P(BlockingStreamSocketPairTest, BlockPartialWriteClosed) { - // FIXME: gVisor doesn't support SO_SNDBUF on UDS, nor does it + // FIXME(b/35921550): gVisor doesn't support SO_SNDBUF on UDS, nor does it // enforce any limit; it will write arbitrary amounts of data without // blocking. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc index 035087566..0be23e541 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/syscalls/linux/socket_test_util.cc @@ -353,7 +353,7 @@ PosixErrorOr> CreateTCPAcceptBindSocketPair( } MaybeSave(); // Successful accept. - // FIXME + // FIXME(b/110484944) if (connect_result == -1) { absl::SleepFor(absl::Seconds(1)); } diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index 7332b768e..fafb23ad1 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -186,7 +186,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNoSpace) { // BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to // receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr. TEST_P(UnixSocketPairTest, BasicFDPassNoSpaceMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -224,7 +224,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNoSpaceMsgCtrunc) { // accomidate the FD, but msg_control is set to NULL. In this case, msg_control // should override msg_controllen. TEST_P(UnixSocketPairTest, BasicFDPassNullControlMsgCtrunc) { - // FIXME: Fix handling of NULL msg_control. + // FIXME(gvisor.dev/issue/207): Fix handling of NULL msg_control. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -259,7 +259,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNullControlMsgCtrunc) { // space to receive it. It then verifies that the MSG_CTRUNC flag is set in the // msghdr. TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. 
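What the MSG_CTRUNC tests in this file check on Linux, condensed into a standalone Go sketch (hypothetical example, not gVisor or test-suite code): send an FD via SCM_RIGHTS, receive it with a control buffer that is too small, and the kernel truncates the control data and sets MSG_CTRUNC in the returned flags.

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// A connected pair of Unix datagram sockets.
	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0)
	if err != nil {
		panic(err)
	}
	defer syscall.Close(fds[0])
	defer syscall.Close(fds[1])

	// Send one byte with an SCM_RIGHTS control message carrying stdout.
	rights := syscall.UnixRights(1)
	if err := syscall.Sendmsg(fds[0], []byte{'x'}, rights, nil, 0); err != nil {
		panic(err)
	}

	// Receive with a control buffer too small for the FD: on Linux the
	// control data is truncated and MSG_CTRUNC is set.
	buf := make([]byte, 1)
	oob := make([]byte, 1) // deliberately too small
	_, _, flags, _, err := syscall.Recvmsg(fds[1], buf, oob, 0)
	if err != nil {
		panic(err)
	}
	fmt.Println("MSG_CTRUNC set:", flags&syscall.MSG_CTRUNC != 0)
}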
SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -296,7 +296,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) { // space to receive two of them. It then verifies that the MSG_CTRUNC flag is // set in the msghdr. TEST_P(UnixSocketPairTest, BasicThreeFDPassTruncationMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -408,7 +408,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecvNoMsgTrunc) { // provides enough space to receive one of them. It then verifies that the // MSG_CTRUNC flag is set in the msghdr. TEST_P(UnixSocketPairTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -1010,7 +1010,7 @@ TEST_P(UnixSocketPairTest, CredPassNoMsgCtrunc) { // the data without providing space for any credentials and verifies that // MSG_CTRUNC is set in the msghdr. TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -1061,7 +1061,7 @@ TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) { // the data while providing enough space for only the first field of the // credentials and verifies that MSG_CTRUNC is set in the msghdr. TEST_P(UnixSocketPairTest, CredPassTruncatedMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -1615,7 +1615,7 @@ TEST_P(UnixSocketPairTest, SocketShutdown) { } TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) { - // TODO: We should be returning ENXIO and NOT EIO. + // TODO(b/122310852): We should be returning ENXIO and NOT EIO. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc index c17d3990f..5dd5e6d77 100644 --- a/test/syscalls/linux/socket_unix_dgram.cc +++ b/test/syscalls/linux/socket_unix_dgram.cc @@ -28,7 +28,7 @@ namespace testing { namespace { TEST_P(DgramUnixSocketPairTest, WriteOneSideClosed) { - // FIXME: gVisor datagram sockets return EPIPE instead of + // FIXME(b/35925052): gVisor datagram sockets return EPIPE instead of // ECONNREFUSED. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc index 460eb8320..3becb513d 100644 --- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc +++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc @@ -31,7 +31,7 @@ using NonBlockingDgramUnixSocketPairTest = SocketPairTest; TEST_P(NonBlockingDgramUnixSocketPairTest, ReadOneSideClosed) { if (IsRunningOnGvisor()) { - // FIXME: gVisor datagram sockets return 0 instead of + // FIXME(b/70803293): gVisor datagram sockets return 0 instead of // EAGAIN. 
return; } diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index 8e0cbee4c..a565978f9 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -47,7 +47,7 @@ TEST_P(UnixNonStreamSocketPairTest, RecvMsgTooLarge) { const int ret = RetryEINTR(write)(sockets->second_fd(), write_buf.data(), write_buf.size()); if (ret < 0 && errno == ENOBUFS) { - // NOTE: Linux may stall the write for a long time and + // NOTE(b/116636318): Linux may stall the write for a long time and // ultimately return ENOBUFS. Allow this error, since a retry will likely // result in the same error. return; @@ -136,7 +136,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { // N.B. At minimum, the socketpair gofer should provide a socket that is // already the correct size. // - // TODO: When internal UDS support SO_SNDBUF, we can assert that + // TODO(b/35921550): When internal UDS support SO_SNDBUF, we can assert that // we always get the right SO_SNDBUF on gVisor. GTEST_SKIP() << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf; } @@ -156,7 +156,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { msg.msg_iov = &iov; msg.msg_iovlen = 1; - // NOTE: Linux has poor behavior in the presence of + // NOTE(b/116636318,b/115833655): Linux has poor behavior in the presence of // physical memory fragmentation. As a result, this may stall for a long time // and ultimately return ENOBUFS. Allow this error, since it means that we // made it to the host kernel and started the sendmsg. @@ -192,7 +192,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { // N.B. At minimum, the socketpair gofer should provide a socket that is // already the correct size. // - // TODO: When internal UDS support SO_SNDBUF, we can assert that + // TODO(b/35921550): When internal UDS support SO_SNDBUF, we can assert that // we always get the right SO_SNDBUF on gVisor. GTEST_SKIP() << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf; } @@ -201,7 +201,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { const int ret = RetryEINTR(write)(sockets->first_fd(), write_buf.data(), write_buf.size()); if (ret < 0 && errno == ENOBUFS) { - // NOTE: Linux may stall the write for a long time and + // NOTE(b/116636318): Linux may stall the write for a long time and // ultimately return ENOBUFS. Allow this error, since a retry will likely // result in the same error. return; diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc index 270d7203f..21209b244 100644 --- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc @@ -42,7 +42,7 @@ TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnect) { } TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnectIgnoresAddr) { - // FIXME: gVisor tries to find /foo/bar and thus returns ENOENT. + // FIXME(b/68223466): gVisor tries to find /foo/bar and thus returns ENOENT. if (IsRunningOnGvisor()) { return; } diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc index 4db5b4be1..b95f9569e 100644 --- a/test/syscalls/linux/socket_unix_unbound_stream.cc +++ b/test/syscalls/linux/socket_unix_unbound_stream.cc @@ -269,7 +269,7 @@ TEST_P(UnixStreamSocketPairTest, SinglePeek) { // 9f389e35674f5b086edd70ed524ca0f287259725 which changes this behavior. 
We // used to target 3.11 compatibility, so disable this test on newer kernels. // - // NOTE: Bring this up to Linux 4.4 compatibility. + // NOTE(b/118902768): Bring this up to Linux 4.4 compatibility. auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); SKIP_IF(version.major > 4 || (version.major == 4 && version.minor >= 3)); } @@ -686,7 +686,7 @@ TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnect) { } TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnectIgnoresAddr) { - // FIXME: gVisor tries to find /foo/bar and thus returns ENOENT. + // FIXME(b/68223466): gVisor tries to find /foo/bar and thus returns ENOENT. if (IsRunningOnGvisor()) { return; } diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 48a2059de..746318d09 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -416,7 +416,7 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) { EXPECT_EQ(st_child_before.st_gid, st_child_fd.st_gid); EXPECT_EQ(st_child_before.st_size, st_child_fd.st_size); - // TODO: This isn't ideal but since fstatfs(2) will always return + // TODO(b/34861058): This isn't ideal but since fstatfs(2) will always return // OVERLAYFS_SUPER_MAGIC we have no way to know if this fs is backed by a // gofer which doesn't support links. EXPECT_TRUE(st_child_fd.st_nlink == 0 || st_child_fd.st_nlink == 1); diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc index 442957c65..8346e9a8e 100644 --- a/test/syscalls/linux/stat_times.cc +++ b/test/syscalls/linux/stat_times.cc @@ -68,7 +68,7 @@ TEST_F(StatTimesTest, FileCreationTimes) { TEST_F(StatTimesTest, FileCtimeChanges) { auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - MaybeSave(); // FIXME: ctime is inconsistent. + MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. absl::Time atime, mtime, ctime; std::tie(atime, mtime, ctime) = GetTime(file); @@ -150,7 +150,7 @@ TEST_F(StatTimesTest, FileAtimeChanges) { const auto file = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), contents, 0666)); - MaybeSave(); // FIXME: ctime is inconsistent. + MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. absl::Time atime, mtime, ctime; std::tie(atime, mtime, ctime) = GetTime(file); @@ -184,7 +184,7 @@ TEST_F(StatTimesTest, DirAtimeChanges) { const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); - MaybeSave(); // FIXME: ctime is inconsistent. + MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. absl::Time atime, mtime, ctime; std::tie(atime, mtime, ctime) = GetTime(dir); @@ -193,7 +193,7 @@ TEST_F(StatTimesTest, DirAtimeChanges) { const absl::Time before = absl::Now() - absl::Seconds(1); - // NOTE: Keep an fd open. This ensures that the inode backing the + // NOTE(b/37756234): Keep an fd open. This ensures that the inode backing the // directory won't be destroyed before the final GetTime to avoid writing out // timestamps and causing side effects. 
const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0)); diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 1057f5892..33620a874 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -191,7 +191,7 @@ TEST_P(TcpSocketTest, SenderAddressIgnoredOnPeek) { TEST_P(TcpSocketTest, SendtoAddressIgnored) { struct sockaddr_storage addr; memset(&addr, 0, sizeof(addr)); - addr.ss_family = GetParam(); // FIXME + addr.ss_family = GetParam(); // FIXME(b/63803955) char data = '\0'; EXPECT_THAT( diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc index 9842ccc9b..3e8ce5327 100644 --- a/test/syscalls/linux/tkill.cc +++ b/test/syscalls/linux/tkill.cc @@ -32,7 +32,7 @@ namespace { static int tkill(pid_t tid, int sig) { int ret; do { - // NOTE: tkill(2) could return EAGAIN for RT signals. + // NOTE(b/25434735): tkill(2) could return EAGAIN for RT signals. ret = syscall(SYS_tkill, tid, sig); } while (ret == -1 && errno == EAGAIN); return ret; diff --git a/test/syscalls/linux/udp_bind.cc b/test/syscalls/linux/udp_bind.cc index 902be47d3..547eb2a6c 100644 --- a/test/syscalls/linux/udp_bind.cc +++ b/test/syscalls/linux/udp_bind.cc @@ -286,7 +286,7 @@ INSTANTIATE_TEST_SUITE_P( []() { SendtoTestParam param = {}; param.description = "connected IPv6 sendto IPv4 mapped IPv6"; - // TODO: Determine if this inconsistent behavior is worth + // TODO(igudger): Determine if this inconsistent behavior is worth // implementing. param.skip_on_gvisor = true; param.send_domain = AF_INET6; @@ -299,7 +299,7 @@ INSTANTIATE_TEST_SUITE_P( []() { SendtoTestParam param = {}; param.description = "connected IPv6 sendto IPv4"; - // TODO: Determine if this inconsistent behavior is worth + // TODO(igudger): Determine if this inconsistent behavior is worth // implementing. param.skip_on_gvisor = true; param.send_domain = AF_INET6; diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc index c0c1f2960..d78a09b1e 100644 --- a/test/syscalls/linux/uidgid.cc +++ b/test/syscalls/linux/uidgid.cc @@ -169,7 +169,7 @@ TEST(UidGidRootTest, SetgidNotFromThreadGroupLeader) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); const gid_t gid = FLAGS_scratch_gid1; - // NOTE: Do setgid in a separate thread so that we can test if + // NOTE(b/64676707): Do setgid in a separate thread so that we can test if // info.si_pid is set correctly. ScopedThread([gid] { ASSERT_THAT(setgid(gid), SyscallSucceeds()); }); EXPECT_NO_ERRNO(CheckGIDs(gid, gid, gid)); diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc index d95ee74ec..bf776cd93 100644 --- a/test/syscalls/linux/utimes.cc +++ b/test/syscalls/linux/utimes.cc @@ -33,7 +33,7 @@ namespace testing { namespace { -// TODO: utimes(nullptr) does not pick the "now" time in the +// TODO(b/36516566): utimes(nullptr) does not pick the "now" time in the // application's time domain, so when asserting that times are within a window, // we expand the window to allow for differences between the time domains. constexpr absl::Duration kClockSlack = absl::Milliseconds(100); @@ -235,7 +235,7 @@ void TestUtimensat(int dirFd, std::string const& path) { EXPECT_LE(mtime3, after); if (!IsRunningOnGvisor()) { - // FIXME: Gofers set atime and mtime to different "now" times. + // FIXME(b/36516566): Gofers set atime and mtime to different "now" times. 
EXPECT_EQ(atime3, mtime3); } } diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index cfab8a976..fcd606bec 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -40,7 +40,7 @@ using ::testing::UnorderedElementsAre; // These unit tests focus on the wait4(2) system call, but include a basic // checks for the i386 waitpid(2) syscall, which is a subset of wait4(2). // -// NOTE: Some functionality is not tested as +// NOTE(b/22640830,b/27680907,b/29049891): Some functionality is not tested as // it is not currently supported by gVisor: // * UID in waitid(2) siginfo. // * Process groups. diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc index 432bd6066..7f80b2fa8 100644 --- a/test/syscalls/linux/write.cc +++ b/test/syscalls/linux/write.cc @@ -33,7 +33,7 @@ namespace testing { namespace { // This test is currently very rudimentary. // -// TODO: +// TODO(edahlgren): // * bad buffer states (EFAULT). // * bad fds (wrong permission, wrong type of file, EBADF). // * check offset is incremented. diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go index a63a0d084..131f0a2ba 100644 --- a/third_party/gvsync/downgradable_rwmutex_unsafe.go +++ b/third_party/gvsync/downgradable_rwmutex_unsafe.go @@ -49,7 +49,7 @@ func (rw *DowngradableRWMutex) RLock() { // RUnlock undoes a single RLock call. func (rw *DowngradableRWMutex) RUnlock() { if RaceEnabled { - // TODO: Why does this need to be ReleaseMerge instead of + // TODO(jamieliu): Why does this need to be ReleaseMerge instead of // Release? IIUC this establishes Unlock happens-before RUnlock, which // seems unnecessary. RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) diff --git a/vdso/cycle_clock.h b/vdso/cycle_clock.h index 26d6690c0..309e07a3f 100644 --- a/vdso/cycle_clock.h +++ b/vdso/cycle_clock.h @@ -23,7 +23,7 @@ namespace vdso { #if __x86_64__ -// TODO: The appropriate barrier instruction to use with rdtsc on +// TODO(b/74613497): The appropriate barrier instruction to use with rdtsc on // x86_64 depends on the vendor. Intel processors can use lfence but AMD may // need mfence, depending on MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT. diff --git a/vdso/vdso_amd64.lds b/vdso/vdso_amd64.lds index 166779931..e2615ae9e 100644 --- a/vdso/vdso_amd64.lds +++ b/vdso/vdso_amd64.lds @@ -56,7 +56,7 @@ SECTIONS { .altinstr_replacement : { *(.altinstr_replacement) } /* - * TODO: Remove this alignment? Then the VDSO would fit + * TODO(gvisor.dev/issue/157): Remove this alignment? Then the VDSO would fit * in a single page. */ . = ALIGN(0x1000); diff --git a/vdso/vdso_arm64.lds b/vdso/vdso_arm64.lds index 19f8efa01..469185468 100644 --- a/vdso/vdso_arm64.lds +++ b/vdso/vdso_arm64.lds @@ -59,7 +59,7 @@ SECTIONS { .altinstr_replacement : { *(.altinstr_replacement) } /* - * TODO: Remove this alignment? Then the VDSO would fit + * TODO(gvisor.dev/issue/157): Remove this alignment? Then the VDSO would fit * in a single page. */ . = ALIGN(0x1000); -- cgit v1.2.3 From 4d52a5520101a88424fb63dd99412a1db33fbd06 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 29 Apr 2019 14:25:05 -0700 Subject: Change copyright notice to "The gVisor Authors" Based on the guidelines at https://opensource.google.com/docs/releasing/authors/. 1. $ rg -l "Google LLC" | xargs sed -i 's/Google LLC.*/The gVisor Authors./' 2. Manual fixup of "Google Inc" references. 3. Add AUTHORS file. Authors may request to be added to this file. 4. 
Point netstack AUTHORS to gVisor AUTHORS. Drop CONTRIBUTORS. Fixes #209 PiperOrigin-RevId: 245823212 Change-Id: I64530b24ad021a7d683137459cafc510f5ee1de9 --- AUTHORS | 8 ++++++++ kokoro/run_build.sh | 2 +- kokoro/run_tests.sh | 2 +- pkg/abi/abi.go | 2 +- pkg/abi/abi_linux.go | 2 +- pkg/abi/flag.go | 2 +- pkg/abi/linux/aio.go | 2 +- pkg/abi/linux/ashmem.go | 2 +- pkg/abi/linux/audit.go | 2 +- pkg/abi/linux/binder.go | 2 +- pkg/abi/linux/bpf.go | 2 +- pkg/abi/linux/capability.go | 2 +- pkg/abi/linux/dev.go | 2 +- pkg/abi/linux/elf.go | 2 +- pkg/abi/linux/errors.go | 2 +- pkg/abi/linux/eventfd.go | 2 +- pkg/abi/linux/exec.go | 2 +- pkg/abi/linux/fcntl.go | 2 +- pkg/abi/linux/file.go | 2 +- pkg/abi/linux/fs.go | 2 +- pkg/abi/linux/futex.go | 2 +- pkg/abi/linux/inotify.go | 2 +- pkg/abi/linux/ioctl.go | 2 +- pkg/abi/linux/ip.go | 2 +- pkg/abi/linux/ipc.go | 2 +- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/linux.go | 2 +- pkg/abi/linux/mm.go | 2 +- pkg/abi/linux/netdevice.go | 2 +- pkg/abi/linux/netlink.go | 2 +- pkg/abi/linux/netlink_route.go | 2 +- pkg/abi/linux/poll.go | 2 +- pkg/abi/linux/prctl.go | 2 +- pkg/abi/linux/ptrace.go | 2 +- pkg/abi/linux/rusage.go | 2 +- pkg/abi/linux/sched.go | 2 +- pkg/abi/linux/seccomp.go | 2 +- pkg/abi/linux/sem.go | 2 +- pkg/abi/linux/shm.go | 2 +- pkg/abi/linux/signal.go | 2 +- pkg/abi/linux/socket.go | 2 +- pkg/abi/linux/tcp.go | 2 +- pkg/abi/linux/time.go | 2 +- pkg/abi/linux/timer.go | 2 +- pkg/abi/linux/tty.go | 2 +- pkg/abi/linux/uio.go | 2 +- pkg/abi/linux/utsname.go | 2 +- pkg/amutex/amutex.go | 2 +- pkg/amutex/amutex_test.go | 2 +- pkg/atomicbitops/atomic_bitops.go | 2 +- pkg/atomicbitops/atomic_bitops_amd64.s | 2 +- pkg/atomicbitops/atomic_bitops_common.go | 2 +- pkg/atomicbitops/atomic_bitops_test.go | 2 +- pkg/binary/binary.go | 2 +- pkg/binary/binary_test.go | 2 +- pkg/bits/bits.go | 2 +- pkg/bits/bits_template.go | 2 +- pkg/bits/uint64_arch_amd64.go | 2 +- pkg/bits/uint64_arch_amd64_asm.s | 2 +- pkg/bits/uint64_arch_generic.go | 2 +- pkg/bits/uint64_test.go | 2 +- pkg/bpf/bpf.go | 2 +- pkg/bpf/decoder.go | 2 +- pkg/bpf/decoder_test.go | 2 +- pkg/bpf/input_bytes.go | 2 +- pkg/bpf/interpreter.go | 2 +- pkg/bpf/interpreter_test.go | 2 +- pkg/bpf/program_builder.go | 2 +- pkg/bpf/program_builder_test.go | 2 +- pkg/compressio/compressio.go | 2 +- pkg/compressio/compressio_test.go | 2 +- pkg/control/client/client.go | 2 +- pkg/control/server/server.go | 2 +- pkg/cpuid/cpu_amd64.s | 2 +- pkg/cpuid/cpuid.go | 2 +- pkg/cpuid/cpuid_parse_test.go | 2 +- pkg/cpuid/cpuid_test.go | 2 +- pkg/dhcp/client.go | 2 +- pkg/dhcp/dhcp.go | 2 +- pkg/dhcp/dhcp_string.go | 2 +- pkg/dhcp/dhcp_test.go | 2 +- pkg/dhcp/server.go | 2 +- pkg/eventchannel/event.go | 2 +- pkg/eventchannel/event.proto | 2 +- pkg/fd/fd.go | 2 +- pkg/fd/fd_test.go | 2 +- pkg/fdnotifier/fdnotifier.go | 2 +- pkg/fdnotifier/poll_unsafe.go | 2 +- pkg/gate/gate.go | 2 +- pkg/gate/gate_test.go | 2 +- pkg/ilist/list.go | 2 +- pkg/ilist/list_test.go | 2 +- pkg/linewriter/linewriter.go | 2 +- pkg/linewriter/linewriter_test.go | 2 +- pkg/log/glog.go | 2 +- pkg/log/glog_unsafe.go | 2 +- pkg/log/json.go | 2 +- pkg/log/json_k8s.go | 2 +- pkg/log/json_test.go | 2 +- pkg/log/log.go | 2 +- pkg/log/log_test.go | 2 +- pkg/metric/metric.go | 2 +- pkg/metric/metric.proto | 2 +- pkg/metric/metric_test.go | 2 +- pkg/p9/buffer.go | 2 +- pkg/p9/buffer_test.go | 2 +- pkg/p9/client.go | 2 +- pkg/p9/client_file.go | 2 +- pkg/p9/client_test.go | 2 +- pkg/p9/file.go | 2 +- pkg/p9/handlers.go | 2 +- 
pkg/p9/local_server/local_server.go | 2 +- pkg/p9/messages.go | 2 +- pkg/p9/messages_test.go | 2 +- pkg/p9/p9.go | 2 +- pkg/p9/p9_test.go | 2 +- pkg/p9/p9test/client_test.go | 2 +- pkg/p9/p9test/p9test.go | 2 +- pkg/p9/path_tree.go | 2 +- pkg/p9/pool.go | 2 +- pkg/p9/pool_test.go | 2 +- pkg/p9/server.go | 2 +- pkg/p9/transport.go | 2 +- pkg/p9/transport_test.go | 2 +- pkg/p9/version.go | 2 +- pkg/p9/version_test.go | 2 +- pkg/rand/rand.go | 2 +- pkg/rand/rand_linux.go | 2 +- pkg/refs/refcounter.go | 2 +- pkg/refs/refcounter_state.go | 2 +- pkg/refs/refcounter_test.go | 2 +- pkg/seccomp/seccomp.go | 2 +- pkg/seccomp/seccomp_rules.go | 2 +- pkg/seccomp/seccomp_test.go | 2 +- pkg/seccomp/seccomp_test_victim.go | 2 +- pkg/seccomp/seccomp_unsafe.go | 2 +- pkg/secio/full_reader.go | 2 +- pkg/secio/secio.go | 2 +- pkg/secio/secio_test.go | 2 +- pkg/segment/range.go | 2 +- pkg/segment/set.go | 2 +- pkg/segment/set_state.go | 2 +- pkg/segment/test/segment_test.go | 2 +- pkg/segment/test/set_functions.go | 2 +- pkg/sentry/arch/aligned.go | 2 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_amd64.go | 2 +- pkg/sentry/arch/arch_amd64.s | 2 +- pkg/sentry/arch/arch_state_x86.go | 2 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/auxv.go | 2 +- pkg/sentry/arch/registers.proto | 2 +- pkg/sentry/arch/signal_act.go | 2 +- pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/arch/signal_info.go | 2 +- pkg/sentry/arch/signal_stack.go | 2 +- pkg/sentry/arch/stack.go | 2 +- pkg/sentry/arch/syscalls_amd64.go | 2 +- pkg/sentry/context/context.go | 2 +- pkg/sentry/context/contexttest/contexttest.go | 2 +- pkg/sentry/control/control.go | 2 +- pkg/sentry/control/pprof.go | 2 +- pkg/sentry/control/proc.go | 2 +- pkg/sentry/control/proc_test.go | 2 +- pkg/sentry/control/state.go | 2 +- pkg/sentry/device/device.go | 2 +- pkg/sentry/device/device_test.go | 2 +- pkg/sentry/fs/anon/anon.go | 2 +- pkg/sentry/fs/anon/device.go | 2 +- pkg/sentry/fs/ashmem/area.go | 2 +- pkg/sentry/fs/ashmem/device.go | 2 +- pkg/sentry/fs/ashmem/pin_board.go | 2 +- pkg/sentry/fs/ashmem/pin_board_test.go | 2 +- pkg/sentry/fs/attr.go | 2 +- pkg/sentry/fs/binder/binder.go | 2 +- pkg/sentry/fs/context.go | 2 +- pkg/sentry/fs/copy_up.go | 2 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dentry.go | 2 +- pkg/sentry/fs/dev/dev.go | 2 +- pkg/sentry/fs/dev/device.go | 2 +- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/dev/full.go | 2 +- pkg/sentry/fs/dev/null.go | 2 +- pkg/sentry/fs/dev/random.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_cache.go | 2 +- pkg/sentry/fs/dirent_cache_limiter.go | 2 +- pkg/sentry/fs/dirent_cache_test.go | 2 +- pkg/sentry/fs/dirent_refs_test.go | 2 +- pkg/sentry/fs/dirent_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 2 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 2 +- pkg/sentry/fs/file_state.go | 2 +- pkg/sentry/fs/file_test.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/filetest/filetest.go | 2 +- pkg/sentry/fs/flags.go | 2 +- pkg/sentry/fs/fs.go | 2 +- pkg/sentry/fs/fsutil/dirty_set.go | 2 +- pkg/sentry/fs/fsutil/dirty_set_test.go | 2 +- pkg/sentry/fs/fsutil/file.go | 2 +- pkg/sentry/fs/fsutil/file_range_set.go | 2 +- pkg/sentry/fs/fsutil/frame_ref_set.go | 2 +- pkg/sentry/fs/fsutil/fsutil.go | 2 +- 
pkg/sentry/fs/fsutil/host_file_mapper.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_state.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 2 +- pkg/sentry/fs/fsutil/host_mappable.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 2 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- pkg/sentry/fs/fsutil/inode_cached_test.go | 2 +- pkg/sentry/fs/gofer/attr.go | 2 +- pkg/sentry/fs/gofer/cache_policy.go | 2 +- pkg/sentry/fs/gofer/context_file.go | 2 +- pkg/sentry/fs/gofer/device.go | 2 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/gofer_test.go | 2 +- pkg/sentry/fs/gofer/handles.go | 2 +- pkg/sentry/fs/gofer/inode.go | 2 +- pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/gofer/socket.go | 2 +- pkg/sentry/fs/gofer/util.go | 2 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/descriptor.go | 2 +- pkg/sentry/fs/host/descriptor_state.go | 2 +- pkg/sentry/fs/host/descriptor_test.go | 2 +- pkg/sentry/fs/host/device.go | 2 +- pkg/sentry/fs/host/file.go | 2 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 2 +- pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/host/inode_state.go | 2 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/ioctl_unsafe.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/socket_iovec.go | 2 +- pkg/sentry/fs/host/socket_state.go | 2 +- pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/fs/host/socket_unsafe.go | 2 +- pkg/sentry/fs/host/tty.go | 2 +- pkg/sentry/fs/host/util.go | 2 +- pkg/sentry/fs/host/util_unsafe.go | 2 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_inotify.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 2 +- pkg/sentry/fs/inode_overlay_test.go | 2 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/fs/inotify_event.go | 2 +- pkg/sentry/fs/inotify_watch.go | 2 +- pkg/sentry/fs/lock/lock.go | 2 +- pkg/sentry/fs/lock/lock_range_test.go | 2 +- pkg/sentry/fs/lock/lock_set_functions.go | 2 +- pkg/sentry/fs/lock/lock_test.go | 2 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 2 +- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/offset.go | 2 +- pkg/sentry/fs/overlay.go | 2 +- pkg/sentry/fs/path.go | 2 +- pkg/sentry/fs/path_test.go | 2 +- pkg/sentry/fs/proc/cpuinfo.go | 2 +- pkg/sentry/fs/proc/device/device.go | 2 +- pkg/sentry/fs/proc/exec_args.go | 2 +- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/filesystems.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/proc/inode.go | 2 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 2 +- pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 2 +- pkg/sentry/fs/proc/net_test.go | 2 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/rpcinet_proc.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 2 +- pkg/sentry/fs/proc/stat.go | 2 +- pkg/sentry/fs/proc/sys.go | 2 +- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/proc/sys_net_state.go | 2 +- pkg/sentry/fs/proc/sys_net_test.go | 2 +- pkg/sentry/fs/proc/task.go | 2 +- pkg/sentry/fs/proc/uid_gid_map.go | 2 +- pkg/sentry/fs/proc/uptime.go | 2 +- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 2 +- 
pkg/sentry/fs/ramfs/socket.go | 2 +- pkg/sentry/fs/ramfs/symlink.go | 2 +- pkg/sentry/fs/ramfs/tree.go | 2 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/restore.go | 2 +- pkg/sentry/fs/save.go | 2 +- pkg/sentry/fs/seek.go | 2 +- pkg/sentry/fs/sync.go | 2 +- pkg/sentry/fs/sys/device.go | 2 +- pkg/sentry/fs/sys/devices.go | 2 +- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/sys/sys.go | 2 +- pkg/sentry/fs/timerfd/timerfd.go | 2 +- pkg/sentry/fs/tmpfs/device.go | 2 +- pkg/sentry/fs/tmpfs/file_regular.go | 2 +- pkg/sentry/fs/tmpfs/file_test.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/fs/tty/dir.go | 2 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/line_discipline.go | 2 +- pkg/sentry/fs/tty/master.go | 2 +- pkg/sentry/fs/tty/queue.go | 2 +- pkg/sentry/fs/tty/slave.go | 2 +- pkg/sentry/fs/tty/terminal.go | 2 +- pkg/sentry/fs/tty/tty_test.go | 2 +- pkg/sentry/hostcpu/getcpu_amd64.s | 2 +- pkg/sentry/hostcpu/hostcpu.go | 2 +- pkg/sentry/hostcpu/hostcpu_test.go | 2 +- pkg/sentry/inet/context.go | 2 +- pkg/sentry/inet/inet.go | 2 +- pkg/sentry/inet/test_stack.go | 2 +- pkg/sentry/kernel/abstract_socket_namespace.go | 2 +- pkg/sentry/kernel/auth/auth.go | 2 +- pkg/sentry/kernel/auth/capability_set.go | 2 +- pkg/sentry/kernel/auth/context.go | 2 +- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/auth/id.go | 2 +- pkg/sentry/kernel/auth/id_map.go | 2 +- pkg/sentry/kernel/auth/id_map_functions.go | 2 +- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/context.go | 2 +- pkg/sentry/kernel/contexttest/contexttest.go | 2 +- pkg/sentry/kernel/epoll/epoll.go | 2 +- pkg/sentry/kernel/epoll/epoll_state.go | 2 +- pkg/sentry/kernel/epoll/epoll_test.go | 2 +- pkg/sentry/kernel/eventfd/eventfd.go | 2 +- pkg/sentry/kernel/eventfd/eventfd_test.go | 2 +- pkg/sentry/kernel/fasync/fasync.go | 2 +- pkg/sentry/kernel/fd_map.go | 2 +- pkg/sentry/kernel/fd_map_test.go | 2 +- pkg/sentry/kernel/fs_context.go | 2 +- pkg/sentry/kernel/futex/futex.go | 2 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/ipc_namespace.go | 2 +- pkg/sentry/kernel/kdefs/kdefs.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/kernel_state.go | 2 +- pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/memevent/memory_events.proto | 2 +- pkg/sentry/kernel/pending_signals.go | 2 +- pkg/sentry/kernel/pending_signals_state.go | 2 +- pkg/sentry/kernel/pipe/buffers.go | 2 +- pkg/sentry/kernel/pipe/device.go | 2 +- pkg/sentry/kernel/pipe/node.go | 2 +- pkg/sentry/kernel/pipe/node_test.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_test.go | 2 +- pkg/sentry/kernel/pipe/reader.go | 2 +- pkg/sentry/kernel/pipe/reader_writer.go | 2 +- pkg/sentry/kernel/pipe/writer.go | 2 +- pkg/sentry/kernel/posixtimer.go | 2 +- pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/ptrace_amd64.go | 2 +- pkg/sentry/kernel/ptrace_arm64.go | 2 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/sched/cpuset.go | 2 +- pkg/sentry/kernel/sched/cpuset_test.go | 2 +- pkg/sentry/kernel/sched/sched.go | 2 +- pkg/sentry/kernel/seccomp.go | 2 +- pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/semaphore/semaphore_test.go | 2 +- pkg/sentry/kernel/sessions.go | 2 +- pkg/sentry/kernel/shm/device.go | 2 +- pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/signal.go | 2 +- pkg/sentry/kernel/signal_handlers.go | 2 +- pkg/sentry/kernel/syscalls.go | 2 +- 
pkg/sentry/kernel/syscalls_state.go | 2 +- pkg/sentry/kernel/syslog.go | 2 +- pkg/sentry/kernel/table_test.go | 2 +- pkg/sentry/kernel/task.go | 2 +- pkg/sentry/kernel/task_acct.go | 2 +- pkg/sentry/kernel/task_block.go | 2 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/kernel/task_futex.go | 2 +- pkg/sentry/kernel/task_identity.go | 2 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_net.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_sched.go | 2 +- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_start.go | 2 +- pkg/sentry/kernel/task_stop.go | 2 +- pkg/sentry/kernel/task_syscall.go | 2 +- pkg/sentry/kernel/task_test.go | 2 +- pkg/sentry/kernel/task_usermem.go | 2 +- pkg/sentry/kernel/thread_group.go | 2 +- pkg/sentry/kernel/threads.go | 2 +- pkg/sentry/kernel/time/context.go | 2 +- pkg/sentry/kernel/time/time.go | 2 +- pkg/sentry/kernel/timekeeper.go | 2 +- pkg/sentry/kernel/timekeeper_state.go | 2 +- pkg/sentry/kernel/timekeeper_test.go | 2 +- pkg/sentry/kernel/uncaught_signal.proto | 2 +- pkg/sentry/kernel/uts_namespace.go | 2 +- pkg/sentry/kernel/vdso.go | 2 +- pkg/sentry/kernel/version.go | 2 +- pkg/sentry/limits/context.go | 2 +- pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/limits_test.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/loader/elf.go | 2 +- pkg/sentry/loader/interpreter.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/loader/vdso.go | 2 +- pkg/sentry/loader/vdso_state.go | 2 +- pkg/sentry/memmap/mapping_set.go | 2 +- pkg/sentry/memmap/mapping_set_test.go | 2 +- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/memutil/memutil.go | 2 +- pkg/sentry/memutil/memutil_unsafe.go | 2 +- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/aio_context.go | 2 +- pkg/sentry/mm/aio_context_state.go | 2 +- pkg/sentry/mm/debug.go | 2 +- pkg/sentry/mm/io.go | 2 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/metadata.go | 2 +- pkg/sentry/mm/mm.go | 2 +- pkg/sentry/mm/mm_test.go | 2 +- pkg/sentry/mm/pma.go | 2 +- pkg/sentry/mm/procfs.go | 2 +- pkg/sentry/mm/save_restore.go | 2 +- pkg/sentry/mm/shm.go | 2 +- pkg/sentry/mm/special_mappable.go | 2 +- pkg/sentry/mm/syscalls.go | 2 +- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/pgalloc/context.go | 2 +- pkg/sentry/pgalloc/pgalloc.go | 2 +- pkg/sentry/pgalloc/pgalloc_test.go | 2 +- pkg/sentry/pgalloc/pgalloc_unsafe.go | 2 +- pkg/sentry/pgalloc/save_restore.go | 2 +- pkg/sentry/platform/context.go | 2 +- pkg/sentry/platform/interrupt/interrupt.go | 2 +- pkg/sentry/platform/interrupt/interrupt_test.go | 2 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/allocator.go | 2 +- pkg/sentry/platform/kvm/bluepill.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.s | 2 +- pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/bluepill_fault.go | 2 +- pkg/sentry/platform/kvm/bluepill_unsafe.go | 2 +- pkg/sentry/platform/kvm/context.go | 2 +- pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/kvm_const.go | 2 +- pkg/sentry/platform/kvm/kvm_test.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/kvm/machine_amd64.go | 2 +- pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/machine_unsafe.go | 2 +- 
pkg/sentry/platform/kvm/physical_map.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.s | 2 +- pkg/sentry/platform/kvm/virtual_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map_test.go | 2 +- pkg/sentry/platform/mmap_min_addr.go | 2 +- pkg/sentry/platform/platform.go | 2 +- pkg/sentry/platform/procid/procid.go | 2 +- pkg/sentry/platform/procid/procid_amd64.s | 2 +- pkg/sentry/platform/procid/procid_arm64.s | 2 +- pkg/sentry/platform/procid/procid_net_test.go | 2 +- pkg/sentry/platform/procid/procid_test.go | 2 +- pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 +- pkg/sentry/platform/ptrace/stub_amd64.s | 2 +- pkg/sentry/platform/ptrace/stub_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ptrace/subprocess_amd64.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess_unsafe.go | 2 +- pkg/sentry/platform/ring0/defs.go | 2 +- pkg/sentry/platform/ring0/defs_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.s | 2 +- pkg/sentry/platform/ring0/gen_offsets/main.go | 2 +- pkg/sentry/platform/ring0/kernel.go | 2 +- pkg/sentry/platform/ring0/kernel_amd64.go | 2 +- pkg/sentry/platform/ring0/kernel_unsafe.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.s | 2 +- pkg/sentry/platform/ring0/offsets_amd64.go | 2 +- pkg/sentry/platform/ring0/pagetables/allocator.go | 2 +- pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/walker_amd64.go | 2 +- pkg/sentry/platform/ring0/ring0.go | 2 +- pkg/sentry/platform/ring0/x86.go | 2 +- pkg/sentry/platform/safecopy/atomic_amd64.s | 2 +- pkg/sentry/platform/safecopy/safecopy.go | 2 +- pkg/sentry/platform/safecopy/safecopy_test.go | 2 +- pkg/sentry/platform/safecopy/safecopy_unsafe.go | 2 +- pkg/sentry/platform/safecopy/sighandler_amd64.s | 2 +- pkg/sentry/platform/safecopy/sighandler_arm64.s | 2 +- pkg/sentry/safemem/block_unsafe.go | 2 +- pkg/sentry/safemem/io.go | 2 +- pkg/sentry/safemem/io_test.go | 2 +- pkg/sentry/safemem/safemem.go | 2 +- pkg/sentry/safemem/seq_test.go | 2 +- pkg/sentry/safemem/seq_unsafe.go | 2 +- pkg/sentry/sighandling/sighandling.go | 2 +- pkg/sentry/sighandling/sighandling_unsafe.go | 2 +- pkg/sentry/socket/control/control.go | 2 +- pkg/sentry/socket/epsocket/device.go | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 2 +- pkg/sentry/socket/epsocket/provider.go | 2 +- pkg/sentry/socket/epsocket/save_restore.go | 2 +- pkg/sentry/socket/epsocket/stack.go | 2 +- pkg/sentry/socket/hostinet/device.go | 2 +- pkg/sentry/socket/hostinet/hostinet.go | 2 +- pkg/sentry/socket/hostinet/save_restore.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 2 +- pkg/sentry/socket/hostinet/socket_unsafe.go | 2 +- pkg/sentry/socket/hostinet/stack.go | 2 +- pkg/sentry/socket/netlink/message.go | 2 +- pkg/sentry/socket/netlink/port/port.go | 2 +- 
pkg/sentry/socket/netlink/port/port_test.go | 2 +- pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 2 +- pkg/sentry/socket/netlink/socket.go | 2 +- pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/device.go | 2 +- pkg/sentry/socket/rpcinet/notifier/notifier.go | 2 +- pkg/sentry/socket/rpcinet/rpcinet.go | 2 +- pkg/sentry/socket/rpcinet/socket.go | 2 +- pkg/sentry/socket/rpcinet/stack.go | 2 +- pkg/sentry/socket/rpcinet/stack_unsafe.go | 2 +- pkg/sentry/socket/socket.go | 2 +- pkg/sentry/socket/unix/device.go | 2 +- pkg/sentry/socket/unix/io.go | 2 +- pkg/sentry/socket/unix/transport/connectioned.go | 2 +- pkg/sentry/socket/unix/transport/connectioned_state.go | 2 +- pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/queue.go | 2 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/socket/unix/unix.go | 2 +- pkg/sentry/state/state.go | 2 +- pkg/sentry/state/state_metadata.go | 2 +- pkg/sentry/state/state_unsafe.go | 2 +- pkg/sentry/strace/capability.go | 2 +- pkg/sentry/strace/clone.go | 2 +- pkg/sentry/strace/futex.go | 2 +- pkg/sentry/strace/linux64.go | 2 +- pkg/sentry/strace/open.go | 2 +- pkg/sentry/strace/poll.go | 2 +- pkg/sentry/strace/ptrace.go | 2 +- pkg/sentry/strace/signal.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/strace/strace.proto | 2 +- pkg/sentry/strace/syscalls.go | 2 +- pkg/sentry/syscalls/epoll.go | 2 +- pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/syscalls/linux/flags.go | 2 +- pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sigset.go | 2 +- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_capability.go | 2 +- pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 2 +- pkg/sentry/syscalls/linux/sys_futex.go | 2 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_identity.go | 2 +- pkg/sentry/syscalls/linux/sys_inotify.go | 2 +- pkg/sentry/syscalls/linux/sys_lseek.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 2 +- pkg/sentry/syscalls/linux/sys_prctl.go | 2 +- pkg/sentry/syscalls/linux/sys_random.go | 2 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_rlimit.go | 2 +- pkg/sentry/syscalls/linux/sys_rusage.go | 2 +- pkg/sentry/syscalls/linux/sys_sched.go | 2 +- pkg/sentry/syscalls/linux/sys_seccomp.go | 2 +- pkg/sentry/syscalls/linux/sys_sem.go | 2 +- pkg/sentry/syscalls/linux/sys_shm.go | 2 +- pkg/sentry/syscalls/linux/sys_signal.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 2 +- pkg/sentry/syscalls/linux/sys_sync.go | 2 +- pkg/sentry/syscalls/linux/sys_sysinfo.go | 2 +- pkg/sentry/syscalls/linux/sys_syslog.go | 2 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_time.go | 2 +- pkg/sentry/syscalls/linux/sys_timer.go | 2 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 2 +- pkg/sentry/syscalls/linux/sys_tls.go | 2 +- pkg/sentry/syscalls/linux/sys_utsname.go | 2 +- pkg/sentry/syscalls/linux/sys_write.go | 2 +- pkg/sentry/syscalls/linux/timespec.go | 2 +- pkg/sentry/syscalls/syscalls.go | 2 +- pkg/sentry/time/calibrated_clock.go | 2 +- pkg/sentry/time/calibrated_clock_test.go | 2 +- pkg/sentry/time/clock_id.go | 
2 +- pkg/sentry/time/clocks.go | 2 +- pkg/sentry/time/muldiv_amd64.s | 2 +- pkg/sentry/time/muldiv_arm64.s | 2 +- pkg/sentry/time/parameters.go | 2 +- pkg/sentry/time/parameters_test.go | 2 +- pkg/sentry/time/sampler.go | 2 +- pkg/sentry/time/sampler_test.go | 2 +- pkg/sentry/time/sampler_unsafe.go | 2 +- pkg/sentry/time/tsc_amd64.s | 2 +- pkg/sentry/time/tsc_arm64.s | 2 +- pkg/sentry/unimpl/events.go | 2 +- pkg/sentry/unimpl/unimplemented_syscall.proto | 2 +- pkg/sentry/uniqueid/context.go | 2 +- pkg/sentry/usage/cpu.go | 2 +- pkg/sentry/usage/io.go | 2 +- pkg/sentry/usage/memory.go | 2 +- pkg/sentry/usage/memory_unsafe.go | 2 +- pkg/sentry/usage/usage.go | 2 +- pkg/sentry/usermem/access_type.go | 2 +- pkg/sentry/usermem/addr.go | 2 +- pkg/sentry/usermem/addr_range_seq_test.go | 2 +- pkg/sentry/usermem/addr_range_seq_unsafe.go | 2 +- pkg/sentry/usermem/bytes_io.go | 2 +- pkg/sentry/usermem/bytes_io_unsafe.go | 2 +- pkg/sentry/usermem/usermem.go | 2 +- pkg/sentry/usermem/usermem_arm64.go | 2 +- pkg/sentry/usermem/usermem_test.go | 2 +- pkg/sentry/usermem/usermem_unsafe.go | 2 +- pkg/sentry/usermem/usermem_x86.go | 2 +- pkg/sentry/watchdog/watchdog.go | 2 +- pkg/sleep/commit_amd64.s | 2 +- pkg/sleep/commit_asm.go | 2 +- pkg/sleep/commit_noasm.go | 2 +- pkg/sleep/empty.s | 2 +- pkg/sleep/sleep_test.go | 2 +- pkg/sleep/sleep_unsafe.go | 2 +- pkg/state/decode.go | 2 +- pkg/state/encode.go | 2 +- pkg/state/encode_unsafe.go | 2 +- pkg/state/map.go | 2 +- pkg/state/object.proto | 2 +- pkg/state/printer.go | 2 +- pkg/state/state.go | 2 +- pkg/state/state_test.go | 2 +- pkg/state/statefile/statefile.go | 2 +- pkg/state/statefile/statefile_test.go | 2 +- pkg/state/stats.go | 2 +- pkg/syserr/host_linux.go | 2 +- pkg/syserr/netstack.go | 2 +- pkg/syserr/syserr.go | 2 +- pkg/syserror/syserror.go | 2 +- pkg/syserror/syserror_test.go | 2 +- pkg/tcpip/adapters/gonet/gonet.go | 2 +- pkg/tcpip/adapters/gonet/gonet_test.go | 2 +- pkg/tcpip/buffer/prependable.go | 2 +- pkg/tcpip/buffer/view.go | 2 +- pkg/tcpip/buffer/view_test.go | 2 +- pkg/tcpip/checker/checker.go | 2 +- pkg/tcpip/hash/jenkins/jenkins.go | 2 +- pkg/tcpip/hash/jenkins/jenkins_test.go | 2 +- pkg/tcpip/header/arp.go | 2 +- pkg/tcpip/header/checksum.go | 2 +- pkg/tcpip/header/eth.go | 2 +- pkg/tcpip/header/gue.go | 2 +- pkg/tcpip/header/icmpv4.go | 2 +- pkg/tcpip/header/icmpv6.go | 2 +- pkg/tcpip/header/interfaces.go | 2 +- pkg/tcpip/header/ipv4.go | 2 +- pkg/tcpip/header/ipv6.go | 2 +- pkg/tcpip/header/ipv6_fragment.go | 2 +- pkg/tcpip/header/ipversion_test.go | 2 +- pkg/tcpip/header/tcp.go | 2 +- pkg/tcpip/header/tcp_test.go | 2 +- pkg/tcpip/header/udp.go | 2 +- pkg/tcpip/link/channel/channel.go | 2 +- pkg/tcpip/link/fdbased/endpoint.go | 2 +- pkg/tcpip/link/fdbased/endpoint_test.go | 2 +- pkg/tcpip/link/fdbased/endpoint_unsafe.go | 2 +- pkg/tcpip/link/fdbased/mmap.go | 2 +- pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go | 2 +- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/link/muxed/injectable.go | 2 +- pkg/tcpip/link/muxed/injectable_test.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_amd64.s | 2 +- pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe.go | 2 +- pkg/tcpip/link/rawfile/errors.go | 2 +- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe_test.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/rx.go | 2 +- pkg/tcpip/link/sharedmem/pipe/tx.go | 2 +- 
pkg/tcpip/link/sharedmem/queue/queue_test.go | 2 +- pkg/tcpip/link/sharedmem/queue/rx.go | 2 +- pkg/tcpip/link/sharedmem/queue/tx.go | 2 +- pkg/tcpip/link/sharedmem/rx.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_test.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/tx.go | 2 +- pkg/tcpip/link/sniffer/pcap.go | 2 +- pkg/tcpip/link/sniffer/sniffer.go | 2 +- pkg/tcpip/link/tun/tun_unsafe.go | 2 +- pkg/tcpip/link/waitable/waitable.go | 2 +- pkg/tcpip/link/waitable/waitable_test.go | 2 +- pkg/tcpip/network/arp/arp.go | 2 +- pkg/tcpip/network/arp/arp_test.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap_test.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation_test.go | 2 +- pkg/tcpip/network/fragmentation/reassembler.go | 2 +- pkg/tcpip/network/fragmentation/reassembler_test.go | 2 +- pkg/tcpip/network/hash/hash.go | 2 +- pkg/tcpip/network/ip_test.go | 2 +- pkg/tcpip/network/ipv4/icmp.go | 2 +- pkg/tcpip/network/ipv4/ipv4.go | 2 +- pkg/tcpip/network/ipv4/ipv4_test.go | 2 +- pkg/tcpip/network/ipv6/icmp.go | 2 +- pkg/tcpip/network/ipv6/icmp_test.go | 2 +- pkg/tcpip/network/ipv6/ipv6.go | 2 +- pkg/tcpip/ports/ports.go | 2 +- pkg/tcpip/ports/ports_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/tcpip/seqnum/seqnum.go | 2 +- pkg/tcpip/stack/linkaddrcache.go | 2 +- pkg/tcpip/stack/linkaddrcache_test.go | 2 +- pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/registration.go | 2 +- pkg/tcpip/stack/route.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/stack/stack_test.go | 2 +- pkg/tcpip/stack/transport_demuxer.go | 2 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/tcpip_test.go | 2 +- pkg/tcpip/time.s | 2 +- pkg/tcpip/time_unsafe.go | 2 +- pkg/tcpip/transport/icmp/endpoint.go | 2 +- pkg/tcpip/transport/icmp/endpoint_state.go | 2 +- pkg/tcpip/transport/icmp/protocol.go | 2 +- pkg/tcpip/transport/raw/raw.go | 2 +- pkg/tcpip/transport/raw/state.go | 2 +- pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/cubic.go | 2 +- pkg/tcpip/transport/tcp/dual_stack_test.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/rcv.go | 2 +- pkg/tcpip/transport/tcp/reno.go | 2 +- pkg/tcpip/transport/tcp/sack.go | 2 +- pkg/tcpip/transport/tcp/sack_scoreboard.go | 2 +- pkg/tcpip/transport/tcp/sack_scoreboard_test.go | 2 +- pkg/tcpip/transport/tcp/segment.go | 2 +- pkg/tcpip/transport/tcp/segment_heap.go | 2 +- pkg/tcpip/transport/tcp/segment_queue.go | 2 +- pkg/tcpip/transport/tcp/segment_state.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- pkg/tcpip/transport/tcp/snd_state.go | 2 +- pkg/tcpip/transport/tcp/tcp_sack_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 2 +- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/tcp/timer.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 2 +- pkg/tcpip/transport/udp/endpoint_state.go | 2 +- pkg/tcpip/transport/udp/forwarder.go | 2 +- 
pkg/tcpip/transport/udp/protocol.go | 2 +- pkg/tcpip/transport/udp/udp_test.go | 2 +- pkg/tmutex/tmutex.go | 2 +- pkg/tmutex/tmutex_test.go | 2 +- pkg/unet/unet.go | 2 +- pkg/unet/unet_test.go | 2 +- pkg/unet/unet_unsafe.go | 2 +- pkg/urpc/urpc.go | 2 +- pkg/urpc/urpc_test.go | 2 +- pkg/waiter/waiter.go | 2 +- pkg/waiter/waiter_test.go | 2 +- runsc/boot/compat.go | 2 +- runsc/boot/compat_amd64.go | 2 +- runsc/boot/compat_test.go | 2 +- runsc/boot/config.go | 2 +- runsc/boot/controller.go | 2 +- runsc/boot/debug.go | 2 +- runsc/boot/events.go | 2 +- runsc/boot/fds.go | 2 +- runsc/boot/filter/config.go | 2 +- runsc/boot/filter/extra_filters.go | 2 +- runsc/boot/filter/extra_filters_msan.go | 2 +- runsc/boot/filter/extra_filters_race.go | 2 +- runsc/boot/filter/filter.go | 2 +- runsc/boot/fs.go | 2 +- runsc/boot/limits.go | 2 +- runsc/boot/loader.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/boot/network.go | 2 +- runsc/boot/strace.go | 2 +- runsc/cgroup/cgroup.go | 2 +- runsc/cgroup/cgroup_test.go | 2 +- runsc/cmd/boot.go | 2 +- runsc/cmd/capability.go | 2 +- runsc/cmd/capability_test.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/chroot.go | 2 +- runsc/cmd/cmd.go | 2 +- runsc/cmd/create.go | 2 +- runsc/cmd/debug.go | 2 +- runsc/cmd/delete.go | 2 +- runsc/cmd/delete_test.go | 2 +- runsc/cmd/do.go | 2 +- runsc/cmd/events.go | 2 +- runsc/cmd/exec.go | 2 +- runsc/cmd/exec_test.go | 2 +- runsc/cmd/gofer.go | 2 +- runsc/cmd/gofer_test.go | 2 +- runsc/cmd/kill.go | 2 +- runsc/cmd/list.go | 2 +- runsc/cmd/path.go | 2 +- runsc/cmd/pause.go | 2 +- runsc/cmd/ps.go | 2 +- runsc/cmd/restore.go | 2 +- runsc/cmd/resume.go | 2 +- runsc/cmd/run.go | 2 +- runsc/cmd/spec.go | 2 +- runsc/cmd/start.go | 2 +- runsc/cmd/state.go | 2 +- runsc/cmd/wait.go | 2 +- runsc/console/console.go | 2 +- runsc/container/console_test.go | 2 +- runsc/container/container.go | 2 +- runsc/container/container_test.go | 2 +- runsc/container/hook.go | 2 +- runsc/container/multi_container_test.go | 2 +- runsc/container/shared_volume_test.go | 2 +- runsc/container/status.go | 2 +- runsc/container/test_app.go | 2 +- runsc/fsgofer/filter/config.go | 2 +- runsc/fsgofer/filter/extra_filters.go | 2 +- runsc/fsgofer/filter/extra_filters_msan.go | 2 +- runsc/fsgofer/filter/extra_filters_race.go | 2 +- runsc/fsgofer/filter/filter.go | 2 +- runsc/fsgofer/fsgofer.go | 2 +- runsc/fsgofer/fsgofer_test.go | 2 +- runsc/fsgofer/fsgofer_unsafe.go | 2 +- runsc/main.go | 2 +- runsc/sandbox/network.go | 2 +- runsc/sandbox/network_unsafe.go | 2 +- runsc/sandbox/sandbox.go | 2 +- runsc/specutils/fs.go | 2 +- runsc/specutils/namespace.go | 2 +- runsc/specutils/specutils.go | 2 +- runsc/specutils/specutils_test.go | 2 +- runsc/test/image/image.go | 2 +- runsc/test/image/image_test.go | 2 +- runsc/test/image/mysql.sql | 2 +- runsc/test/image/ruby.rb | 2 +- runsc/test/image/ruby.sh | 2 +- runsc/test/install.sh | 2 +- runsc/test/integration/exec_test.go | 2 +- runsc/test/integration/integration.go | 2 +- runsc/test/integration/integration_test.go | 2 +- runsc/test/root/cgroup_test.go | 2 +- runsc/test/root/chroot_test.go | 2 +- runsc/test/root/crictl_test.go | 2 +- runsc/test/root/root.go | 2 +- runsc/test/root/testdata/busybox.go | 2 +- runsc/test/root/testdata/containerd_config.go | 2 +- runsc/test/root/testdata/httpd.go | 2 +- runsc/test/root/testdata/httpd_mount_paths.go | 2 +- runsc/test/root/testdata/sandbox.go | 2 +- runsc/test/testutil/crictl.go | 2 +- runsc/test/testutil/docker.go | 2 +- runsc/test/testutil/testutil.go | 2 +- 
runsc/test/testutil/testutil_race.go | 2 +- runsc/tools/dockercfg/dockercfg.go | 2 +- runsc/version.go | 2 +- test/syscalls/gtest/gtest.go | 2 +- test/syscalls/linux/32bit.cc | 2 +- test/syscalls/linux/accept_bind.cc | 2 +- test/syscalls/linux/accept_bind_stream.cc | 2 +- test/syscalls/linux/access.cc | 2 +- test/syscalls/linux/affinity.cc | 2 +- test/syscalls/linux/aio.cc | 2 +- test/syscalls/linux/alarm.cc | 2 +- test/syscalls/linux/arch_prctl.cc | 2 +- test/syscalls/linux/bad.cc | 2 +- test/syscalls/linux/base_poll_test.cc | 2 +- test/syscalls/linux/base_poll_test.h | 2 +- test/syscalls/linux/bind.cc | 2 +- test/syscalls/linux/brk.cc | 2 +- test/syscalls/linux/chdir.cc | 2 +- test/syscalls/linux/chmod.cc | 2 +- test/syscalls/linux/chown.cc | 2 +- test/syscalls/linux/chroot.cc | 2 +- test/syscalls/linux/clock_getres.cc | 2 +- test/syscalls/linux/clock_gettime.cc | 2 +- test/syscalls/linux/clock_nanosleep.cc | 2 +- test/syscalls/linux/concurrency.cc | 2 +- test/syscalls/linux/creat.cc | 2 +- test/syscalls/linux/dev.cc | 2 +- test/syscalls/linux/dup.cc | 2 +- test/syscalls/linux/epoll.cc | 2 +- test/syscalls/linux/eventfd.cc | 2 +- test/syscalls/linux/exceptions.cc | 2 +- test/syscalls/linux/exec.cc | 2 +- test/syscalls/linux/exec.h | 2 +- test/syscalls/linux/exec_assert_closed_workload.cc | 2 +- test/syscalls/linux/exec_basic_workload.cc | 2 +- test/syscalls/linux/exec_binary.cc | 2 +- test/syscalls/linux/exec_proc_exe_workload.cc | 2 +- test/syscalls/linux/exec_state_workload.cc | 2 +- test/syscalls/linux/exit.cc | 2 +- test/syscalls/linux/exit_script.sh | 2 +- test/syscalls/linux/fadvise64.cc | 2 +- test/syscalls/linux/fallocate.cc | 2 +- test/syscalls/linux/fault.cc | 2 +- test/syscalls/linux/fchdir.cc | 2 +- test/syscalls/linux/fcntl.cc | 2 +- test/syscalls/linux/file_base.h | 2 +- test/syscalls/linux/flock.cc | 2 +- test/syscalls/linux/fork.cc | 2 +- test/syscalls/linux/fpsig_fork.cc | 2 +- test/syscalls/linux/fpsig_nested.cc | 2 +- test/syscalls/linux/fsync.cc | 2 +- test/syscalls/linux/futex.cc | 2 +- test/syscalls/linux/getcpu.cc | 2 +- test/syscalls/linux/getdents.cc | 2 +- test/syscalls/linux/getrandom.cc | 2 +- test/syscalls/linux/getrusage.cc | 2 +- test/syscalls/linux/inotify.cc | 2 +- test/syscalls/linux/ioctl.cc | 2 +- test/syscalls/linux/ip_socket_test_util.cc | 2 +- test/syscalls/linux/ip_socket_test_util.h | 2 +- test/syscalls/linux/itimer.cc | 2 +- test/syscalls/linux/kill.cc | 2 +- test/syscalls/linux/link.cc | 2 +- test/syscalls/linux/lseek.cc | 2 +- test/syscalls/linux/madvise.cc | 2 +- test/syscalls/linux/memfd.cc | 2 +- test/syscalls/linux/memory_accounting.cc | 2 +- test/syscalls/linux/mempolicy.cc | 2 +- test/syscalls/linux/mincore.cc | 2 +- test/syscalls/linux/mkdir.cc | 2 +- test/syscalls/linux/mknod.cc | 2 +- test/syscalls/linux/mlock.cc | 2 +- test/syscalls/linux/mmap.cc | 2 +- test/syscalls/linux/mount.cc | 2 +- test/syscalls/linux/mremap.cc | 2 +- test/syscalls/linux/msync.cc | 2 +- test/syscalls/linux/munmap.cc | 2 +- test/syscalls/linux/open.cc | 2 +- test/syscalls/linux/open_create.cc | 2 +- test/syscalls/linux/partial_bad_buffer.cc | 2 +- test/syscalls/linux/pause.cc | 2 +- test/syscalls/linux/pipe.cc | 2 +- test/syscalls/linux/poll.cc | 2 +- test/syscalls/linux/ppoll.cc | 2 +- test/syscalls/linux/prctl.cc | 2 +- test/syscalls/linux/prctl_setuid.cc | 2 +- test/syscalls/linux/pread64.cc | 2 +- test/syscalls/linux/preadv.cc | 2 +- test/syscalls/linux/preadv2.cc | 2 +- test/syscalls/linux/priority.cc | 2 +- test/syscalls/linux/priority_execve.cc | 2 +- 
test/syscalls/linux/proc.cc | 2 +- test/syscalls/linux/proc_net.cc | 2 +- test/syscalls/linux/proc_net_unix.cc | 2 +- test/syscalls/linux/proc_pid_smaps.cc | 2 +- test/syscalls/linux/proc_pid_uid_gid_map.cc | 2 +- test/syscalls/linux/pselect.cc | 2 +- test/syscalls/linux/ptrace.cc | 2 +- test/syscalls/linux/pty.cc | 2 +- test/syscalls/linux/pwrite64.cc | 2 +- test/syscalls/linux/pwritev2.cc | 2 +- test/syscalls/linux/raw_socket_ipv4.cc | 2 +- test/syscalls/linux/read.cc | 2 +- test/syscalls/linux/readv.cc | 2 +- test/syscalls/linux/readv_common.cc | 2 +- test/syscalls/linux/readv_common.h | 2 +- test/syscalls/linux/readv_socket.cc | 2 +- test/syscalls/linux/rename.cc | 2 +- test/syscalls/linux/rlimits.cc | 2 +- test/syscalls/linux/rtsignal.cc | 2 +- test/syscalls/linux/sched.cc | 2 +- test/syscalls/linux/sched_yield.cc | 2 +- test/syscalls/linux/seccomp.cc | 2 +- test/syscalls/linux/select.cc | 2 +- test/syscalls/linux/semaphore.cc | 2 +- test/syscalls/linux/sendfile.cc | 2 +- test/syscalls/linux/sendfile_socket.cc | 2 +- test/syscalls/linux/shm.cc | 2 +- test/syscalls/linux/sigaction.cc | 2 +- test/syscalls/linux/sigaltstack.cc | 2 +- test/syscalls/linux/sigaltstack_check.cc | 2 +- test/syscalls/linux/sigiret.cc | 2 +- test/syscalls/linux/sigprocmask.cc | 2 +- test/syscalls/linux/sigstop.cc | 2 +- test/syscalls/linux/sigtimedwait.cc | 2 +- test/syscalls/linux/socket_abstract.cc | 2 +- test/syscalls/linux/socket_blocking.cc | 2 +- test/syscalls/linux/socket_blocking.h | 2 +- test/syscalls/linux/socket_filesystem.cc | 2 +- test/syscalls/linux/socket_generic.cc | 2 +- test/syscalls/linux/socket_generic.h | 2 +- test/syscalls/linux/socket_inet_loopback.cc | 2 +- test/syscalls/linux/socket_ip_loopback_blocking.cc | 2 +- test/syscalls/linux/socket_ip_tcp_generic.cc | 2 +- test/syscalls/linux/socket_ip_tcp_generic.h | 2 +- test/syscalls/linux/socket_ip_tcp_generic_loopback.cc | 2 +- test/syscalls/linux/socket_ip_tcp_loopback.cc | 2 +- test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc | 2 +- test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc | 2 +- test/syscalls/linux/socket_ip_tcp_udp_generic.cc | 2 +- test/syscalls/linux/socket_ip_udp_generic.cc | 2 +- test/syscalls/linux/socket_ip_udp_generic.h | 2 +- test/syscalls/linux/socket_ip_udp_loopback.cc | 2 +- test/syscalls/linux/socket_ip_udp_loopback_blocking.cc | 2 +- test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc | 2 +- .../syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc | 2 +- test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h | 2 +- .../linux/socket_ipv4_tcp_unbound_external_networking_test.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound.h | 2 +- .../syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h | 2 +- .../linux/socket_ipv4_udp_unbound_external_networking_test.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc | 2 +- test/syscalls/linux/socket_netdevice.cc | 2 +- test/syscalls/linux/socket_netlink_route.cc | 2 +- test/syscalls/linux/socket_netlink_util.cc | 2 +- test/syscalls/linux/socket_netlink_util.h | 2 +- test/syscalls/linux/socket_non_blocking.cc | 2 +- test/syscalls/linux/socket_non_blocking.h | 2 +- test/syscalls/linux/socket_non_stream.cc | 2 +- test/syscalls/linux/socket_non_stream.h | 2 +- test/syscalls/linux/socket_non_stream_blocking.cc | 2 +- test/syscalls/linux/socket_non_stream_blocking.h | 2 +- 
test/syscalls/linux/socket_stream.cc | 2 +- test/syscalls/linux/socket_stream.h | 2 +- test/syscalls/linux/socket_stream_blocking.cc | 2 +- test/syscalls/linux/socket_stream_blocking.h | 2 +- test/syscalls/linux/socket_stream_nonblock.cc | 2 +- test/syscalls/linux/socket_stream_nonblock.h | 2 +- test/syscalls/linux/socket_test_util.cc | 2 +- test/syscalls/linux/socket_test_util.h | 2 +- test/syscalls/linux/socket_unix.cc | 2 +- test/syscalls/linux/socket_unix.h | 2 +- test/syscalls/linux/socket_unix_abstract.cc | 2 +- test/syscalls/linux/socket_unix_abstract_nonblock.cc | 2 +- test/syscalls/linux/socket_unix_blocking_local.cc | 2 +- test/syscalls/linux/socket_unix_dgram.cc | 2 +- test/syscalls/linux/socket_unix_dgram.h | 2 +- test/syscalls/linux/socket_unix_dgram_local.cc | 2 +- test/syscalls/linux/socket_unix_dgram_non_blocking.cc | 2 +- test/syscalls/linux/socket_unix_domain.cc | 2 +- test/syscalls/linux/socket_unix_filesystem.cc | 2 +- test/syscalls/linux/socket_unix_filesystem_nonblock.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.h | 2 +- test/syscalls/linux/socket_unix_non_stream_blocking_local.cc | 2 +- test/syscalls/linux/socket_unix_pair.cc | 2 +- test/syscalls/linux/socket_unix_pair_nonblock.cc | 2 +- test/syscalls/linux/socket_unix_seqpacket.cc | 2 +- test/syscalls/linux/socket_unix_seqpacket.h | 2 +- test/syscalls/linux/socket_unix_seqpacket_local.cc | 2 +- test/syscalls/linux/socket_unix_stream.cc | 2 +- test/syscalls/linux/socket_unix_stream_blocking_local.cc | 2 +- test/syscalls/linux/socket_unix_stream_local.cc | 2 +- test/syscalls/linux/socket_unix_stream_nonblock_local.cc | 2 +- test/syscalls/linux/socket_unix_unbound_abstract.cc | 2 +- test/syscalls/linux/socket_unix_unbound_dgram.cc | 2 +- test/syscalls/linux/socket_unix_unbound_filesystem.cc | 2 +- test/syscalls/linux/socket_unix_unbound_seqpacket.cc | 2 +- test/syscalls/linux/socket_unix_unbound_stream.cc | 2 +- test/syscalls/linux/stat.cc | 2 +- test/syscalls/linux/stat_times.cc | 2 +- test/syscalls/linux/statfs.cc | 2 +- test/syscalls/linux/sticky.cc | 2 +- test/syscalls/linux/symlink.cc | 2 +- test/syscalls/linux/sync.cc | 2 +- test/syscalls/linux/sync_file_range.cc | 2 +- test/syscalls/linux/sysinfo.cc | 2 +- test/syscalls/linux/syslog.cc | 2 +- test/syscalls/linux/sysret.cc | 2 +- test/syscalls/linux/tcp_socket.cc | 2 +- test/syscalls/linux/temp_umask.h | 2 +- test/syscalls/linux/tgkill.cc | 2 +- test/syscalls/linux/time.cc | 2 +- test/syscalls/linux/timerfd.cc | 2 +- test/syscalls/linux/timers.cc | 2 +- test/syscalls/linux/tkill.cc | 2 +- test/syscalls/linux/truncate.cc | 2 +- test/syscalls/linux/udp_bind.cc | 2 +- test/syscalls/linux/udp_socket.cc | 2 +- test/syscalls/linux/uidgid.cc | 2 +- test/syscalls/linux/uname.cc | 2 +- test/syscalls/linux/unix_domain_socket_test_util.cc | 2 +- test/syscalls/linux/unix_domain_socket_test_util.h | 2 +- test/syscalls/linux/unlink.cc | 2 +- test/syscalls/linux/unshare.cc | 2 +- test/syscalls/linux/utimes.cc | 2 +- test/syscalls/linux/vdso.cc | 2 +- test/syscalls/linux/vdso_clock_gettime.cc | 2 +- test/syscalls/linux/vfork.cc | 2 +- test/syscalls/linux/vsyscall.cc | 2 +- test/syscalls/linux/wait.cc | 2 +- test/syscalls/linux/write.cc | 2 +- test/syscalls/syscall_test_runner.go | 2 +- test/syscalls/syscall_test_runner.sh | 2 +- test/util/capability_util.cc | 2 +- test/util/capability_util.h | 2 +- test/util/cleanup.h | 2 +- test/util/epoll_util.cc | 2 +- test/util/epoll_util.h | 2 +- test/util/eventfd_util.h | 2 +- 
test/util/file_descriptor.h | 2 +- test/util/fs_util.cc | 2 +- test/util/fs_util.h | 2 +- test/util/fs_util_test.cc | 2 +- test/util/logging.cc | 2 +- test/util/logging.h | 2 +- test/util/memory_util.h | 2 +- test/util/mount_util.h | 2 +- test/util/multiprocess_util.cc | 2 +- test/util/multiprocess_util.h | 2 +- test/util/posix_error.cc | 2 +- test/util/posix_error.h | 2 +- test/util/posix_error_test.cc | 2 +- test/util/proc_util.cc | 2 +- test/util/proc_util.h | 2 +- test/util/proc_util_test.cc | 2 +- test/util/rlimit_util.cc | 2 +- test/util/rlimit_util.h | 2 +- test/util/save_util.cc | 2 +- test/util/save_util.h | 2 +- test/util/signal_util.cc | 2 +- test/util/signal_util.h | 2 +- test/util/temp_path.cc | 2 +- test/util/temp_path.h | 2 +- test/util/test_main.cc | 2 +- test/util/test_util.cc | 2 +- test/util/test_util.h | 2 +- test/util/test_util_test.cc | 2 +- test/util/thread_util.h | 2 +- test/util/timer_util.cc | 2 +- test/util/timer_util.h | 2 +- third_party/gvsync/atomicptr_unsafe.go | 2 +- third_party/gvsync/atomicptrtest/atomicptr_test.go | 2 +- third_party/gvsync/downgradable_rwmutex_test.go | 2 +- third_party/gvsync/downgradable_rwmutex_unsafe.go | 2 +- third_party/gvsync/gvsync.go | 2 +- third_party/gvsync/memmove_unsafe.go | 2 +- third_party/gvsync/norace_unsafe.go | 2 +- third_party/gvsync/race_unsafe.go | 2 +- third_party/gvsync/seqatomic_unsafe.go | 2 +- third_party/gvsync/seqatomictest/seqatomic_test.go | 2 +- third_party/gvsync/seqcount.go | 2 +- third_party/gvsync/seqcount_test.go | 2 +- tools/go_generics/generics.go | 2 +- tools/go_generics/generics_tests/all_stmts/input.go | 2 +- tools/go_generics/generics_tests/all_stmts/output/output.go | 2 +- tools/go_generics/generics_tests/all_types/input.go | 2 +- tools/go_generics/generics_tests/all_types/lib/lib.go | 2 +- tools/go_generics/generics_tests/all_types/output/output.go | 2 +- tools/go_generics/generics_tests/consts/input.go | 2 +- tools/go_generics/generics_tests/consts/output/output.go | 2 +- tools/go_generics/generics_tests/imports/input.go | 2 +- tools/go_generics/generics_tests/imports/output/output.go | 2 +- tools/go_generics/generics_tests/remove_typedef/input.go | 2 +- tools/go_generics/generics_tests/remove_typedef/output/output.go | 2 +- tools/go_generics/generics_tests/simple/input.go | 2 +- tools/go_generics/generics_tests/simple/output/output.go | 2 +- tools/go_generics/globals/globals_visitor.go | 2 +- tools/go_generics/globals/scope.go | 2 +- tools/go_generics/go_generics_unittest.sh | 2 +- tools/go_generics/go_merge/main.go | 2 +- tools/go_generics/imports.go | 2 +- tools/go_generics/remove.go | 2 +- tools/go_generics/rules_tests/template.go | 2 +- tools/go_generics/rules_tests/template_test.go | 2 +- tools/go_stateify/main.go | 2 +- tools/tag_release.sh | 2 +- tools/workspace_status.sh | 2 +- vdso/barrier.h | 2 +- vdso/check_vdso.py | 2 +- vdso/compiler.h | 2 +- vdso/cycle_clock.h | 2 +- vdso/seqlock.h | 2 +- vdso/syscalls.h | 2 +- vdso/vdso.cc | 2 +- vdso/vdso_time.cc | 2 +- vdso/vdso_time.h | 2 +- 1235 files changed, 1242 insertions(+), 1234 deletions(-) create mode 100644 AUTHORS (limited to 'runsc') diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..01ba46567 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,8 @@ +# This is the list of gVisor authors for copyright purposes. +# +# This does not necessarily list everyone who has contributed code, since in +# some cases, their employer may be the copyright holder. 
To see the full list +# of contributors, see the revision history in source control. +# +# Please send a patch if you would like to be included in this list. +Google LLC diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh index 89e24b037..63fffda48 100755 --- a/kokoro/run_build.sh +++ b/kokoro/run_build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 8a3ce7402..08f678e39 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi.go b/pkg/abi/abi.go index 7770f0405..d56c481c9 100644 --- a/pkg/abi/abi.go +++ b/pkg/abi/abi.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go index 9d9f361a4..3059479bd 100644 --- a/pkg/abi/abi_linux.go +++ b/pkg/abi/abi_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/flag.go b/pkg/abi/flag.go index b48757da8..dcdd66d4e 100644 --- a/pkg/abi/flag.go +++ b/pkg/abi/flag.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 1b7ca714a..3c6e0079d 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ashmem.go b/pkg/abi/linux/ashmem.go index ced1e44d4..2a722abe0 100644 --- a/pkg/abi/linux/ashmem.go +++ b/pkg/abi/linux/ashmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/audit.go b/pkg/abi/linux/audit.go index b39ba4515..6cca69af9 100644 --- a/pkg/abi/linux/audit.go +++ b/pkg/abi/linux/audit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/binder.go b/pkg/abi/linux/binder.go index 522dc6f53..63b08324a 100644 --- a/pkg/abi/linux/binder.go +++ b/pkg/abi/linux/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
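[Editor's aside] The hunks above, and the hundreds of identical ones that follow, are the output of the bulk rewrite the commit message describes: `rg -l "Google LLC" | xargs sed -i 's/Google LLC.*/The gVisor Authors./'`, plus a manual pass for "Google Inc". Purely as an illustration of that step, here is a hedged C++17 sketch of an equivalent tree-wide header rewrite; the program name, paths, and the "rewrite every regular file under a root" approach are assumptions for the example, not part of the actual change, which used rg and sed.

// Illustrative only: rewrite "Google LLC..." license lines to
// "The gVisor Authors." across a source tree, approximating the
// rg | xargs sed one-liner quoted in the commit message.
#include <filesystem>
#include <fstream>
#include <iostream>
#include <regex>
#include <sstream>
#include <string>

namespace fs = std::filesystem;

int main(int argc, char** argv) {
  const fs::path root = argc > 1 ? argv[1] : ".";
  if (!fs::exists(root)) return 1;
  // Mirrors sed's 's/Google LLC.*/The gVisor Authors./' on each line:
  // ECMAScript '.' does not cross newlines, so only the tail of the
  // matching line is replaced.
  const std::regex pattern("Google LLC.*");
  for (const auto& entry : fs::recursive_directory_iterator(root)) {
    if (!entry.is_regular_file()) continue;
    std::ifstream in(entry.path());
    std::stringstream buf;
    buf << in.rdbuf();
    const std::string original = buf.str();
    const std::string rewritten =
        std::regex_replace(original, pattern, "The gVisor Authors.");
    if (rewritten != original) {  // Only touch files that matched, like rg -l.
      std::ofstream out(entry.path(), std::ios::trunc);
      out << rewritten;
      std::cout << "rewrote " << entry.path() << "\n";
    }
  }
  return 0;
}

A real run would also want to skip .git/ and binary files; the sketch omits that for brevity.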
diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index d9cd09948..aa3d3ce70 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go index 7d96f013e..c120cac64 100644 --- a/pkg/abi/linux/capability.go +++ b/pkg/abi/linux/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 5b1199aac..421e11256 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go index 928067c04..fb1c679d2 100644 --- a/pkg/abi/linux/elf.go +++ b/pkg/abi/linux/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/errors.go b/pkg/abi/linux/errors.go index e5f6f3f07..93f85a864 100644 --- a/pkg/abi/linux/errors.go +++ b/pkg/abi/linux/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/eventfd.go b/pkg/abi/linux/eventfd.go index 5614f5cf1..9c479fc8f 100644 --- a/pkg/abi/linux/eventfd.go +++ b/pkg/abi/linux/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/exec.go b/pkg/abi/linux/exec.go index a07c29243..579d46c41 100644 --- a/pkg/abi/linux/exec.go +++ b/pkg/abi/linux/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index c8558933a..cc8f2702d 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 46b10ca97..753fec3ed 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index a9f2ba132..c82ab9b5b 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index afdf4123b..08bfde3b5 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/inotify.go b/pkg/abi/linux/inotify.go index 79c5d3593..2d08194ba 100644 --- a/pkg/abi/linux/inotify.go +++ b/pkg/abi/linux/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 191b26e4d..04bb767dc 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go index 77ac1062c..31e56ffa6 100644 --- a/pkg/abi/linux/ip.go +++ b/pkg/abi/linux/ip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 10681768b..2ef8d6cbb 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e0aa5b31d..c74dfcd53 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go index d365f693d..8a8f831cd 100644 --- a/pkg/abi/linux/linux.go +++ b/pkg/abi/linux/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index eda8d9788..0b02f938a 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/netdevice.go b/pkg/abi/linux/netdevice.go index e3b6b1e40..aef1acf75 100644 --- a/pkg/abi/linux/netdevice.go +++ b/pkg/abi/linux/netdevice.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 25c5e17fd..5e718c363 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 4200b6506..630dc339a 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/poll.go b/pkg/abi/linux/poll.go index 9f0b15d1c..c04d26e4c 100644 --- a/pkg/abi/linux/poll.go +++ b/pkg/abi/linux/poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go index db3206f36..dae2de290 100644 --- a/pkg/abi/linux/prctl.go +++ b/pkg/abi/linux/prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ptrace.go b/pkg/abi/linux/ptrace.go index 7db4f5464..23e605ab2 100644 --- a/pkg/abi/linux/ptrace.go +++ b/pkg/abi/linux/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/rusage.go b/pkg/abi/linux/rusage.go index 7fea4b589..d8302dc85 100644 --- a/pkg/abi/linux/rusage.go +++ b/pkg/abi/linux/rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go index ef96a3801..193d9a242 100644 --- a/pkg/abi/linux/sched.go +++ b/pkg/abi/linux/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index 8673a27bf..4eeb5cd7a 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index b80c93daf..de422c519 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go index 82a80e609..e45aadb10 100644 --- a/pkg/abi/linux/shm.go +++ b/pkg/abi/linux/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index 395f9f31e..9cbd77dda 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 6fa4e7c3e..417840731 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go index 67908deb9..174d470e2 100644 --- a/pkg/abi/linux/tcp.go +++ b/pkg/abi/linux/tcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index bbd21e726..fa9ee27e1 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/timer.go b/pkg/abi/linux/timer.go index a6f420bdb..e32d09e10 100644 --- a/pkg/abi/linux/timer.go +++ b/pkg/abi/linux/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index bff882d89..8ac02aee8 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/uio.go b/pkg/abi/linux/uio.go index 7e00d9959..1fd1e9802 100644 --- a/pkg/abi/linux/uio.go +++ b/pkg/abi/linux/uio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/utsname.go b/pkg/abi/linux/utsname.go index f80ed7d4a..60f220a67 100644 --- a/pkg/abi/linux/utsname.go +++ b/pkg/abi/linux/utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 26b674435..85e819304 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 104e0dab1..6a0af006e 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops.go b/pkg/atomicbitops/atomic_bitops.go index 9a57f9599..63aa2b7f1 100644 --- a/pkg/atomicbitops/atomic_bitops.go +++ b/pkg/atomicbitops/atomic_bitops.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_amd64.s b/pkg/atomicbitops/atomic_bitops_amd64.s index b37e3aad3..db0972001 100644 --- a/pkg/atomicbitops/atomic_bitops_amd64.s +++ b/pkg/atomicbitops/atomic_bitops_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_common.go b/pkg/atomicbitops/atomic_bitops_common.go index b03242baa..b2a943dcb 100644 --- a/pkg/atomicbitops/atomic_bitops_common.go +++ b/pkg/atomicbitops/atomic_bitops_common.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go index ee6207cb3..965e9be79 100644 --- a/pkg/atomicbitops/atomic_bitops_test.go +++ b/pkg/atomicbitops/atomic_bitops_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary.go b/pkg/binary/binary.go index 02f7e9fb8..631785f7b 100644 --- a/pkg/binary/binary.go +++ b/pkg/binary/binary.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary_test.go b/pkg/binary/binary_test.go index 200961c70..4d609a438 100644 --- a/pkg/binary/binary_test.go +++ b/pkg/binary/binary_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/bits.go b/pkg/bits/bits.go index eb3c80f49..a26433ad6 100644 --- a/pkg/bits/bits.go +++ b/pkg/bits/bits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go index 8c578cca2..93a435b80 100644 --- a/pkg/bits/bits_template.go +++ b/pkg/bits/bits_template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64.go b/pkg/bits/uint64_arch_amd64.go index 1fef89394..faccaa61a 100644 --- a/pkg/bits/uint64_arch_amd64.go +++ b/pkg/bits/uint64_arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64_asm.s b/pkg/bits/uint64_arch_amd64_asm.s index 8c7322f0f..8ff364181 100644 --- a/pkg/bits/uint64_arch_amd64_asm.s +++ b/pkg/bits/uint64_arch_amd64_asm.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_generic.go b/pkg/bits/uint64_arch_generic.go index cfb47400b..7dd2d1480 100644 --- a/pkg/bits/uint64_arch_generic.go +++ b/pkg/bits/uint64_arch_generic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go index d6dbaf602..1b018d808 100644 --- a/pkg/bits/uint64_test.go +++ b/pkg/bits/uint64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/bpf.go b/pkg/bpf/bpf.go index 98d44d911..eb546f48f 100644 --- a/pkg/bpf/bpf.go +++ b/pkg/bpf/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder.go b/pkg/bpf/decoder.go index ae6b8839a..45c192215 100644 --- a/pkg/bpf/decoder.go +++ b/pkg/bpf/decoder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder_test.go b/pkg/bpf/decoder_test.go index f093e1e41..8c4bdad21 100644 --- a/pkg/bpf/decoder_test.go +++ b/pkg/bpf/decoder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/input_bytes.go b/pkg/bpf/input_bytes.go index 745c0749b..86b216cfc 100644 --- a/pkg/bpf/input_bytes.go +++ b/pkg/bpf/input_bytes.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/interpreter.go b/pkg/bpf/interpreter.go index 86c7add4d..86de523a2 100644 --- a/pkg/bpf/interpreter.go +++ b/pkg/bpf/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/interpreter_test.go b/pkg/bpf/interpreter_test.go index c46a43991..67b00ffe3 100644 --- a/pkg/bpf/interpreter_test.go +++ b/pkg/bpf/interpreter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder.go b/pkg/bpf/program_builder.go index b4ce228e1..fc9d27203 100644 --- a/pkg/bpf/program_builder.go +++ b/pkg/bpf/program_builder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder_test.go b/pkg/bpf/program_builder_test.go index 0e0b79d88..5b2ad67de 100644 --- a/pkg/bpf/program_builder_test.go +++ b/pkg/bpf/program_builder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index 4daaa82b6..8c14ccbfa 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/compressio/compressio_test.go b/pkg/compressio/compressio_test.go index 1bbabee79..86dc47e44 100644 --- a/pkg/compressio/compressio_test.go +++ b/pkg/compressio/compressio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/control/client/client.go b/pkg/control/client/client.go index 0d0c9f148..3fec27846 100644 --- a/pkg/control/client/client.go +++ b/pkg/control/client/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go index c46b5d70b..1a15da1a8 100644 --- a/pkg/control/server/server.go +++ b/pkg/control/server/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpu_amd64.s b/pkg/cpuid/cpu_amd64.s index 905c1d12e..ac80d3c8a 100644 --- a/pkg/cpuid/cpu_amd64.s +++ b/pkg/cpuid/cpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index 61441150e..3eb2bcd2b 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid_parse_test.go b/pkg/cpuid/cpuid_parse_test.go index e8f87a10e..dd9969db4 100644 --- a/pkg/cpuid/cpuid_parse_test.go +++ b/pkg/cpuid/cpuid_parse_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go index 64ade1cbe..6ae14d2da 100644 --- a/pkg/cpuid/cpuid_test.go +++ b/pkg/cpuid/cpuid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 2ba79be32..b7cde3819 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp.go b/pkg/dhcp/dhcp.go index 6945bcd35..f96ffd891 100644 --- a/pkg/dhcp/dhcp.go +++ b/pkg/dhcp/dhcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp_string.go b/pkg/dhcp/dhcp_string.go index 8533895bd..29ce98593 100644 --- a/pkg/dhcp/dhcp_string.go +++ b/pkg/dhcp/dhcp_string.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index e1d8ef603..751626bb0 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index 9549ff705..6a1972860 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index 41a7b5ed3..4c8ae573b 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/eventchannel/event.proto b/pkg/eventchannel/event.proto index c1679c7e7..34468f072 100644 --- a/pkg/eventchannel/event.proto +++ b/pkg/eventchannel/event.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index d40758c22..2785243a2 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fd/fd_test.go b/pkg/fd/fd_test.go index 42bb3ef6c..5fb0ad47d 100644 --- a/pkg/fd/fd_test.go +++ b/pkg/fd/fd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go index aa4906ca0..f0b028b0b 100644 --- a/pkg/fdnotifier/fdnotifier.go +++ b/pkg/fdnotifier/fdnotifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go index 05be9aeb5..bc5e0ac44 100644 --- a/pkg/fdnotifier/poll_unsafe.go +++ b/pkg/fdnotifier/poll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate.go b/pkg/gate/gate.go index 48122bf5a..bda6aae09 100644 --- a/pkg/gate/gate.go +++ b/pkg/gate/gate.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go index 95620fa8e..7467e7d07 100644 --- a/pkg/gate/gate_test.go +++ b/pkg/gate/gate_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 51c9b6df3..019caadca 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/ilist/list_test.go b/pkg/ilist/list_test.go index f37946dc2..3f9abfb56 100644 --- a/pkg/ilist/list_test.go +++ b/pkg/ilist/list_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go index 5fbd4e779..cd6e4e2ce 100644 --- a/pkg/linewriter/linewriter.go +++ b/pkg/linewriter/linewriter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter_test.go b/pkg/linewriter/linewriter_test.go index 9140ee6af..96dc7e6e0 100644 --- a/pkg/linewriter/linewriter_test.go +++ b/pkg/linewriter/linewriter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/glog.go b/pkg/log/glog.go index 24d5390d7..5732785b4 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/glog_unsafe.go b/pkg/log/glog_unsafe.go index bb06aa7d3..ea17ae349 100644 --- a/pkg/log/glog_unsafe.go +++ b/pkg/log/glog_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json.go b/pkg/log/json.go index 96bd13d87..a278c8fc8 100644 --- a/pkg/log/json.go +++ b/pkg/log/json.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go index 9c2f8d2b7..c2c019915 100644 --- a/pkg/log/json_k8s.go +++ b/pkg/log/json_k8s.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json_test.go b/pkg/log/json_test.go index b8c7a795e..f25224fe1 100644 --- a/pkg/log/json_test.go +++ b/pkg/log/json_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/log/log.go b/pkg/log/log.go index b8d456aae..7d563241e 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go index a59d457dd..0634e7c1f 100644 --- a/pkg/log/log_test.go +++ b/pkg/log/log_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index e5eb95f89..803709cc4 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto index 917fda1ac..a2c2bd1ba 100644 --- a/pkg/metric/metric.proto +++ b/pkg/metric/metric.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go index 40034a589..b8b124c83 100644 --- a/pkg/metric/metric_test.go +++ b/pkg/metric/metric_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/buffer.go b/pkg/p9/buffer.go index b7bb14ef9..4c8c6555d 100644 --- a/pkg/p9/buffer.go +++ b/pkg/p9/buffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/buffer_test.go b/pkg/p9/buffer_test.go index 18d55e5c0..a9c75f86b 100644 --- a/pkg/p9/buffer_test.go +++ b/pkg/p9/buffer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 67887874a..2f9c716d0 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 992d1daf7..63c65129a 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_test.go b/pkg/p9/client_test.go index f7145452d..fc49729d8 100644 --- a/pkg/p9/client_test.go +++ b/pkg/p9/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 55ceb52e1..a52a0f3e7 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index c1d1ac1e8..6da2ce4e3 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index 69b90c6cd..f4077a9d4 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index 97decd3cc..833defbd6 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index 68395a396..10a0587cf 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 4ea9f2f9a..78c7d3f86 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9_test.go b/pkg/p9/p9_test.go index 02498346c..8dda6cc64 100644 --- a/pkg/p9/p9_test.go +++ b/pkg/p9/p9_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 242d81b95..e00dd03ab 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go index f9bacbf84..1c8eff200 100644 --- a/pkg/p9/p9test/p9test.go +++ b/pkg/p9/p9test/p9test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go index 60b20578e..f37ad4ab2 100644 --- a/pkg/p9/path_tree.go +++ b/pkg/p9/path_tree.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go index 34ed898e8..52de889e1 100644 --- a/pkg/p9/pool.go +++ b/pkg/p9/pool.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/pool_test.go b/pkg/p9/pool_test.go index 71052d8c4..e4746b8da 100644 --- a/pkg/p9/pool_test.go +++ b/pkg/p9/pool_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 3ef151595..b2a86d8fa 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index bafb377de..ef59077ff 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go index b7b7825bd..c833d1c9c 100644 --- a/pkg/p9/transport_test.go +++ b/pkg/p9/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/version.go b/pkg/p9/version.go index ceb6fabbf..a36a499a1 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/version_test.go b/pkg/p9/version_test.go index c053614c9..291e8580e 100644 --- a/pkg/p9/version_test.go +++ b/pkg/p9/version_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/rand/rand.go b/pkg/rand/rand.go index 593a14380..a2714784d 100644 --- a/pkg/rand/rand.go +++ b/pkg/rand/rand.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go index 7ebe8f3b0..2b92db3e6 100644 --- a/pkg/rand/rand_linux.go +++ b/pkg/rand/rand_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 8f08c74c7..20f515391 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter_state.go b/pkg/refs/refcounter_state.go index 136f06fbf..7c99fd2b5 100644 --- a/pkg/refs/refcounter_state.go +++ b/pkg/refs/refcounter_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index abaa87453..ffd3d3f07 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index e113f3574..50c9409e4 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index a9278c64b..29eec8db1 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 11ed90eb4..47ecac6f7 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index dd5ed0041..afc2f755f 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index a31c6471d..ccd40d9db 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/secio/full_reader.go b/pkg/secio/full_reader.go index 90b1772a7..aed2564bd 100644 --- a/pkg/secio/full_reader.go +++ b/pkg/secio/full_reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio.go b/pkg/secio/secio.go index e5f74a497..b43226035 100644 --- a/pkg/secio/secio.go +++ b/pkg/secio/secio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio_test.go b/pkg/secio/secio_test.go index 8304c4f74..d1d905187 100644 --- a/pkg/secio/secio_test.go +++ b/pkg/secio/secio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/range.go b/pkg/segment/range.go index 057bcd7ff..4d4aeffef 100644 --- a/pkg/segment/range.go +++ b/pkg/segment/range.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/set.go b/pkg/segment/set.go index 74a916ea3..982eb3fdd 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/set_state.go b/pkg/segment/set_state.go index b86e1b75f..76de92591 100644 --- a/pkg/segment/set_state.go +++ b/pkg/segment/set_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go index 0825105db..f19a005f3 100644 --- a/pkg/segment/test/segment_test.go +++ b/pkg/segment/test/segment_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index 41f649011..bcddb39bb 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/aligned.go b/pkg/sentry/arch/aligned.go index c88c034f6..df01a903d 100644 --- a/pkg/sentry/arch/aligned.go +++ b/pkg/sentry/arch/aligned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 16d8eb2b2..53f0c9018 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 7ec2f2c84..135c2ee1f 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s index fa9857df7..bd61402cf 100644 --- a/pkg/sentry/arch/arch_amd64.s +++ b/pkg/sentry/arch/arch_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 01949049d..bb52d8db0 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 4305fe2cb..4d167ce98 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 5df65a691..80c923103 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto index f4c2f7043..9dc83e241 100644 --- a/pkg/sentry/arch/registers.proto +++ b/pkg/sentry/arch/registers.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go index ad098c746..f9ca2e74e 100644 --- a/pkg/sentry/arch/signal_act.go +++ b/pkg/sentry/arch/signal_act.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 7f76eba27..aa030fd70 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_info.go b/pkg/sentry/arch/signal_info.go index fa0ecbec5..f93ee8b46 100644 --- a/pkg/sentry/arch/signal_info.go +++ b/pkg/sentry/arch/signal_info.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index c02ae3b7c..a442f9fdc 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 2e33ccdf5..7e6324e82 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 47c31d4b9..8b4f23007 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index eefc3e1b4..d70f3a5c3 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index a29087775..a42038711 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/control.go b/pkg/sentry/control/control.go index 32d30b6ea..6060b9b4f 100644 --- a/pkg/sentry/control/control.go +++ b/pkg/sentry/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 1af092af3..94ed149f2 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index aca2267a7..f7f02a3e1 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index 5d52cd829..b7895d03c 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index b6bbf69fa..11efcaba1 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index ae4fa1d93..458d03b30 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/device/device_test.go b/pkg/sentry/device/device_test.go index 5d8805c2f..e3f51ce4f 100644 --- a/pkg/sentry/device/device_test.go +++ b/pkg/sentry/device/device_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index a5e8c4f0d..a6ea8b9e7 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/device.go b/pkg/sentry/fs/anon/device.go index 2d1249299..5927bd11e 100644 --- a/pkg/sentry/fs/anon/device.go +++ b/pkg/sentry/fs/anon/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index 1f61c5711..b53746519 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index 5369d1b0d..5e005bc2e 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go index 7c997f533..bdf23b371 100644 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go index 736e628dc..24f5d86d6 100644 --- a/pkg/sentry/fs/ashmem/pin_board_test.go +++ b/pkg/sentry/fs/ashmem/pin_board_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 3523b068a..591e35e6a 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index d9f1559de..acbbd5466 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index 4869428a8..c80ea0175 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ba69e718d..ee2d3d115 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 98a0b7638..54810afca 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index 29fb155a4..fe656cc24 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index fbc750a71..34ac01173 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/device.go b/pkg/sentry/fs/dev/device.go index 3cecdf6e2..9f4e41fc9 100644 --- a/pkg/sentry/fs/dev/device.go +++ b/pkg/sentry/fs/dev/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index cf4e7d00f..6096a40f8 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 82da9aae9..6b11afa44 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 5d306d352..069212b6d 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index ffd5cf6c3..de0f3e5e5 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 54fc11fe1..c0bc261a2 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index d26a06971..71f2d11de 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go index 024c7b2d5..ebb80bd50 100644 --- a/pkg/sentry/fs/dirent_cache_limiter.go +++ b/pkg/sentry/fs/dirent_cache_limiter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go index 93e8d415f..395c879f5 100644 --- a/pkg/sentry/fs/dirent_cache_test.go +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 325404e27..db88d850e 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index 5cf151dab..18652b809 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 98483ab68..95e66ea8d 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index 92ab6ff0e..0cabe2e18 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 69516e048..8c8b1b40c 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index 4395666ad..8b347aa11 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 7e3ee5257..b59a6aa0e 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 5d5026661..62b35dabc 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index e0fa5135f..ab0acb6eb 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 6e680f0a4..948ce9c6f 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index a4ac58763..6a2b8007c 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go index 1c3bae3e8..523182d59 100644 --- a/pkg/sentry/fs/file_state.go +++ b/pkg/sentry/fs/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_test.go b/pkg/sentry/fs/file_test.go index f3ed9a70b..d867a0257 100644 --- a/pkg/sentry/fs/file_test.go +++ b/pkg/sentry/fs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index a6b27c402..acd84dfcc 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 388a1ce36..f6b827800 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index bf2a20b33..5c8cb773f 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 119689776..632055cce 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 5add16ac4..9cd196d7d 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go index f5c9d9215..d9c68baa3 100644 --- a/pkg/sentry/fs/fsutil/dirty_set_test.go +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 42afdd11c..e355d8594 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 32ebf64ff..b5ac6c71c 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index b6e783614..6565c28c8 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go index 319c4841b..c9587b1d9 100644 --- a/pkg/sentry/fs/fsutil/fsutil.go +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 9599665f0..2bdfc0db6 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go index bbd15b30b..576d2a3df 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_state.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go index 86df76822..7167be263 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 4a182baa1..28686f3b3 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 468171a9b..b6366d906 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index ba33b9912..919d2534c 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 2a8a1639c..661ec41f6 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 98700d014..c572f3396 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 51c573aef..35cd0c1d6 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 455953237..d512afefc 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/device.go b/pkg/sentry/fs/gofer/device.go index 52c5acf48..1de6c247c 100644 --- a/pkg/sentry/fs/gofer/device.go +++ b/pkg/sentry/fs/gofer/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 35caa42cd..bc2be546e 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index d0c64003c..31264e065 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index adff0abac..6ab89fcc2 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 36201f017..29d34da7e 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index 0b33e80c3..c7098cd36 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 1181a24cc..f6f20844d 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 44d76ba9f..ac22ee4b1 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 8ae33d286..4cbf9e9d9 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 4ed688ce5..4cb65e7c6 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index b1f299be5..68fbf3417 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index ce6d3d5c3..cbd5b9a84 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go index 1a759370d..d0e1096ce 100644 --- a/pkg/sentry/fs/gofer/util.go +++ b/pkg/sentry/fs/gofer/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 0753640a2..480f0c8f4 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 554e1693a..ffcd57a94 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go index 530c0109f..8167390a9 100644 --- a/pkg/sentry/fs/host/descriptor_state.go +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go index 5dec84ab2..ff08e43af 100644 --- a/pkg/sentry/fs/host/descriptor_test.go +++ b/pkg/sentry/fs/host/descriptor_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go index b5adedf44..055024c44 100644 --- a/pkg/sentry/fs/host/device.go +++ b/pkg/sentry/fs/host/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 2a8f285ff..82e2ae3b9 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index de349a41a..b1b8dc0b6 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index c83b29a16..16c89ddf1 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 69c648f67..20e077f77 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index b7c1a9581..26cc755bc 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 9f1561bd5..ad1878b5a 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index 175dca613..b5a85c4d9 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index be2c3581f..3034e9441 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index d4ce4a8c1..5efbb3ae8 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go index 2932c1f16..5676c451a 100644 --- a/pkg/sentry/fs/host/socket_state.go +++ b/pkg/sentry/fs/host/socket_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 83e8e1b3c..cc760a7e1 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index f35e2492d..8873705c0 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index c5cb75df7..e45b339f5 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index 40c450660..94ff7708e 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index a8721d197..b95a57c3f 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index 9ca8c399f..afcb74724 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index fe411a766..d764ef93d 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index d2b653bc7..0f2a66a79 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index ff8b75f31..ac287e1e4 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index bda3e1861..3d015328e 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index fa8accf6c..66b3da2d0 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 59fa662f3..2652582c3 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index f09928b68..d52f956e4 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index d33e7e498..a0b488467 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 5ff800d2d..f2aee4512 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_range_test.go b/pkg/sentry/fs/lock/lock_range_test.go index b0ab882b9..6221199d1 100644 --- a/pkg/sentry/fs/lock/lock_range_test.go +++ b/pkg/sentry/fs/lock/lock_range_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index 395592a4b..8a3ace0c1 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index 67fa4b1dd..ba002aeb7 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 118e30f63..cf359a1f1 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 4d1693204..a169ea4c9 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index fb60a1aec..535f812c8 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index d7605b2c9..9f7fbeff2 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index f6f7be0aa..01eb4607e 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index 54000614f..56d726dd1 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go index 38aee765a..3f68da149 100644 --- a/pkg/sentry/fs/offset.go +++ b/pkg/sentry/fs/offset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index f3e2d5cbe..db89a5f70 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go index 52139b648..e4dc02dbb 100644 --- a/pkg/sentry/fs/path.go +++ b/pkg/sentry/fs/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path_test.go b/pkg/sentry/fs/path_test.go index 4ba1498f6..e6f57ebba 100644 --- a/pkg/sentry/fs/path_test.go +++ b/pkg/sentry/fs/path_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index f756c45bf..15031234e 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/device/device.go b/pkg/sentry/fs/proc/device/device.go index 04b687bcf..0de466c73 100644 --- a/pkg/sentry/fs/proc/device/device.go +++ b/pkg/sentry/fs/proc/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index fc21dfbbd..d49dad685 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index f2329e623..744b31c74 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index c050a00be..7bb081d0e 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 666a2d054..7c5f8484a 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index 8dde2ea46..b03807043 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 3ee0e570a..2dfe7089a 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 75cbf3e77..d2b9b92c7 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index fe62b167b..37ed30724 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index d24b2d370..4a107c739 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net_test.go b/pkg/sentry/fs/proc/net_test.go index 94677cc1d..9aed5fdca 100644 --- a/pkg/sentry/fs/proc/net_test.go +++ b/pkg/sentry/fs/proc/net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 64e1e1998..196fa5128 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index 81f64a28b..db53686f6 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 0a0eb45e2..10ea1f55d 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index 35403ab7f..c4de565eb 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index 18bd8e9b6..397f9ec6b 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index a7bc9198e..b889ed625 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 0ce77f04f..e49794a48 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go index 5f481a1cf..6eba709c6 100644 --- a/pkg/sentry/fs/proc/sys_net_state.go +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index ea0d94fce..78135ba13 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 9f65a8337..0f400e80f 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index d433632cf..d649da0f1 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index d7ae26fcf..1ddf9fafa 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index 58e0c793c..a5479990c 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index c0400b67d..a6b6a5c33 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 5bcb6c364..9406a07ca 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 35dabdad2..f7835fe05 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index c1ac8a78b..8c6b31f70 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 8bee9cfc1..27abeb6ba 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index a6645b41e..f10168125 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/save.go b/pkg/sentry/fs/save.go index 90988d385..2eaf6ab69 100644 --- a/pkg/sentry/fs/save.go +++ b/pkg/sentry/fs/save.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/seek.go b/pkg/sentry/fs/seek.go index 72f3fb632..0f43918ad 100644 --- a/pkg/sentry/fs/seek.go +++ b/pkg/sentry/fs/seek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sync.go b/pkg/sentry/fs/sync.go index 6dcc2fe8d..1fff8059c 100644 --- a/pkg/sentry/fs/sync.go +++ b/pkg/sentry/fs/sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/device.go b/pkg/sentry/fs/sys/device.go index 38ecd0c18..128d3a9d9 100644 --- a/pkg/sentry/fs/sys/device.go +++ b/pkg/sentry/fs/sys/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index 8b728a4e4..db91de435 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 44ae43754..f0c2322e0 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index c5b56fe69..d20ef91fa 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index ef9a08854..749961f51 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/device.go b/pkg/sentry/fs/tmpfs/device.go index aade93c26..179c3a46f 100644 --- a/pkg/sentry/fs/tmpfs/device.go +++ b/pkg/sentry/fs/tmpfs/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index d0c9b8bea..1ef256511 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index 743061190..b44c06556 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 8e44421b6..b7c29a4d1 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 4450e1363..f89d86c83 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 5bb4922cb..832914453 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index f8713471a..0fc777e67 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index a53448c47..701b2f7d9 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index c4a364edb..20d29d130 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index e2686a074..45e167e5f 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 5e88d84d9..11fb92be3 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index ed080ca0f..0ae57a02c 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 79f9d76d7..2b4160ba5 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index ad535838f..d2e75a511 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/getcpu_amd64.s b/pkg/sentry/hostcpu/getcpu_amd64.s index 409db1450..aa00316da 100644 --- a/pkg/sentry/hostcpu/getcpu_amd64.s +++ b/pkg/sentry/hostcpu/getcpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/hostcpu.go b/pkg/sentry/hostcpu/hostcpu.go index 3adc847bb..d78f78402 100644 --- a/pkg/sentry/hostcpu/hostcpu.go +++ b/pkg/sentry/hostcpu/hostcpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/hostcpu_test.go b/pkg/sentry/hostcpu/hostcpu_test.go index 38de0e1f6..7d6885c9e 100644 --- a/pkg/sentry/hostcpu/hostcpu_test.go +++ b/pkg/sentry/hostcpu/hostcpu_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go index d05e96f15..8550c4793 100644 --- a/pkg/sentry/inet/context.go +++ b/pkg/sentry/inet/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 8206377cc..7c104fd47 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index 05c1a1792..624371eb6 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 1ea2cee36..5ce52e66c 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go index 19f15fd36..847d121aa 100644 --- a/pkg/sentry/kernel/auth/auth.go +++ b/pkg/sentry/kernel/auth/auth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go index 88d6243aa..7a0c967cd 100644 --- a/pkg/sentry/kernel/auth/capability_set.go +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index f7e945599..16d110610 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index 2055da196..1511a0324 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go index e5bed44d7..0a58ba17c 100644 --- a/pkg/sentry/kernel/auth/id.go +++ b/pkg/sentry/kernel/auth/id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 43f439825..e5d6028d6 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go index 8f1a189ec..432dbfb6d 100644 --- a/pkg/sentry/kernel/auth/id_map_functions.go +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 159940a69..a40dd668f 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index b629521eb..a1a084eab 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go index eb56a6a07..ae67e2a25 100644 --- a/pkg/sentry/kernel/contexttest/contexttest.go +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index befefb11c..2399ae6f2 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index f6e3e4825..4c3c38f9e 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index d89c1b745..49b781b69 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index b448ad813..5d3139eef 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go index 14e8996d9..1159638e5 100644 --- a/pkg/sentry/kernel/eventfd/eventfd_test.go +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 298d988ea..84cd08501 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index 715f4714d..c5636d233 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go index 9e76f0a2d..22db4c7cf 100644 --- a/pkg/sentry/kernel/fd_map_test.go +++ b/pkg/sentry/kernel/fd_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 3cf0db280..d8115f59a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index cd7d51621..bb38eb81e 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 9d44ee8e5..2de5239bf 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 9ceb9bd92..ebe12812c 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go index 8eafe810b..304da2032 100644 --- a/pkg/sentry/kernel/kdefs/kdefs.go +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index a1b2d7161..0468dd678 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go index aae6f9ad2..48c3ff5a9 100644 --- a/pkg/sentry/kernel/kernel_state.go +++ b/pkg/sentry/kernel/kernel_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index d09d6debf..0e2cee807 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto index 43b8deb76..bf8029ff5 100644 --- a/pkg/sentry/kernel/memevent/memory_events.proto +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index deff6def9..c93f6598a 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go index 72be6702f..2c902c7e3 100644 --- a/pkg/sentry/kernel/pending_signals_state.go +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index 54e059f8b..ba53fd482 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go index eec5c5de8..eb59e15a1 100644 --- a/pkg/sentry/kernel/pipe/device.go +++ b/pkg/sentry/kernel/pipe/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 1336b6293..99188dddf 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ad103b195..7ddecdad8 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 357d1162e..bd7649d2f 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index 3b9895927..de340c40c 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index f27379969..48fab45d1 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 1090432d7..ddcc5e09a 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 6fea9769c..0f29fbc43 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index 40b5acca3..a016b4087 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 15f2e2964..4423e7efd 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 1f88efca3..048eeaa3f 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 4636405e6..4899c813f 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 6d3314e81..c4fb2c56c 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go index 41ac1067d..c6c436690 100644 --- a/pkg/sentry/kernel/sched/cpuset.go +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go index a036ed513..3af9f1197 100644 --- a/pkg/sentry/kernel/sched/cpuset_test.go +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go index e59909baf..de18c9d02 100644 --- a/pkg/sentry/kernel/sched/sched.go +++ b/pkg/sentry/kernel/sched/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 4bed4d373..cc75eb08a 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 2b7c1a9bc..9d0620e02 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 2e51e6ee5..abfcd0fb4 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 070c2f930..610e199da 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go index bbc653ed8..3cb759072 100644 --- a/pkg/sentry/kernel/shm/device.go +++ b/pkg/sentry/kernel/shm/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index d4812a065..00393b5f0 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index 22a56c6fc..b528ec0dc 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 60cbe85b8..ce8bcb5e5 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 293b21249..0572053db 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 981455d46..00358326b 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 2aecf3eea..175d1b247 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go index 3b29d3c6a..8f7cdb9f3 100644 --- a/pkg/sentry/kernel/table_test.go +++ b/pkg/sentry/kernel/table_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index ed2175c37..f9378c2de 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index 24230af89..1ca2a82eb 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index e5027e551..30a7f6b1e 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index daf974920..bba8ddd39 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index ac38dd157..bbd294141 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index b49f902a5..5d1425d5c 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index a07956208..6e9701b01 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 351cf47d7..f98097c2c 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 6c9608f8d..17f08729a 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index f4c881c2d..e0e57e8bd 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index fc7cefc1f..04c684c1a 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 7115aa967..4549b437e 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 3d654bf93..5455f6ea9 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 7f2e0df72..654cf7525 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index b7534c0a2..b42531e57 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 1302cadc1..e735a5dd0 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 52f5fde8d..a9283d0df 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go index 3f37f505d..b895361d0 100644 --- a/pkg/sentry/kernel/task_test.go +++ b/pkg/sentry/kernel/task_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index cb68799d3..461bd7316 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 58f3a7ec9..8bd53928e 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 4fd6cf4e2..656bbd46c 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go index 3675ea20d..c0660d362 100644 --- a/pkg/sentry/kernel/time/context.go +++ b/pkg/sentry/kernel/time/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index ca0f4ba2e..3846cf1ea 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index d7bd85e78..505a4fa4f 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go index f3a3ed543..6ce358a05 100644 --- a/pkg/sentry/kernel/timekeeper_state.go +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 6084bcb18..a92ad689e 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/uncaught_signal.proto b/pkg/sentry/kernel/uncaught_signal.proto index c7f6a1978..0bdb062cb 100644 --- a/pkg/sentry/kernel/uncaught_signal.proto +++ b/pkg/sentry/kernel/uncaught_signal.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index ed5f0c031..96fe3cbb9 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 3a35f1d00..d40ad74f4 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go index 8d2f14209..5640dd71d 100644 --- a/pkg/sentry/kernel/version.go +++ b/pkg/sentry/kernel/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go index bf413eb7d..9200edb52 100644 --- a/pkg/sentry/limits/context.go +++ b/pkg/sentry/limits/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index b0571739f..b6c22656b 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/limits_test.go b/pkg/sentry/limits/limits_test.go index 945428163..658a20f56 100644 --- a/pkg/sentry/limits/limits_test.go +++ b/pkg/sentry/limits/limits_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index e09d0d2fb..a2b401e3d 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 385ad0102..97e32c8ba 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 35b83654d..b88062ae5 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 79051befa..dc1a52398 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 8c196df84..207d8ed3d 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go index b327f0e1e..db378e90a 100644 --- a/pkg/sentry/loader/vdso_state.go +++ b/pkg/sentry/loader/vdso_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index bd07e9aac..3cf2b338f 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go index 45d1d4688..c702555ce 100644 --- a/pkg/sentry/memmap/mapping_set_test.go +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 3f6f7ebd0..0106c857d 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memutil/memutil.go b/pkg/sentry/memutil/memutil.go index 286d50ca4..a4154c42a 100644 --- a/pkg/sentry/memutil/memutil.go +++ b/pkg/sentry/memutil/memutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go index bc2c72f55..92eab8a26 100644 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ b/pkg/sentry/memutil/memutil_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 4dddcf7b5..06f587fde 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 7075792e0..5c61acf36 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go index 192a6f744..c37fc9f7b 100644 --- a/pkg/sentry/mm/aio_context_state.go +++ b/pkg/sentry/mm/aio_context_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index d075ee1ca..fe58cfc4c 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 81787a6fd..e4c057d28 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 2fe03172c..e6aa6f9ef 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 5ef1ba0b1..9768e51f1 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index a3417a46e..d25aa5136 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index ae4fba478..f4917419f 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 0cca743ef..ece561ff0 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 7cdbf6e25..c8302a553 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 46e0e0754..0385957bd 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index 3bc48c7e7..12913007b 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 3b5161998..687959005 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 7b675b9b5..a25318abb 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 931995254..ad901344b 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go index adc97e78f..cb9809b1f 100644 --- a/pkg/sentry/pgalloc/context.go +++ b/pkg/sentry/pgalloc/context.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 0754e608f..411dafa07 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go index 726623c1a..14a39bb9e 100644 --- a/pkg/sentry/pgalloc/pgalloc_test.go +++ b/pkg/sentry/pgalloc/pgalloc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/pgalloc_unsafe.go b/pkg/sentry/pgalloc/pgalloc_unsafe.go index 33b0a68a8..a4b5d581c 100644 --- a/pkg/sentry/pgalloc/pgalloc_unsafe.go +++ b/pkg/sentry/pgalloc/pgalloc_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index 21024e656..cf169af55 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go index cca21a23e..793f57fd7 100644 --- a/pkg/sentry/platform/context.go +++ b/pkg/sentry/platform/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index 9c83f41eb..a4651f500 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt_test.go b/pkg/sentry/platform/interrupt/interrupt_test.go index fb3284395..0ecdf6e7a 100644 --- a/pkg/sentry/platform/interrupt/interrupt_test.go +++ b/pkg/sentry/platform/interrupt/interrupt_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index f2f7ab1e8..689122175 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go index b25cad155..42bcc9733 100644 --- a/pkg/sentry/platform/kvm/allocator.go +++ b/pkg/sentry/platform/kvm/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index f24f1c662..a926e6f8b 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 6520682d7..c258408f9 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 65b01f358..2bc34a435 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 21de2488e..92fde7ee0 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index e79a30ef2..3c452f5ba 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 2605f8c93..4184939e5 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index c75a4b415..0eb0020f7 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index c5a4435b1..ed0521c3f 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index 70d0ac63b..61493ccaf 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index d0f6bb225..46c4b9113 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index cac8d9937..d05f05c29 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 361200622..e83db71e9 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index b8b3c9a4a..f5953b96e 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index ccfe837b5..b6821122a 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 69ba67ced..06a2e3b0c 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 22ae60b63..452d88d7f 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 9d7dca5b3..450eb8201 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 0d496561d..6cf2359a3 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index fcba33813..203d71528 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s index f1da41a44..491ec0c2a 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index 0343e9267..28a1b4414 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go index 935e0eb93..d03ec654a 100644 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go index 1bcc1f8e9..90976735b 100644 --- a/pkg/sentry/platform/mmap_min_addr.go +++ b/pkg/sentry/platform/mmap_min_addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 0e48417b9..ae37276ad 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/sentry/platform/procid/procid.go index 3f49ab093..78b92422c 100644 --- a/pkg/sentry/platform/procid/procid.go +++ b/pkg/sentry/platform/procid/procid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s index ef3439c03..272c9fc14 100644 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/sentry/platform/procid/procid_arm64.s index 02e907b6b..7a1684a18 100644 --- a/pkg/sentry/platform/procid/procid_arm64.s +++ b/pkg/sentry/platform/procid/procid_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/sentry/platform/procid/procid_net_test.go index e8dcc479d..b628e2285 100644 --- a/pkg/sentry/platform/procid/procid_net_test.go +++ b/pkg/sentry/platform/procid/procid_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/sentry/platform/procid/procid_test.go index 7a57c7cdc..88dd0b3ae 100644 --- a/pkg/sentry/platform/procid/procid_test.go +++ b/pkg/sentry/platform/procid/procid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 3c0713e95..6a890dd81 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 223b23199..585f6c1fb 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s index 63f98e40d..64c718d21 100644 --- a/pkg/sentry/platform/ptrace/stub_amd64.s +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go index 48c16c4a1..54d5021a9 100644 --- a/pkg/sentry/platform/ptrace/stub_unsafe.go +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 2a5d699ec..83b43057f 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index d23a1133e..77a0e908f 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index e2aab8135..2c07b4ac3 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go index 0c9263060..1bf7eab28 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index ca6c4ac97..17736b05b 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 98d0a6de0..5bbd4612d 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 67242b92b..413c3dbc4 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index 4a9affe64..a5ce67885 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index afb040a6f..8cb8c4996 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go index 11c49855f..a4927da2f 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/main.go +++ b/pkg/sentry/platform/ring0/gen_offsets/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index 19ac6eb7c..900c0bba7 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 5ed4342dd..3577b5127 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go index faf4240e5..16955ad91 100644 --- a/pkg/sentry/platform/ring0/kernel_unsafe.go +++ b/pkg/sentry/platform/ring0/kernel_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index 2b95a0141..9c5f26962 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index 98a130525..75d742750 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index 806e07ec0..85cc3fdad 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go index ee6e90a11..23fd5c352 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go index f48647b3a..1b996b4e2 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index c7207ec18..e5dcaada7 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 746f614e5..7aa6c524e 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index 2f82c4353..a1ec4b109 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index 3e5dc7dc7..36e424495 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index 6bd8c3584..ff427fbe9 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index 0d9a51aa5..0f029f25d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go index c4c71d23e..8f9dacd93 100644 --- a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go index 10c51e88d..cdeb1b43a 100644 --- a/pkg/sentry/platform/ring0/ring0.go +++ b/pkg/sentry/platform/ring0/ring0.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 4c6daec22..7e5ceafdb 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s index f90b4bfd1..a0cd78f33 100644 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ b/pkg/sentry/platform/safecopy/atomic_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go index 69c66a3b7..5126871eb 100644 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ b/pkg/sentry/platform/safecopy/safecopy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go index 1a682d28a..5818f7f9b 100644 --- a/pkg/sentry/platform/safecopy/safecopy_test.go +++ b/pkg/sentry/platform/safecopy/safecopy_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go index f84527484..eef028e68 100644 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s index db7701a29..475ae48e9 100644 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s index cdfca8207..53e4ac2c1 100644 --- a/pkg/sentry/platform/safecopy/sighandler_arm64.s +++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go index c3a9780d2..1f72deb61 100644 --- a/pkg/sentry/safemem/block_unsafe.go +++ b/pkg/sentry/safemem/block_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go index 6cb52439f..5c3d73eb7 100644 --- a/pkg/sentry/safemem/io.go +++ b/pkg/sentry/safemem/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go index 2eda8c3bb..629741bee 100644 --- a/pkg/sentry/safemem/io_test.go +++ b/pkg/sentry/safemem/io_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go index 090932d3e..3e70d33a2 100644 --- a/pkg/sentry/safemem/safemem.go +++ b/pkg/sentry/safemem/safemem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go index fddcaf714..eba4bb535 100644 --- a/pkg/sentry/safemem/seq_test.go +++ b/pkg/sentry/safemem/seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go index 83a6b7183..354a95dde 100644 --- a/pkg/sentry/safemem/seq_unsafe.go +++ b/pkg/sentry/safemem/seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 571245ce5..659b43363 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index db6e71487..aca77888a 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index d44f5e88a..abda364c9 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/epsocket/device.go b/pkg/sentry/socket/epsocket/device.go index 3cc138eb0..ab4083efe 100644 --- a/pkg/sentry/socket/epsocket/device.go +++ b/pkg/sentry/socket/epsocket/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 768fa0dfa..520d82f68 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 0d9c2df24..5a89a63fb 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/epsocket/save_restore.go index f19afb6c0..feaafb7cc 100644 --- a/pkg/sentry/socket/epsocket/save_restore.go +++ b/pkg/sentry/socket/epsocket/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 37c48f4bc..edefa225b 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/device.go b/pkg/sentry/socket/hostinet/device.go index c5133f3bb..4267e3691 100644 --- a/pkg/sentry/socket/hostinet/device.go +++ b/pkg/sentry/socket/hostinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/hostinet.go b/pkg/sentry/socket/hostinet/hostinet.go index 7858892ab..0d6f51d2b 100644 --- a/pkg/sentry/socket/hostinet/hostinet.go +++ b/pkg/sentry/socket/hostinet/hostinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/save_restore.go b/pkg/sentry/socket/hostinet/save_restore.go index 3827f082a..1dec33897 100644 --- a/pkg/sentry/socket/hostinet/save_restore.go +++ b/pkg/sentry/socket/hostinet/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 49349074f..71884d3db 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index 59c8910ca..eed0c7837 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 4ce73c1f1..9c45991ba 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index a95172cba..5bd3b49ce 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index 20b9a6e37..e9d3275b1 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/port/port_test.go b/pkg/sentry/socket/netlink/port/port_test.go index 49b3b48ab..516f6cd6c 100644 --- a/pkg/sentry/socket/netlink/port/port_test.go +++ b/pkg/sentry/socket/netlink/port/port_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 06786bd50..76cf12fd4 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index e414b829b..9f0a81403 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index a34f9d3ca..dc688eb00 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index 64106c4b5..f537c7f63 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go index d2b9f9222..44c0a39b7 100644 --- a/pkg/sentry/socket/rpcinet/device.go +++ b/pkg/sentry/socket/rpcinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index f06d12231..601e05994 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go index 6c98e6acb..5d4fd4dac 100644 --- a/pkg/sentry/socket/rpcinet/rpcinet.go +++ b/pkg/sentry/socket/rpcinet/rpcinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index cf8f69efb..c028ed4dd 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index cb8344ec6..a1be711df 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go index d04fb2069..e53f578ba 100644 --- a/pkg/sentry/socket/rpcinet/stack_unsafe.go +++ b/pkg/sentry/socket/rpcinet/stack_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 62ba13782..7e840b452 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/device.go b/pkg/sentry/socket/unix/device.go index 41820dbb3..734d39ee6 100644 --- a/pkg/sentry/socket/unix/device.go +++ b/pkg/sentry/socket/unix/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 7d80e4393..382911d51 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 62641bb34..18e492862 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectioned_state.go b/pkg/sentry/socket/unix/transport/connectioned_state.go index 608a6a97a..7e02a5db8 100644 --- a/pkg/sentry/socket/unix/transport/connectioned_state.go +++ b/pkg/sentry/socket/unix/transport/connectioned_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 728863f3f..43ff875e4 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index 45a58c600..b650caae7 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 12b1576bd..d5f7f7aa8 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 01efd24d3..e9607aa01 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 224f8b709..27fde505b 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index 7f047b808..b8e128c40 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state_unsafe.go b/pkg/sentry/state/state_unsafe.go index f02e12b2a..7745b6ac6 100644 --- a/pkg/sentry/state/state_unsafe.go +++ b/pkg/sentry/state/state_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/capability.go b/pkg/sentry/strace/capability.go index 9001181e7..f85d6636e 100644 --- a/pkg/sentry/strace/capability.go +++ b/pkg/sentry/strace/capability.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/clone.go b/pkg/sentry/strace/clone.go index e18ce84dc..ff6a432c6 100644 --- a/pkg/sentry/strace/clone.go +++ b/pkg/sentry/strace/clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/futex.go b/pkg/sentry/strace/futex.go index f4aa7fcad..24301bda6 100644 --- a/pkg/sentry/strace/futex.go +++ b/pkg/sentry/strace/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 6043b8cb1..3650fd6e1 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/strace/open.go b/pkg/sentry/strace/open.go index 3bf348d7a..140727b02 100644 --- a/pkg/sentry/strace/open.go +++ b/pkg/sentry/strace/open.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go index b6b05423c..15605187d 100644 --- a/pkg/sentry/strace/poll.go +++ b/pkg/sentry/strace/poll.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/ptrace.go b/pkg/sentry/strace/ptrace.go index 8c4b79227..485aacb8a 100644 --- a/pkg/sentry/strace/ptrace.go +++ b/pkg/sentry/strace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go index 524be0e15..f82460e1c 100644 --- a/pkg/sentry/strace/signal.go +++ b/pkg/sentry/strace/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 4c1a9d469..dbe53b9a2 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 434a200d9..f4c1be4ce 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto index f1fc539d6..4b2f73a5f 100644 --- a/pkg/sentry/strace/strace.proto +++ b/pkg/sentry/strace/strace.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 8c897fcbe..eae2d6c12 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index b90d191b7..ec1eab331 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 304a12dde..1ba3695fb 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index d2aec963a..d83e12971 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index b9b4ccbd1..9a460ebdf 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index a033b7c70..5438b664b 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 61c2647bf..1b27b2415 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index cf972dc28..622cb8d0d 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 200c46355..1467feb4e 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 903172890..ca4ead488 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 967464c85..893322647 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index f0c89cba4..7cef4b50c 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 4b441b31b..1b597d5bc 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index 8d594aa83..27e765a2d 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 26a505782..20269a769 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index ad3bfd761..8aadc6d8c 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 805b251b1..64a6e639c 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index e110a553f..cf613bad0 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 3652c429e..036845c13 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 17b6768e5..e32099dd4 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 7a29bd9b7..117ae1a0e 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index 452dff058..fc3959a7e 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 50c7d7a74..48b0fd49d 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 443334693..8b0379779 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index ab07c77f9..003d718da 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index e679a6694..8aea03abe 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index f08fdf5cb..b4262162a 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index 86f850ef1..5bd61ab87 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index a0d3a73c5..d0eceac7c 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index a539354c5..7fbeb4fcd 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index c8748958a..69862f110 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 49c225011..10fc201ef 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 68488330f..4352482fb 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 6f7acf98f..ecf88edc1 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go index 7193b7aed..9efc58d34 100644 --- a/pkg/sentry/syscalls/linux/sys_syslog.go +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index ddcb5b789..23c2f7035 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 063fbb106..b4f2609c0 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index 6baf4599b..04ea7a4e9 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index f70d13682..ec0155cbb 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index 8ea78093b..1e8312e00 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index f7545b965..fa81fe10e 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index e405608c4..1da72d606 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 752ec326d..fa6fcdc0b 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 425ce900c..5d10b3824 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index a98bcd7de..c27e391c9 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go index a9237630e..d6622bfe2 100644 --- a/pkg/sentry/time/calibrated_clock_test.go +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clock_id.go b/pkg/sentry/time/clock_id.go index 1317a5dad..724f59dd9 100644 --- a/pkg/sentry/time/clock_id.go +++ b/pkg/sentry/time/clock_id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clocks.go b/pkg/sentry/time/clocks.go index e26386520..837e86094 100644 --- a/pkg/sentry/time/clocks.go +++ b/pkg/sentry/time/clocks.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/muldiv_amd64.s b/pkg/sentry/time/muldiv_amd64.s index bfcb8c724..028c6684e 100644 --- a/pkg/sentry/time/muldiv_amd64.s +++ b/pkg/sentry/time/muldiv_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s index 5fa82a136..5ad57a8a3 100644 --- a/pkg/sentry/time/muldiv_arm64.s +++ b/pkg/sentry/time/muldiv_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index 8568b1193..63cf7c4a3 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go index 4a0c4e880..e1b9084ac 100644 --- a/pkg/sentry/time/parameters_test.go +++ b/pkg/sentry/time/parameters_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler.go b/pkg/sentry/time/sampler.go index 445690d49..2140a99b7 100644 --- a/pkg/sentry/time/sampler.go +++ b/pkg/sentry/time/sampler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler_test.go b/pkg/sentry/time/sampler_test.go index ec0e442b6..3e70a1134 100644 --- a/pkg/sentry/time/sampler_test.go +++ b/pkg/sentry/time/sampler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler_unsafe.go b/pkg/sentry/time/sampler_unsafe.go index 0f8eb4fc8..e76180217 100644 --- a/pkg/sentry/time/sampler_unsafe.go +++ b/pkg/sentry/time/sampler_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/tsc_amd64.s b/pkg/sentry/time/tsc_amd64.s index e53d477f7..6a8eed664 100644 --- a/pkg/sentry/time/tsc_amd64.s +++ b/pkg/sentry/time/tsc_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/tsc_arm64.s b/pkg/sentry/time/tsc_arm64.s index c1c9760ef..da9fa4112 100644 --- a/pkg/sentry/time/tsc_arm64.s +++ b/pkg/sentry/time/tsc_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/unimpl/events.go b/pkg/sentry/unimpl/events.go index f78f8c981..d92766e2d 100644 --- a/pkg/sentry/unimpl/events.go +++ b/pkg/sentry/unimpl/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/unimpl/unimplemented_syscall.proto b/pkg/sentry/unimpl/unimplemented_syscall.proto index 41579b016..0d7a94be7 100644 --- a/pkg/sentry/unimpl/unimplemented_syscall.proto +++ b/pkg/sentry/unimpl/unimplemented_syscall.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index 399d98c29..e55b89689 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go index cbd7cfe19..bfc282d69 100644 --- a/pkg/sentry/usage/cpu.go +++ b/pkg/sentry/usage/cpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go index 8e27a0a88..dfcd3a49d 100644 --- a/pkg/sentry/usage/io.go +++ b/pkg/sentry/usage/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 5be9ed9c6..c316f1597 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/memory_unsafe.go b/pkg/sentry/usage/memory_unsafe.go index a3ae668a5..9e0014ca0 100644 --- a/pkg/sentry/usage/memory_unsafe.go +++ b/pkg/sentry/usage/memory_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/usage.go b/pkg/sentry/usage/usage.go index ab327f8e2..e3d33a965 100644 --- a/pkg/sentry/usage/usage.go +++ b/pkg/sentry/usage/usage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index 9e6a27bcf..9c1742a59 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go index 2a75aa60c..e79210804 100644 --- a/pkg/sentry/usermem/addr.go +++ b/pkg/sentry/usermem/addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go index bd6a1ec8a..82f735026 100644 --- a/pkg/sentry/usermem/addr_range_seq_test.go +++ b/pkg/sentry/usermem/addr_range_seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go index f5fd446fa..c09337c15 100644 --- a/pkg/sentry/usermem/addr_range_seq_unsafe.go +++ b/pkg/sentry/usermem/addr_range_seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go index 274f568d0..f98d82168 100644 --- a/pkg/sentry/usermem/bytes_io.go +++ b/pkg/sentry/usermem/bytes_io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go index 7add8bc82..bb49d2ff3 100644 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ b/pkg/sentry/usermem/bytes_io_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 4c7d5014a..31e4d6ada 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_arm64.go b/pkg/sentry/usermem/usermem_arm64.go index 7fd4ce963..fdfc30a66 100644 --- a/pkg/sentry/usermem/usermem_arm64.go +++ b/pkg/sentry/usermem/usermem_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go index 1991a9641..4a07118b7 100644 --- a/pkg/sentry/usermem/usermem_test.go +++ b/pkg/sentry/usermem/usermem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_unsafe.go b/pkg/sentry/usermem/usermem_unsafe.go index 3895e7871..876783e78 100644 --- a/pkg/sentry/usermem/usermem_unsafe.go +++ b/pkg/sentry/usermem/usermem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go index 9ec90f9ff..8059b72d2 100644 --- a/pkg/sentry/usermem/usermem_x86.go +++ b/pkg/sentry/usermem/usermem_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index b4f1e3a4f..2fc4472dd 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_amd64.s b/pkg/sleep/commit_amd64.s index d08df7f37..bc4ac2c3c 100644 --- a/pkg/sleep/commit_amd64.s +++ b/pkg/sleep/commit_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_asm.go b/pkg/sleep/commit_asm.go index 90eef4cbc..35e2cc337 100644 --- a/pkg/sleep/commit_asm.go +++ b/pkg/sleep/commit_asm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go index 967d22e24..686b1da3d 100644 --- a/pkg/sleep/commit_noasm.go +++ b/pkg/sleep/commit_noasm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/empty.s b/pkg/sleep/empty.s index 85d52cd9c..fb37360ac 100644 --- a/pkg/sleep/empty.s +++ b/pkg/sleep/empty.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go index 8feb9ffc2..130806c86 100644 --- a/pkg/sleep/sleep_test.go +++ b/pkg/sleep/sleep_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go index 45fb6f0ea..62e0abc34 100644 --- a/pkg/sleep/sleep_unsafe.go +++ b/pkg/sleep/sleep_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/decode.go b/pkg/state/decode.go index 54b5ad8b8..73a59f871 100644 --- a/pkg/state/decode.go +++ b/pkg/state/decode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/encode.go b/pkg/state/encode.go index fe8512bbf..b0714170b 100644 --- a/pkg/state/encode.go +++ b/pkg/state/encode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/encode_unsafe.go b/pkg/state/encode_unsafe.go index be94742a8..457e6dbb7 100644 --- a/pkg/state/encode_unsafe.go +++ b/pkg/state/encode_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/map.go b/pkg/state/map.go index 0035d7250..1fb9b47b8 100644 --- a/pkg/state/map.go +++ b/pkg/state/map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/object.proto b/pkg/state/object.proto index d3b46ea97..952289069 100644 --- a/pkg/state/object.proto +++ b/pkg/state/object.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/printer.go b/pkg/state/printer.go index aee4b69fb..5174c3ba3 100644 --- a/pkg/state/printer.go +++ b/pkg/state/printer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/state.go b/pkg/state/state.go index 4486f83a7..cf7df803a 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/state_test.go b/pkg/state/state_test.go index 22bcad9e1..7c24bbcda 100644 --- a/pkg/state/state_test.go +++ b/pkg/state/state_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile.go b/pkg/state/statefile/statefile.go index c21e3bb0e..ad4e3b43e 100644 --- a/pkg/state/statefile/statefile.go +++ b/pkg/state/statefile/statefile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile_test.go b/pkg/state/statefile/statefile_test.go index b4f400e01..60b769895 100644 --- a/pkg/state/statefile/statefile_test.go +++ b/pkg/state/statefile/statefile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/stats.go b/pkg/state/stats.go index 17ca258fc..eb51cda47 100644 --- a/pkg/state/stats.go +++ b/pkg/state/stats.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index 74bbe9f5b..fc6ef60a1 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 1a23919ef..bd489b424 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index 232634dd4..4ddbd3322 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 5558cccff..345653544 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror_test.go b/pkg/syserror/syserror_test.go index 0f0da5781..f2a10ee7b 100644 --- a/pkg/syserror/syserror_test.go +++ b/pkg/syserror/syserror_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 628e28f57..df8bf435d 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index e84f73feb..2c81c5697 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go index d3a9a0f88..43cbbc74c 100644 --- a/pkg/tcpip/buffer/prependable.go +++ b/pkg/tcpip/buffer/prependable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index 43cbb9461..1a9d40778 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go index 74a0a96fc..ebc3a17b7 100644 --- a/pkg/tcpip/buffer/view_test.go +++ b/pkg/tcpip/buffer/view_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index 5dfb3ca1d..6e7edf3ab 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/hash/jenkins/jenkins.go b/pkg/tcpip/hash/jenkins/jenkins.go index e66d5f12b..52c22230e 100644 --- a/pkg/tcpip/hash/jenkins/jenkins.go +++ b/pkg/tcpip/hash/jenkins/jenkins.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/hash/jenkins/jenkins_test.go b/pkg/tcpip/hash/jenkins/jenkins_test.go index 9d86174aa..4c78b5808 100644 --- a/pkg/tcpip/hash/jenkins/jenkins_test.go +++ b/pkg/tcpip/hash/jenkins/jenkins_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/arp.go b/pkg/tcpip/header/arp.go index 22b259ccb..55fe7292c 100644 --- a/pkg/tcpip/header/arp.go +++ b/pkg/tcpip/header/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go index 2e8c65fac..2eaa7938a 100644 --- a/pkg/tcpip/header/checksum.go +++ b/pkg/tcpip/header/checksum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index 77365bc41..76143f454 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/header/gue.go b/pkg/tcpip/header/gue.go index 2ad13955a..10d358c0e 100644 --- a/pkg/tcpip/header/gue.go +++ b/pkg/tcpip/header/gue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index 3ac89cdae..782e1053c 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go index e317975e8..d0b10d849 100644 --- a/pkg/tcpip/header/icmpv6.go +++ b/pkg/tcpip/header/icmpv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/interfaces.go b/pkg/tcpip/header/interfaces.go index ac327d8a5..fb250ea30 100644 --- a/pkg/tcpip/header/interfaces.go +++ b/pkg/tcpip/header/interfaces.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index c3b8fb00e..96e461491 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 3d24736c7..66820a466 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6_fragment.go b/pkg/tcpip/header/ipv6_fragment.go index e36d5177b..6d896355a 100644 --- a/pkg/tcpip/header/ipv6_fragment.go +++ b/pkg/tcpip/header/ipv6_fragment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipversion_test.go b/pkg/tcpip/header/ipversion_test.go index 8301ba5cf..0c830180e 100644 --- a/pkg/tcpip/header/ipversion_test.go +++ b/pkg/tcpip/header/ipversion_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index e656ebb15..0cd89b992 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp_test.go b/pkg/tcpip/header/tcp_test.go index 7cd98df3b..9a2b99489 100644 --- a/pkg/tcpip/header/tcp_test.go +++ b/pkg/tcpip/header/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go index e8c860436..2205fec18 100644 --- a/pkg/tcpip/header/udp.go +++ b/pkg/tcpip/header/udp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index f7501a1bc..ee9dd8700 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 8f4d67074..4da376774 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index c8b037d57..31138e4ac 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go index 36e7fe5a9..97a477b61 100644 --- a/pkg/tcpip/link/fdbased/endpoint_unsafe.go +++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go index f1e71c233..430c85a42 100644 --- a/pkg/tcpip/link/fdbased/mmap.go +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go index e5ac7996d..135da2498 100644 --- a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go +++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 2dc4bcfda..2c1148123 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index b3e71c7fc..be07b7c29 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go index 031449a05..5d40dfacc 100644 --- a/pkg/tcpip/link/muxed/injectable_test.go +++ b/pkg/tcpip/link/muxed/injectable_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index 9dade5421..b54131573 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go index 3ba96a123..0b51982c6 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index 94ddad8ea..4eab77c74 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index 7359849b1..8bde41637 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index fe2779125..86db7a487 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe.go b/pkg/tcpip/link/sharedmem/pipe/pipe.go index e014324cc..74c9f0311 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go index 30742ccb1..59ef69a8b 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go index f491d74a2..62d17029e 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/rx.go b/pkg/tcpip/link/sharedmem/pipe/rx.go index 8d641c76f..f22e533ac 100644 --- a/pkg/tcpip/link/sharedmem/pipe/rx.go +++ b/pkg/tcpip/link/sharedmem/pipe/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/tx.go b/pkg/tcpip/link/sharedmem/pipe/tx.go index e75175d98..9841eb231 100644 --- a/pkg/tcpip/link/sharedmem/pipe/tx.go +++ b/pkg/tcpip/link/sharedmem/pipe/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/queue_test.go b/pkg/tcpip/link/sharedmem/queue/queue_test.go index 391165bc3..d3f8f4b8b 100644 --- a/pkg/tcpip/link/sharedmem/queue/queue_test.go +++ b/pkg/tcpip/link/sharedmem/queue/queue_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/rx.go b/pkg/tcpip/link/sharedmem/queue/rx.go index d3a5da08a..d9aecf2d9 100644 --- a/pkg/tcpip/link/sharedmem/queue/rx.go +++ b/pkg/tcpip/link/sharedmem/queue/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/tx.go b/pkg/tcpip/link/sharedmem/queue/tx.go index 845108db1..a24dccd11 100644 --- a/pkg/tcpip/link/sharedmem/queue/tx.go +++ b/pkg/tcpip/link/sharedmem/queue/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go index 3eeab769e..215cb607f 100644 --- a/pkg/tcpip/link/sharedmem/rx.go +++ b/pkg/tcpip/link/sharedmem/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 6e6aa5a13..e34b780f8 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 1f44e224c..65b9d7085 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go index b91adbaf7..f7e816a41 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go index 37da34831..ac3577aa6 100644 --- a/pkg/tcpip/link/sharedmem/tx.go +++ b/pkg/tcpip/link/sharedmem/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go index 3d0d8d852..c16c19647 100644 --- a/pkg/tcpip/link/sniffer/pcap.go +++ b/pkg/tcpip/link/sniffer/pcap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 462a6e3a3..e87ae07d7 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go index e4c589dda..09ca9b527 100644 --- a/pkg/tcpip/link/tun/tun_unsafe.go +++ b/pkg/tcpip/link/tun/tun_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index bd9f9845b..21690a226 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index a2df6be95..62054fb7f 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 975919e80..a3f2bce3e 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 14b9cb8b6..1b971b1a3 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/frag_heap.go b/pkg/tcpip/network/fragmentation/frag_heap.go index 55615c8e6..9ad3e5a8a 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap.go +++ b/pkg/tcpip/network/fragmentation/frag_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/frag_heap_test.go b/pkg/tcpip/network/fragmentation/frag_heap_test.go index 1b1b72e88..3a2486ba8 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap_test.go +++ b/pkg/tcpip/network/fragmentation/frag_heap_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index a5dda0398..e90edb375 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go index 5bf3463a9..99ded68a3 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation_test.go +++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index c9ad2bef6..04f9ab964 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index a2bc9707a..7eee0710d 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go index 07960ddf0..0c91905dc 100644 --- a/pkg/tcpip/network/hash/hash.go +++ b/pkg/tcpip/network/hash/hash.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 522009fac..4b822e2c6 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index 1c3acda4b..9cb81245a 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index cbdca98a5..c6af0db79 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 42e85564e..146143ab3 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index be28be36d..9c011e107 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 8b57a0641..d8737a616 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 9a743ea80..4b8cd496b 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index d212a5792..a1712b590 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index 01e7320b4..8466c661b 100644 --- a/pkg/tcpip/ports/ports_test.go +++ b/pkg/tcpip/ports/ports_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index cf8900c4d..1681de56e 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index da6202f97..642607f83 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/seqnum/seqnum.go b/pkg/tcpip/seqnum/seqnum.go index f2b988839..b40a3c212 100644 --- a/pkg/tcpip/seqnum/seqnum.go +++ b/pkg/tcpip/seqnum/seqnum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 40e4bdb4a..42b9768ae 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index 77a09ca86..91b2ffea8 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index c18571b0f..8008d9870 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 6e1660051..c70533a35 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 8ae562dcd..3d4c282a9 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index cb9ffe9c2..f204ca790 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index 3d7e4b719..dfec4258a 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index b5375df3c..351f63221 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index a8ac18e72..e8b562ad9 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 2df974bf2..8d74f1543 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index b09137f08..9367c8c02 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go index 1f7b04398..ebb1c1b56 100644 --- a/pkg/tcpip/tcpip_test.go +++ b/pkg/tcpip/tcpip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/time.s b/pkg/tcpip/time.s index 85d52cd9c..fb37360ac 100644 --- a/pkg/tcpip/time.s +++ b/pkg/tcpip/time.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 7ec5741af..1a307483b 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 8f2e3aa20..00840cfcf 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/icmp/endpoint_state.go b/pkg/tcpip/transport/icmp/endpoint_state.go index 8a7909246..332b3cd33 100644 --- a/pkg/tcpip/transport/icmp/endpoint_state.go +++ b/pkg/tcpip/transport/icmp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index 09ee2f892..954fde9d8 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/raw/raw.go b/pkg/tcpip/transport/raw/raw.go index f0f60ce91..7004c7ff4 100644 --- a/pkg/tcpip/transport/raw/raw.go +++ b/pkg/tcpip/transport/raw/raw.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/raw/state.go b/pkg/tcpip/transport/raw/state.go index e3891a8b8..e8907ebb1 100644 --- a/pkg/tcpip/transport/raw/state.go +++ b/pkg/tcpip/transport/raw/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index a3894ed8f..e506d7133 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 6c4a4d95e..eaa67aeb7 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/cubic.go b/pkg/tcpip/transport/tcp/cubic.go index 003525d86..e618cd2b9 100644 --- a/pkg/tcpip/transport/tcp/cubic.go +++ b/pkg/tcpip/transport/tcp/cubic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index 2886cc707..43bcfa070 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 09eff5be1..982f491cc 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 7f9dabb4d..27b0be046 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 6a7efaf1d..e088e24cb 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index b5fb160bc..b86473891 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index fa6bdddba..b08a0e356 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go index e4f8b7d5a..f83ebc717 100644 --- a/pkg/tcpip/transport/tcp/reno.go +++ b/pkg/tcpip/transport/tcp/reno.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack.go b/pkg/tcpip/transport/tcp/sack.go index 24e48fe7b..6a013d99b 100644 --- a/pkg/tcpip/transport/tcp/sack.go +++ b/pkg/tcpip/transport/tcp/sack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go index 21878ad82..99560d5b4 100644 --- a/pkg/tcpip/transport/tcp/sack_scoreboard.go +++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard_test.go b/pkg/tcpip/transport/tcp/sack_scoreboard_test.go index 3cf2ff451..8f6890cdf 100644 --- a/pkg/tcpip/transport/tcp/sack_scoreboard_test.go +++ b/pkg/tcpip/transport/tcp/sack_scoreboard_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index c603fe713..187effb6b 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go index 98422fadf..9fd061d7d 100644 --- a/pkg/tcpip/transport/tcp/segment_heap.go +++ b/pkg/tcpip/transport/tcp/segment_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 0c637d7ad..3b020e580 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go index 68b049f06..dd7e14aa6 100644 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 6317748cf..50743670e 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index 86bbd643f..12eff8afc 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 06b0702c5..dbfbd5c4f 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index c5732ad1c..a8b290dae 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index 87c640967..039bbcfba 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 6e2fed880..fa721a7f8 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/timer.go b/pkg/tcpip/transport/tcp/timer.go index 38240d2d5..fc1c7cbd2 100644 --- a/pkg/tcpip/transport/tcp/timer.go +++ b/pkg/tcpip/transport/tcp/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go index b94568fb1..f1dcd36d5 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go index aaeae9b18..435e136de 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 1f9251de3..db65a4e88 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index b2daaf751..163dcbc13 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go index d80c47e34..25bdd2929 100644 --- a/pkg/tcpip/transport/udp/forwarder.go +++ b/pkg/tcpip/transport/udp/forwarder.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 616a9f388..8b47cce17 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 2f4e94c58..86a8fa19b 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex.go b/pkg/tmutex/tmutex.go index df61d89f5..c4685020d 100644 --- a/pkg/tmutex/tmutex.go +++ b/pkg/tmutex/tmutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go index a4537cb3b..ce34c7962 100644 --- a/pkg/tmutex/tmutex_test.go +++ b/pkg/tmutex/tmutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index 114fb8c5b..2aa1af4ff 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index db5485539..763b23c7c 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go index 1d6ec286c..fa0916439 100644 --- a/pkg/unet/unet_unsafe.go +++ b/pkg/unet/unet_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 719f0e92f..0f155ec74 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/urpc/urpc_test.go b/pkg/urpc/urpc_test.go index f1b9a85ca..5bf2c5ed2 100644 --- a/pkg/urpc/urpc_test.go +++ b/pkg/urpc/urpc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index a6c9dff3c..8a65ed164 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/waiter/waiter_test.go b/pkg/waiter/waiter_test.go index 60853f9c1..c1b94a4f3 100644 --- a/pkg/waiter/waiter_test.go +++ b/pkg/waiter/waiter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index b3499bcde..c1b33c551 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 0c9472f18..99df5e614 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index f1940dd72..ccec3d20c 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/config.go b/runsc/boot/config.go index ba47effc1..b6771de30 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 712c50ee9..ab7c58838 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go index d224d08b7..79f7387ac 100644 --- a/runsc/boot/debug.go +++ b/runsc/boot/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 717adfedd..ffd99f5e9 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index a3d21d963..4e428b49c 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 9c72e3b1a..652da1cef 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go index 67f3101fe..5c5ec4e06 100644 --- a/runsc/boot/filter/extra_filters.go +++ b/runsc/boot/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index fb95283ab..ac5a0f1aa 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index 02a122c95..ba3c1ce87 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index fb197f9b1..17479e0dd 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 07061b9b3..aeb1c52cc 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 32e62cdf7..3364aa5e6 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 75ec19c32..0b5be0a42 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 01578cfc5..9a864ad3f 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 35baa36ad..598ec969e 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index 028bcc1f4..19c7f8fbd 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 2b338b6c6..7431b17d6 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index ecc184f74..548c80e9a 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index ff2fa2fb9..ac937f7bc 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go index e5da021e5..312e5b471 100644 --- a/runsc/cmd/capability.go +++ b/runsc/cmd/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index dd278b32d..ee74d33d8 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index f722df055..96d3c3378 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index ed1dafef1..1a774db04 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index 208cf5304..aa7b1a636 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 30c8fa283..629c198fd 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 3ee9a9b49..000f694c7 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 3206b267a..9039723e9 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go index 4a5b4774a..45fc91016 100644 --- a/runsc/cmd/delete_test.go +++ b/runsc/cmd/delete_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 343461130..67d415733 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index 208d2f74b..c6bc8fc3a 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 718d01067..ad2508405 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go index 686c5e150..6f0f258c0 100644 --- a/runsc/cmd/exec_test.go +++ b/runsc/cmd/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 82487887c..bccb29397 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/gofer_test.go b/runsc/cmd/gofer_test.go index 8e692feb9..cbea7f127 100644 --- a/runsc/cmd/gofer_test.go +++ b/runsc/cmd/gofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index e67f82473..aed5f3291 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index 1dcea2af0..1f5ca2473 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go index 1276f0dbd..0e9ef7fa5 100644 --- a/runsc/cmd/path.go +++ b/runsc/cmd/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index 2c93e5f3e..11b36aa10 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index 060d796f2..3a3e6f17a 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 66b23c38e..27b06713a 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index 5551d1450..9a2ade41e 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index be1c1b678..4d5f5c139 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index 063bd39c5..344da13ba 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 9e2e0c11d..657726251 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index c3ef65ab5..f0d449b19 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 6498dd15c..a55a682f3 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/console/console.go b/runsc/console/console.go index 2eb9a8807..64b23639a 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 0b0dfb4cb..b8af27c15 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/container.go b/runsc/container/container.go index a30c217f7..884bbc0fb 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 603c4d929..9458dbb90 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/hook.go b/runsc/container/hook.go index 6b9e5550a..acae6781e 100644 --- a/runsc/container/hook.go +++ b/runsc/container/hook.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 8922e6dbe..e554237cf 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go index 8f81ed630..9d5a592a5 100644 --- a/runsc/container/shared_volume_test.go +++ b/runsc/container/shared_volume_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/status.go b/runsc/container/status.go index 234ffb0dd..91d9112f1 100644 --- a/runsc/container/status.go +++ b/runsc/container/status.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index b5071ada6..62923f1ef 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 75a087848..a1ad49fb2 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go index 67f3101fe..5c5ec4e06 100644 --- a/runsc/fsgofer/filter/extra_filters.go +++ b/runsc/fsgofer/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go index 7e142b790..553060bc3 100644 --- a/runsc/fsgofer/filter/extra_filters_msan.go +++ b/runsc/fsgofer/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go index 3cd29472a..28555f898 100644 --- a/runsc/fsgofer/filter/extra_filters_race.go +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index c120d57a6..ff8154369 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index c964a2a3b..158f22ddc 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index e74df7ede..695836927 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index 94413db86..58af5e44d 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/main.go b/runsc/main.go index b35726a74..11bc73f75 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 6c6b665a0..2a68d7043 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/network_unsafe.go b/runsc/sandbox/network_unsafe.go index f7447f002..2a2a0fb7e 100644 --- a/runsc/sandbox/network_unsafe.go +++ b/runsc/sandbox/network_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 48a0dafe2..dac35ca0b 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go index 98c3b19c0..1f3afb4e4 100644 --- a/runsc/specutils/fs.go +++ b/runsc/specutils/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 35da789f4..7d194335c 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index ac85bec71..c72207fb4 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 02af6e6ad..2c86fffe8 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/image.go b/runsc/test/image/image.go index bcb6f876f..297f1ab92 100644 --- a/runsc/test/image/image.go +++ b/runsc/test/image/image.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index f7e750d71..0c45602f9 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/mysql.sql b/runsc/test/image/mysql.sql index c1271e719..51554b98d 100644 --- a/runsc/test/image/mysql.sql +++ b/runsc/test/image/mysql.sql @@ -1,4 +1,4 @@ -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.rb b/runsc/test/image/ruby.rb index 25d1ac129..aced49c6d 100644 --- a/runsc/test/image/ruby.rb +++ b/runsc/test/image/ruby.rb @@ -1,4 +1,4 @@ -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.sh b/runsc/test/image/ruby.sh index d3a9b5656..ebe8d5b0e 100644 --- a/runsc/test/image/ruby.sh +++ b/runsc/test/image/ruby.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/install.sh b/runsc/test/install.sh index 32e1e884e..457df2d26 100755 --- a/runsc/test/install.sh +++ b/runsc/test/install.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index d87957e2d..7af064d79 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/integration.go b/runsc/test/integration/integration.go index e15321c87..4cd5f6c24 100644 --- a/runsc/test/integration/integration.go +++ b/runsc/test/integration/integration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 4a2770d48..b2e86aacc 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go index 91839048c..edb6dee1d 100644 --- a/runsc/test/root/cgroup_test.go +++ b/runsc/test/root/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 0deca0532..da2f473b9 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go index 37fe53ba3..3cc176104 100644 --- a/runsc/test/root/crictl_test.go +++ b/runsc/test/root/crictl_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/root.go b/runsc/test/root/root.go index 586ea0fe3..349c752cc 100644 --- a/runsc/test/root/root.go +++ b/runsc/test/root/root.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/busybox.go b/runsc/test/root/testdata/busybox.go index 544571c63..e4dbd2843 100644 --- a/runsc/test/root/testdata/busybox.go +++ b/runsc/test/root/testdata/busybox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/test/root/testdata/containerd_config.go b/runsc/test/root/testdata/containerd_config.go index 949354987..e12f1ec88 100644 --- a/runsc/test/root/testdata/containerd_config.go +++ b/runsc/test/root/testdata/containerd_config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/httpd.go b/runsc/test/root/testdata/httpd.go index f65b1da5d..45d5e33d4 100644 --- a/runsc/test/root/testdata/httpd.go +++ b/runsc/test/root/testdata/httpd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/httpd_mount_paths.go b/runsc/test/root/testdata/httpd_mount_paths.go index 5ca14340e..ac3f4446a 100644 --- a/runsc/test/root/testdata/httpd_mount_paths.go +++ b/runsc/test/root/testdata/httpd_mount_paths.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/sandbox.go b/runsc/test/root/testdata/sandbox.go index 194242a27..0db210370 100644 --- a/runsc/test/root/testdata/sandbox.go +++ b/runsc/test/root/testdata/sandbox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/crictl.go b/runsc/test/testutil/crictl.go index 84bb4475a..4f9ee0c05 100644 --- a/runsc/test/testutil/crictl.go +++ b/runsc/test/testutil/crictl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index b651319ed..29ef505b4 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 79f0a8b6b..6a4c045a8 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil_race.go b/runsc/test/testutil/testutil_race.go index 9267af150..86db6ffa1 100644 --- a/runsc/test/testutil/testutil_race.go +++ b/runsc/test/testutil/testutil_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go index cc7a67816..6fb134558 100644 --- a/runsc/tools/dockercfg/dockercfg.go +++ b/runsc/tools/dockercfg/dockercfg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/version.go b/runsc/version.go index 4894f2de6..ce0573a9b 100644 --- a/runsc/version.go +++ b/runsc/version.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/gtest/gtest.go b/test/syscalls/gtest/gtest.go index dfe5037cd..bdec8eb07 100644 --- a/test/syscalls/gtest/gtest.go +++ b/test/syscalls/gtest/gtest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc index 78baf548e..a7cbee06b 100644 --- a/test/syscalls/linux/32bit.cc +++ b/test/syscalls/linux/32bit.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc index c2bb4a7ce..56377feab 100644 --- a/test/syscalls/linux/accept_bind.cc +++ b/test/syscalls/linux/accept_bind.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc index 1501e526e..b6cdb3f4f 100644 --- a/test/syscalls/linux/accept_bind_stream.cc +++ b/test/syscalls/linux/accept_bind_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/access.cc b/test/syscalls/linux/access.cc index 6ea070a5d..bcc25cef4 100644 --- a/test/syscalls/linux/access.cc +++ b/test/syscalls/linux/access.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/affinity.cc b/test/syscalls/linux/affinity.cc index 81bd9bcb5..f2d8375b6 100644 --- a/test/syscalls/linux/affinity.cc +++ b/test/syscalls/linux/affinity.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc index b96aab9b9..68dc05417 100644 --- a/test/syscalls/linux/aio.cc +++ b/test/syscalls/linux/aio.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc index e0ddbb415..d89269985 100644 --- a/test/syscalls/linux/alarm.cc +++ b/test/syscalls/linux/alarm.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc index 5687ceb86..81bf5a775 100644 --- a/test/syscalls/linux/arch_prctl.cc +++ b/test/syscalls/linux/arch_prctl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc index a2634a8bf..f246a799e 100644 --- a/test/syscalls/linux/bad.cc +++ b/test/syscalls/linux/bad.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/base_poll_test.cc b/test/syscalls/linux/base_poll_test.cc index bba0108ea..ab7a19dd0 100644 --- a/test/syscalls/linux/base_poll_test.cc +++ b/test/syscalls/linux/base_poll_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/base_poll_test.h b/test/syscalls/linux/base_poll_test.h index 9b9b81933..088831f9f 100644 --- a/test/syscalls/linux/base_poll_test.h +++ b/test/syscalls/linux/base_poll_test.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/bind.cc b/test/syscalls/linux/bind.cc index f5aa9c500..de8cca53b 100644 --- a/test/syscalls/linux/bind.cc +++ b/test/syscalls/linux/bind.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/brk.cc b/test/syscalls/linux/brk.cc index 33d353959..a03a44465 100644 --- a/test/syscalls/linux/brk.cc +++ b/test/syscalls/linux/brk.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/chdir.cc b/test/syscalls/linux/chdir.cc index a4b54f0ee..3182c228b 100644 --- a/test/syscalls/linux/chdir.cc +++ b/test/syscalls/linux/chdir.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc index 2f42fe326..79e98597f 100644 --- a/test/syscalls/linux/chmod.cc +++ b/test/syscalls/linux/chmod.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc index ad892cf6a..eb1762ddf 100644 --- a/test/syscalls/linux/chown.cc +++ b/test/syscalls/linux/chown.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc index 6c200f63e..a4354ff62 100644 --- a/test/syscalls/linux/chroot.cc +++ b/test/syscalls/linux/chroot.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/clock_getres.cc b/test/syscalls/linux/clock_getres.cc index 8f8842299..c408b936c 100644 --- a/test/syscalls/linux/clock_getres.cc +++ b/test/syscalls/linux/clock_getres.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc index 4ecb5f5b1..082ae1c39 100644 --- a/test/syscalls/linux/clock_gettime.cc +++ b/test/syscalls/linux/clock_gettime.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/clock_nanosleep.cc b/test/syscalls/linux/clock_nanosleep.cc index 61c67a5ff..52a69d230 100644 --- a/test/syscalls/linux/clock_nanosleep.cc +++ b/test/syscalls/linux/clock_nanosleep.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc index 7978845c1..4e0a13f8b 100644 --- a/test/syscalls/linux/concurrency.cc +++ b/test/syscalls/linux/concurrency.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/creat.cc b/test/syscalls/linux/creat.cc index df2cc0d5c..3c270d6da 100644 --- a/test/syscalls/linux/creat.cc +++ b/test/syscalls/linux/creat.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index a140d3b30..b86ebe233 100644 --- a/test/syscalls/linux/dev.cc +++ b/test/syscalls/linux/dev.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/dup.cc b/test/syscalls/linux/dup.cc index e8de2f4c4..4f773bc75 100644 --- a/test/syscalls/linux/dup.cc +++ b/test/syscalls/linux/dup.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index b4a3bfcba..a4f8f3cec 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc index 8111da30e..5e5c39d44 100644 --- a/test/syscalls/linux/eventfd.cc +++ b/test/syscalls/linux/eventfd.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc index 3f0aa8bf1..0da4c817d 100644 --- a/test/syscalls/linux/exceptions.cc +++ b/test/syscalls/linux/exceptions.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 30bc4b608..06c322a99 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec.h b/test/syscalls/linux/exec.h index b82bfffd1..5c0f7e654 100644 --- a/test/syscalls/linux/exec.h +++ b/test/syscalls/linux/exec.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_assert_closed_workload.cc b/test/syscalls/linux/exec_assert_closed_workload.cc index 4448431e1..95643618d 100644 --- a/test/syscalls/linux/exec_assert_closed_workload.cc +++ b/test/syscalls/linux/exec_assert_closed_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/exec_basic_workload.cc b/test/syscalls/linux/exec_basic_workload.cc index d4bdf511f..1bbd6437e 100644 --- a/test/syscalls/linux/exec_basic_workload.cc +++ b/test/syscalls/linux/exec_basic_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc index c10d85398..bdd6eb10b 100644 --- a/test/syscalls/linux/exec_binary.cc +++ b/test/syscalls/linux/exec_binary.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc index b9a4ac749..b3fbd5042 100644 --- a/test/syscalls/linux/exec_proc_exe_workload.cc +++ b/test/syscalls/linux/exec_proc_exe_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_state_workload.cc b/test/syscalls/linux/exec_state_workload.cc index b66e22565..725c2977f 100644 --- a/test/syscalls/linux/exec_state_workload.cc +++ b/test/syscalls/linux/exec_state_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exit.cc b/test/syscalls/linux/exit.cc index 7246a7b3b..99de2b376 100644 --- a/test/syscalls/linux/exit.cc +++ b/test/syscalls/linux/exit.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exit_script.sh b/test/syscalls/linux/exit_script.sh index f014fcf99..527518e06 100755 --- a/test/syscalls/linux/exit_script.sh +++ b/test/syscalls/linux/exit_script.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fadvise64.cc b/test/syscalls/linux/fadvise64.cc index 041e8b7b6..2af7aa6d9 100644 --- a/test/syscalls/linux/fadvise64.cc +++ b/test/syscalls/linux/fadvise64.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index e51538734..61b8acc7a 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/fault.cc b/test/syscalls/linux/fault.cc index cfa7d0d1f..f6e19026f 100644 --- a/test/syscalls/linux/fault.cc +++ b/test/syscalls/linux/fault.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fchdir.cc b/test/syscalls/linux/fchdir.cc index 2b13e36c3..08bcae1e8 100644 --- a/test/syscalls/linux/fchdir.cc +++ b/test/syscalls/linux/fchdir.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 32a90a163..2f8e7c9dd 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 43f568111..b5b972c07 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc index 1388d3839..d89cfcbd7 100644 --- a/test/syscalls/linux/flock.cc +++ b/test/syscalls/linux/flock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc index 73ac885b5..dd6e1a422 100644 --- a/test/syscalls/linux/fork.cc +++ b/test/syscalls/linux/fork.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc index e8f1dfa8a..e7e9f06a1 100644 --- a/test/syscalls/linux/fpsig_fork.cc +++ b/test/syscalls/linux/fpsig_fork.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fpsig_nested.cc b/test/syscalls/linux/fpsig_nested.cc index 2fa40b42d..395463aed 100644 --- a/test/syscalls/linux/fpsig_nested.cc +++ b/test/syscalls/linux/fpsig_nested.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fsync.cc b/test/syscalls/linux/fsync.cc index b34229248..e7e057f06 100644 --- a/test/syscalls/linux/fsync.cc +++ b/test/syscalls/linux/fsync.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc index c7a709a0a..bfec95466 100644 --- a/test/syscalls/linux/futex.cc +++ b/test/syscalls/linux/futex.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getcpu.cc b/test/syscalls/linux/getcpu.cc index 3a52b25fa..f4d94bd6a 100644 --- a/test/syscalls/linux/getcpu.cc +++ b/test/syscalls/linux/getcpu.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc index e8a7bcd43..d146c8db7 100644 --- a/test/syscalls/linux/getdents.cc +++ b/test/syscalls/linux/getdents.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getrandom.cc b/test/syscalls/linux/getrandom.cc index be5325497..f97f60029 100644 --- a/test/syscalls/linux/getrandom.cc +++ b/test/syscalls/linux/getrandom.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getrusage.cc b/test/syscalls/linux/getrusage.cc index 1ae603858..9bdb1e4cd 100644 --- a/test/syscalls/linux/getrusage.cc +++ b/test/syscalls/linux/getrusage.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index b99d339e5..6a3539e22 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc index c7741a177..c525d41d2 100644 --- a/test/syscalls/linux/ioctl.cc +++ b/test/syscalls/linux/ioctl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index 0a149c2e5..7612919d4 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index cac790e64..6898effb8 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc index ddfbc28fc..57ffd1595 100644 --- a/test/syscalls/linux/itimer.cc +++ b/test/syscalls/linux/itimer.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/kill.cc b/test/syscalls/linux/kill.cc index cd98de41f..18ad923b8 100644 --- a/test/syscalls/linux/kill.cc +++ b/test/syscalls/linux/kill.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc index ed74437bc..a91703070 100644 --- a/test/syscalls/linux/link.cc +++ b/test/syscalls/linux/link.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc index 6a4f1423c..a8af8e545 100644 --- a/test/syscalls/linux/lseek.cc +++ b/test/syscalls/linux/lseek.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc index a79c8c75d..f6ad4d18b 100644 --- a/test/syscalls/linux/madvise.cc +++ b/test/syscalls/linux/madvise.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc index c2513682d..7e103124b 100644 --- a/test/syscalls/linux/memfd.cc +++ b/test/syscalls/linux/memfd.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc index b4b680c34..a6e20f9c3 100644 --- a/test/syscalls/linux/memory_accounting.cc +++ b/test/syscalls/linux/memory_accounting.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc index 9f8033bdf..4ac4cb88f 100644 --- a/test/syscalls/linux/mempolicy.cc +++ b/test/syscalls/linux/mempolicy.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mincore.cc b/test/syscalls/linux/mincore.cc index c572bf5ec..5c1240c89 100644 --- a/test/syscalls/linux/mincore.cc +++ b/test/syscalls/linux/mincore.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc index 50807b68f..cf138d328 100644 --- a/test/syscalls/linux/mkdir.cc +++ b/test/syscalls/linux/mkdir.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc index 361ca299b..b1675b9c7 100644 --- a/test/syscalls/linux/mknod.cc +++ b/test/syscalls/linux/mknod.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc index a492b2404..aee4f7d1a 100644 --- a/test/syscalls/linux/mlock.cc +++ b/test/syscalls/linux/mlock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index a4fb9d1e0..5b5b4c2e8 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index 201b83e87..3a17672aa 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mremap.cc b/test/syscalls/linux/mremap.cc index 01116c1ab..7298d4ca8 100644 --- a/test/syscalls/linux/mremap.cc +++ b/test/syscalls/linux/mremap.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 5afbfce72..ac7146017 100644 --- a/test/syscalls/linux/msync.cc +++ b/test/syscalls/linux/msync.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/munmap.cc b/test/syscalls/linux/munmap.cc index e20039950..067241f4d 100644 --- a/test/syscalls/linux/munmap.cc +++ b/test/syscalls/linux/munmap.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index 22e4666c2..42646bb02 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc index b2cbd63d1..e5a85ef9d 100644 --- a/test/syscalls/linux/open_create.cc +++ b/test/syscalls/linux/open_create.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc index 71288ebc4..83b1ad4e4 100644 --- a/test/syscalls/linux/partial_bad_buffer.cc +++ b/test/syscalls/linux/partial_bad_buffer.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pause.cc b/test/syscalls/linux/pause.cc index 4e1148c24..8c05efd6f 100644 --- a/test/syscalls/linux/pause.cc +++ b/test/syscalls/linux/pause.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index abd10b11b..8698295b3 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc index cd2161bb1..9e5aa7fd0 100644 --- a/test/syscalls/linux/poll.cc +++ b/test/syscalls/linux/poll.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ppoll.cc b/test/syscalls/linux/ppoll.cc index f8c388c00..8245a11e8 100644 --- a/test/syscalls/linux/ppoll.cc +++ b/test/syscalls/linux/ppoll.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc index 854dec714..bce42dc74 100644 --- a/test/syscalls/linux/prctl.cc +++ b/test/syscalls/linux/prctl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc index c1b561464..00dd6523e 100644 --- a/test/syscalls/linux/prctl_setuid.cc +++ b/test/syscalls/linux/prctl_setuid.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc index 4e5bcfcde..5e3eb1735 100644 --- a/test/syscalls/linux/pread64.cc +++ b/test/syscalls/linux/pread64.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc index 4a31123d8..eebd129f2 100644 --- a/test/syscalls/linux/preadv.cc +++ b/test/syscalls/linux/preadv.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc index 58a4f9224..aac960130 100644 --- a/test/syscalls/linux/preadv2.cc +++ b/test/syscalls/linux/preadv2.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/priority.cc b/test/syscalls/linux/priority.cc index 3906c7132..1d9bdfa70 100644 --- a/test/syscalls/linux/priority.cc +++ b/test/syscalls/linux/priority.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/priority_execve.cc b/test/syscalls/linux/priority_execve.cc index 5604bd3d0..5cb343bad 100644 --- a/test/syscalls/linux/priority_execve.cc +++ b/test/syscalls/linux/priority_execve.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 7ba274226..654f26242 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc index 6060d0644..03d0665eb 100644 --- a/test/syscalls/linux/proc_net.cc +++ b/test/syscalls/linux/proc_net.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc index ea7c93012..6d745f728 100644 --- a/test/syscalls/linux/proc_net_unix.cc +++ b/test/syscalls/linux/proc_net_unix.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc index cf5c462f3..7f2e8f203 100644 --- a/test/syscalls/linux/proc_pid_smaps.cc +++ b/test/syscalls/linux/proc_pid_smaps.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc_pid_uid_gid_map.cc b/test/syscalls/linux/proc_pid_uid_gid_map.cc index 96c58c564..df70b7eb9 100644 --- a/test/syscalls/linux/proc_pid_uid_gid_map.cc +++ b/test/syscalls/linux/proc_pid_uid_gid_map.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pselect.cc b/test/syscalls/linux/pselect.cc index 3294f6c14..4e43c4d7f 100644 --- a/test/syscalls/linux/pselect.cc +++ b/test/syscalls/linux/pselect.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index e0c56f1fc..4c212836c 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc index 5b2dc9ccb..0485d187c 100644 --- a/test/syscalls/linux/pty.cc +++ b/test/syscalls/linux/pty.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc index 485b1e48d..e1603fc2d 100644 --- a/test/syscalls/linux/pwrite64.cc +++ b/test/syscalls/linux/pwrite64.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc index a6949f08e..db519f4e0 100644 --- a/test/syscalls/linux/pwritev2.cc +++ b/test/syscalls/linux/pwritev2.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/raw_socket_ipv4.cc b/test/syscalls/linux/raw_socket_ipv4.cc index 8b8d032cb..e20b5cb50 100644 --- a/test/syscalls/linux/raw_socket_ipv4.cc +++ b/test/syscalls/linux/raw_socket_ipv4.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/read.cc b/test/syscalls/linux/read.cc index eb1b5bc10..4430fa3c2 100644 --- a/test/syscalls/linux/read.cc +++ b/test/syscalls/linux/read.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc index 0b933673a..f327ec3a9 100644 --- a/test/syscalls/linux/readv.cc +++ b/test/syscalls/linux/readv.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc index 349b80d7f..35d2dd9e3 100644 --- a/test/syscalls/linux/readv_common.cc +++ b/test/syscalls/linux/readv_common.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv_common.h b/test/syscalls/linux/readv_common.h index e261d545a..b16179fca 100644 --- a/test/syscalls/linux/readv_common.h +++ b/test/syscalls/linux/readv_common.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc index cf22c395e..3c315cc02 100644 --- a/test/syscalls/linux/readv_socket.cc +++ b/test/syscalls/linux/readv_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc index c0cbc7cd9..c9d76c2e2 100644 --- a/test/syscalls/linux/rename.cc +++ b/test/syscalls/linux/rename.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/rlimits.cc b/test/syscalls/linux/rlimits.cc index 7b255d0f6..860f0f688 100644 --- a/test/syscalls/linux/rlimits.cc +++ b/test/syscalls/linux/rlimits.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc index ff948f9d5..81d193ffd 100644 --- a/test/syscalls/linux/rtsignal.cc +++ b/test/syscalls/linux/rtsignal.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sched.cc b/test/syscalls/linux/sched.cc index 60cb6c443..735e99411 100644 --- a/test/syscalls/linux/sched.cc +++ b/test/syscalls/linux/sched.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sched_yield.cc b/test/syscalls/linux/sched_yield.cc index fc45aa5c2..5d24f5b58 100644 --- a/test/syscalls/linux/sched_yield.cc +++ b/test/syscalls/linux/sched_yield.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc index 27740d7ef..e77586852 100644 --- a/test/syscalls/linux/seccomp.cc +++ b/test/syscalls/linux/seccomp.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc index 41e6043cc..88c010aec 100644 --- a/test/syscalls/linux/select.cc +++ b/test/syscalls/linux/select.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc index 1c47b6851..421318fcb 100644 --- a/test/syscalls/linux/semaphore.cc +++ b/test/syscalls/linux/semaphore.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc index 15fd01ff0..2fbb3f4ef 100644 --- a/test/syscalls/linux/sendfile.cc +++ b/test/syscalls/linux/sendfile.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc index e2ccf17ce..66adda515 100644 --- a/test/syscalls/linux/sendfile_socket.cc +++ b/test/syscalls/linux/sendfile_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc index 2c0f9b04a..eb7a3966f 100644 --- a/test/syscalls/linux/shm.cc +++ b/test/syscalls/linux/shm.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigaction.cc b/test/syscalls/linux/sigaction.cc index cdd2dbf31..9a53fd3e0 100644 --- a/test/syscalls/linux/sigaction.cc +++ b/test/syscalls/linux/sigaction.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc index 5741720f4..7d4a12c1d 100644 --- a/test/syscalls/linux/sigaltstack.cc +++ b/test/syscalls/linux/sigaltstack.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigaltstack_check.cc b/test/syscalls/linux/sigaltstack_check.cc index b71f812a8..5ac1b661d 100644 --- a/test/syscalls/linux/sigaltstack_check.cc +++ b/test/syscalls/linux/sigaltstack_check.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc index 1b7cecccb..a47c781ea 100644 --- a/test/syscalls/linux/sigiret.cc +++ b/test/syscalls/linux/sigiret.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc index 1aea1ecb8..654c6a47f 100644 --- a/test/syscalls/linux/sigprocmask.cc +++ b/test/syscalls/linux/sigprocmask.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc index e21d23d51..9c7210e17 100644 --- a/test/syscalls/linux/sigstop.cc +++ b/test/syscalls/linux/sigstop.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc index 1df9c013f..1e5bf5942 100644 --- a/test/syscalls/linux/sigtimedwait.cc +++ b/test/syscalls/linux/sigtimedwait.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc index 639cd4e59..2faf678f7 100644 --- a/test/syscalls/linux/socket_abstract.cc +++ b/test/syscalls/linux/socket_abstract.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc index c1bca467f..00c50d1bf 100644 --- a/test/syscalls/linux/socket_blocking.cc +++ b/test/syscalls/linux/socket_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_blocking.h b/test/syscalls/linux/socket_blocking.h index 5cddee54b..db26e5ef5 100644 --- a/test/syscalls/linux/socket_blocking.h +++ b/test/syscalls/linux/socket_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc index 2653be158..f7cb72df4 100644 --- a/test/syscalls/linux/socket_filesystem.cc +++ b/test/syscalls/linux/socket_filesystem.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index d04d5abe0..f99f3fe62 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_generic.h b/test/syscalls/linux/socket_generic.h index cd826abcf..00ae7bfc3 100644 --- a/test/syscalls/linux/socket_generic.h +++ b/test/syscalls/linux/socket_generic.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 14d7827c2..f86a0f30c 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc index 9cec7a71d..d7fc20aad 100644 --- a/test/syscalls/linux/socket_ip_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 54f00cd9b..5b198f49d 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_generic.h b/test/syscalls/linux/socket_ip_tcp_generic.h index f38500d14..a3eff3c73 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.h +++ b/test/syscalls/linux/socket_ip_tcp_generic.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc index 1963d5deb..2c6ae17bf 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc index 7e36c35d2..831de53b8 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc index 9e2a18d3e..d1ea8ef12 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc index 54053360f..96c1b3b3d 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc index 5bf1de7c6..251817a9f 100644 --- a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index ac15154f2..044394ba7 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_generic.h b/test/syscalls/linux/socket_ip_udp_generic.h index 8b8fc7c6e..106c54e9f 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.h +++ b/test/syscalls/linux/socket_ip_udp_generic.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc index 0e4463649..fc124e9ef 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc index 0c3b669bf..1c3d1c0ad 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc index 7bf8597fe..7554b08d5 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc index 8e1c13ff4..3a068aacf 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h index b23de08d1..fb582b224 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc index 773d84b13..040bb176e 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index c99958ed5..709172580 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.h b/test/syscalls/linux/socket_ipv4_udp_unbound.h index a780c0144..8e07bfbbf 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.h +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc index 9dd9e1bd6..53dcd58cd 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h index 5cf9fa8eb..45e1d37ea 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc index 535a5fa10..ffbb8e6eb 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc index d6a8e428c..cb0105471 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc index b4e9fe51b..6a5fa8965 100644 --- a/test/syscalls/linux/socket_netdevice.cc +++ b/test/syscalls/linux/socket_netdevice.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index ed4ae1c71..c8693225f 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc index edf549544..728d25434 100644 --- a/test/syscalls/linux/socket_netlink_util.cc +++ b/test/syscalls/linux/socket_netlink_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h index 44b1f148c..bea449107 100644 --- a/test/syscalls/linux/socket_netlink_util.h +++ b/test/syscalls/linux/socket_netlink_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_blocking.cc b/test/syscalls/linux/socket_non_blocking.cc index 1bcc6fb7f..73e6dc618 100644 --- a/test/syscalls/linux/socket_non_blocking.cc +++ b/test/syscalls/linux/socket_non_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_non_blocking.h b/test/syscalls/linux/socket_non_blocking.h index 287e096bb..bd3e02fd2 100644 --- a/test/syscalls/linux/socket_non_blocking.h +++ b/test/syscalls/linux/socket_non_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc index d170008a4..3c599b6e8 100644 --- a/test/syscalls/linux/socket_non_stream.cc +++ b/test/syscalls/linux/socket_non_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_stream.h b/test/syscalls/linux/socket_non_stream.h index 02dd2a958..469fbe6a2 100644 --- a/test/syscalls/linux/socket_non_stream.h +++ b/test/syscalls/linux/socket_non_stream.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc index 9e92628c3..76127d181 100644 --- a/test/syscalls/linux/socket_non_stream_blocking.cc +++ b/test/syscalls/linux/socket_non_stream_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_stream_blocking.h b/test/syscalls/linux/socket_non_stream_blocking.h index bde355452..6e205a039 100644 --- a/test/syscalls/linux/socket_non_stream_blocking.h +++ b/test/syscalls/linux/socket_non_stream_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc index c8a8ad0f6..0417dd347 100644 --- a/test/syscalls/linux/socket_stream.cc +++ b/test/syscalls/linux/socket_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream.h b/test/syscalls/linux/socket_stream.h index 35e591e17..b837b8f8c 100644 --- a/test/syscalls/linux/socket_stream.h +++ b/test/syscalls/linux/socket_stream.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc index f0f86c01c..8367460d2 100644 --- a/test/syscalls/linux/socket_stream_blocking.cc +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream_blocking.h b/test/syscalls/linux/socket_stream_blocking.h index 06113ad03..9fd19ff90 100644 --- a/test/syscalls/linux/socket_stream_blocking.h +++ b/test/syscalls/linux/socket_stream_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream_nonblock.cc b/test/syscalls/linux/socket_stream_nonblock.cc index a3202ffe4..b00748b97 100644 --- a/test/syscalls/linux/socket_stream_nonblock.cc +++ b/test/syscalls/linux/socket_stream_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream_nonblock.h b/test/syscalls/linux/socket_stream_nonblock.h index 491f53848..c3b7fad91 100644 --- a/test/syscalls/linux/socket_stream_nonblock.h +++ b/test/syscalls/linux/socket_stream_nonblock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc index 0be23e541..da69de37c 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/syscalls/linux/socket_test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h index dfabdf179..058313986 100644 --- a/test/syscalls/linux/socket_test_util.h +++ b/test/syscalls/linux/socket_test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index fafb23ad1..bb3397fa2 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix.h b/test/syscalls/linux/socket_unix.h index d2a16afb2..3625cc404 100644 --- a/test/syscalls/linux/socket_unix.h +++ b/test/syscalls/linux/socket_unix.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_abstract.cc b/test/syscalls/linux/socket_unix_abstract.cc index c4a3c889c..8241bf997 100644 --- a/test/syscalls/linux/socket_unix_abstract.cc +++ b/test/syscalls/linux/socket_unix_abstract.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc index a69ee027e..9de0f6dfe 100644 --- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc +++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc index 57af118c5..320915b0f 100644 --- a/test/syscalls/linux/socket_unix_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_blocking_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc index 5dd5e6d77..3e0f611d2 100644 --- a/test/syscalls/linux/socket_unix_dgram.cc +++ b/test/syscalls/linux/socket_unix_dgram.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_dgram.h b/test/syscalls/linux/socket_unix_dgram.h index 722a3d8e6..0764ef85b 100644 --- a/test/syscalls/linux/socket_unix_dgram.h +++ b/test/syscalls/linux/socket_unix_dgram.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc index da8f59704..4ba2c80ae 100644 --- a/test/syscalls/linux/socket_unix_dgram_local.cc +++ b/test/syscalls/linux/socket_unix_dgram_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc index 3becb513d..9fe86cee8 100644 --- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc +++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc index f081c601f..fa3efc7f8 100644 --- a/test/syscalls/linux/socket_unix_domain.cc +++ b/test/syscalls/linux/socket_unix_domain.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_filesystem.cc b/test/syscalls/linux/socket_unix_filesystem.cc index 6a67da75f..5dbe67773 100644 --- a/test/syscalls/linux/socket_unix_filesystem.cc +++ b/test/syscalls/linux/socket_unix_filesystem.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc index c13a1e564..137db53c4 100644 --- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc +++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index a565978f9..dafe82494 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_non_stream.h b/test/syscalls/linux/socket_unix_non_stream.h index e4214d949..7478ab172 100644 --- a/test/syscalls/linux/socket_unix_non_stream.h +++ b/test/syscalls/linux/socket_unix_non_stream.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc index 6c435669b..98cf1fe8a 100644 --- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc index c575fdcb2..bacfc11e4 100644 --- a/test/syscalls/linux/socket_unix_pair.cc +++ b/test/syscalls/linux/socket_unix_pair.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc index 1ae7f9b5e..583506f08 100644 --- a/test/syscalls/linux/socket_unix_pair_nonblock.cc +++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc index ad0af77e9..6f6367dd5 100644 --- a/test/syscalls/linux/socket_unix_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_seqpacket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_seqpacket.h b/test/syscalls/linux/socket_unix_seqpacket.h index da8eb2b2b..30d9b9edf 100644 --- a/test/syscalls/linux/socket_unix_seqpacket.h +++ b/test/syscalls/linux/socket_unix_seqpacket.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc index e6484d9b4..b903a9e8f 100644 --- a/test/syscalls/linux/socket_unix_seqpacket_local.cc +++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc index 95f454251..659c93945 100644 --- a/test/syscalls/linux/socket_unix_stream.cc +++ b/test/syscalls/linux/socket_unix_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc index ec0fc6955..ce0f1e50d 100644 --- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc index bf4c5f2eb..6b840189c 100644 --- a/test/syscalls/linux/socket_unix_stream_local.cc +++ b/test/syscalls/linux/socket_unix_stream_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc index df80b105a..ebec4e0ec 100644 --- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc +++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc index b6fe7a9ce..4b5832de8 100644 --- a/test/syscalls/linux/socket_unix_unbound_abstract.cc +++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc index 1ec11a08d..2ddc5c11f 100644 --- a/test/syscalls/linux/socket_unix_unbound_dgram.cc +++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc index d09142aa6..8cb03c450 100644 --- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc +++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc index 21209b244..0575f2e1d 100644 --- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc index b95f9569e..091d546b3 100644 --- a/test/syscalls/linux/socket_unix_unbound_stream.cc +++ b/test/syscalls/linux/socket_unix_unbound_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 746318d09..80ba67496 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc index 8346e9a8e..9b53739a0 100644 --- a/test/syscalls/linux/stat_times.cc +++ b/test/syscalls/linux/stat_times.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc index e1e7fc707..aca51d30f 100644 --- a/test/syscalls/linux/statfs.cc +++ b/test/syscalls/linux/statfs.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc index 58cf0d014..59fb5dfe6 100644 --- a/test/syscalls/linux/sticky.cc +++ b/test/syscalls/linux/sticky.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index 318917f4b..494072a9b 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sync.cc b/test/syscalls/linux/sync.cc index 5b777b6eb..fe479390d 100644 --- a/test/syscalls/linux/sync.cc +++ b/test/syscalls/linux/sync.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sync_file_range.cc b/test/syscalls/linux/sync_file_range.cc index d11f58481..36cc42043 100644 --- a/test/syscalls/linux/sync_file_range.cc +++ b/test/syscalls/linux/sync_file_range.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sysinfo.cc b/test/syscalls/linux/sysinfo.cc index a0dd82640..1a71256da 100644 --- a/test/syscalls/linux/sysinfo.cc +++ b/test/syscalls/linux/sysinfo.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/syslog.cc b/test/syscalls/linux/syslog.cc index 5bd0d1cc3..9a7407d96 100644 --- a/test/syscalls/linux/syslog.cc +++ b/test/syscalls/linux/syslog.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc index 8e10220eb..819fa655a 100644 --- a/test/syscalls/linux/sysret.cc +++ b/test/syscalls/linux/sysret.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 33620a874..e3f9f9f9d 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/temp_umask.h b/test/syscalls/linux/temp_umask.h index f202dfa59..81a25440c 100644 --- a/test/syscalls/linux/temp_umask.h +++ b/test/syscalls/linux/temp_umask.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/tgkill.cc b/test/syscalls/linux/tgkill.cc index 2d258ef11..80acae5de 100644 --- a/test/syscalls/linux/tgkill.cc +++ b/test/syscalls/linux/tgkill.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc index 5a3dfd026..c7eead17e 100644 --- a/test/syscalls/linux/time.cc +++ b/test/syscalls/linux/time.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/timerfd.cc b/test/syscalls/linux/timerfd.cc index b85321795..9df53612f 100644 --- a/test/syscalls/linux/timerfd.cc +++ b/test/syscalls/linux/timerfd.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc index 14506eb12..fd42e81e1 100644 --- a/test/syscalls/linux/timers.cc +++ b/test/syscalls/linux/timers.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc index 3e8ce5327..bae377c69 100644 --- a/test/syscalls/linux/tkill.cc +++ b/test/syscalls/linux/tkill.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc index 2616a9147..e5cc5d97c 100644 --- a/test/syscalls/linux/truncate.cc +++ b/test/syscalls/linux/truncate.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/udp_bind.cc b/test/syscalls/linux/udp_bind.cc index 547eb2a6c..6d92bdbeb 100644 --- a/test/syscalls/linux/udp_bind.cc +++ b/test/syscalls/linux/udp_bind.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index f39281d5c..31db8a2ad 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc index d78a09b1e..bf1ca8679 100644 --- a/test/syscalls/linux/uidgid.cc +++ b/test/syscalls/linux/uidgid.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/uname.cc b/test/syscalls/linux/uname.cc index d22a34bd7..0a5d91017 100644 --- a/test/syscalls/linux/uname.cc +++ b/test/syscalls/linux/uname.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc index 2d7a530b9..6f49e3660 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.cc +++ b/test/syscalls/linux/unix_domain_socket_test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h index 1b09aeae7..aae990245 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.h +++ b/test/syscalls/linux/unix_domain_socket_test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unlink.cc b/test/syscalls/linux/unlink.cc index b10aae025..b6f65e027 100644 --- a/test/syscalls/linux/unlink.cc +++ b/test/syscalls/linux/unlink.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unshare.cc b/test/syscalls/linux/unshare.cc index 9dd6ec4b6..e32619efe 100644 --- a/test/syscalls/linux/unshare.cc +++ b/test/syscalls/linux/unshare.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc index bf776cd93..80716859a 100644 --- a/test/syscalls/linux/utimes.cc +++ b/test/syscalls/linux/utimes.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vdso.cc b/test/syscalls/linux/vdso.cc index 0f6e1c7c6..19c80add8 100644 --- a/test/syscalls/linux/vdso.cc +++ b/test/syscalls/linux/vdso.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc index 0e936594b..759a50569 100644 --- a/test/syscalls/linux/vdso_clock_gettime.cc +++ b/test/syscalls/linux/vdso_clock_gettime.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc index 9999a909e..631a53654 100644 --- a/test/syscalls/linux/vfork.cc +++ b/test/syscalls/linux/vfork.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc index cb6840cc6..2c2303358 100644 --- a/test/syscalls/linux/vsyscall.cc +++ b/test/syscalls/linux/vsyscall.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index fcd606bec..50d0725a7 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc index 7f80b2fa8..9b219cfd6 100644 --- a/test/syscalls/linux/write.cc +++ b/test/syscalls/linux/write.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go index c4af28103..28f312b8b 100644 --- a/test/syscalls/syscall_test_runner.go +++ b/test/syscalls/syscall_test_runner.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/syscall_test_runner.sh b/test/syscalls/syscall_test_runner.sh index 87d62786b..864bb2de4 100755 --- a/test/syscalls/syscall_test_runner.sh +++ b/test/syscalls/syscall_test_runner.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc index d1dd95e76..5d733887b 100644 --- a/test/util/capability_util.cc +++ b/test/util/capability_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/capability_util.h b/test/util/capability_util.h index 8708f5e69..e968a2583 100644 --- a/test/util/capability_util.h +++ b/test/util/capability_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/cleanup.h b/test/util/cleanup.h index fb4724f97..c76482ef4 100644 --- a/test/util/cleanup.h +++ b/test/util/cleanup.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/epoll_util.cc b/test/util/epoll_util.cc index 0b95aa8cd..2e5051468 100644 --- a/test/util/epoll_util.cc +++ b/test/util/epoll_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/epoll_util.h b/test/util/epoll_util.h index 521e7a3d3..f233b37d5 100644 --- a/test/util/epoll_util.h +++ b/test/util/epoll_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/eventfd_util.h b/test/util/eventfd_util.h index 1fdb07d3b..cb9ce829c 100644 --- a/test/util/eventfd_util.h +++ b/test/util/eventfd_util.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/file_descriptor.h b/test/util/file_descriptor.h index be8812d01..fc5caa55b 100644 --- a/test/util/file_descriptor.h +++ b/test/util/file_descriptor.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc index 6bd424417..bc90bd78e 100644 --- a/test/util/fs_util.cc +++ b/test/util/fs_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/fs_util.h b/test/util/fs_util.h index 9412b2f71..eb7cdaa24 100644 --- a/test/util/fs_util.h +++ b/test/util/fs_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/fs_util_test.cc b/test/util/fs_util_test.cc index ce70d58aa..4e12076a1 100644 --- a/test/util/fs_util_test.cc +++ b/test/util/fs_util_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/logging.cc b/test/util/logging.cc index 86ea71df3..cc71d77b0 100644 --- a/test/util/logging.cc +++ b/test/util/logging.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/logging.h b/test/util/logging.h index 6e957b172..589166fab 100644 --- a/test/util/logging.h +++ b/test/util/logging.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/memory_util.h b/test/util/memory_util.h index 8f6e99ba6..8c77778ea 100644 --- a/test/util/memory_util.h +++ b/test/util/memory_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/mount_util.h b/test/util/mount_util.h index 468170646..7782e6bf2 100644 --- a/test/util/mount_util.h +++ b/test/util/mount_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/multiprocess_util.cc b/test/util/multiprocess_util.cc index 12637db8c..95f5f3b4f 100644 --- a/test/util/multiprocess_util.cc +++ b/test/util/multiprocess_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h index ba5f2601f..0aecd3439 100644 --- a/test/util/multiprocess_util.h +++ b/test/util/multiprocess_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/posix_error.cc b/test/util/posix_error.cc index ead9ede16..cebf7e0ac 100644 --- a/test/util/posix_error.cc +++ b/test/util/posix_error.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/posix_error.h b/test/util/posix_error.h index 2a66e2e94..b604f4f8f 100644 --- a/test/util/posix_error.h +++ b/test/util/posix_error.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/posix_error_test.cc b/test/util/posix_error_test.cc index c5427b8e5..d67270842 100644 --- a/test/util/posix_error_test.cc +++ b/test/util/posix_error_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/proc_util.cc b/test/util/proc_util.cc index 2d9eb1986..9d4db37c3 100644 --- a/test/util/proc_util.cc +++ b/test/util/proc_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/proc_util.h b/test/util/proc_util.h index e1ee2db9c..af209a51e 100644 --- a/test/util/proc_util.h +++ b/test/util/proc_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/proc_util_test.cc b/test/util/proc_util_test.cc index 75335415a..71dd2355e 100644 --- a/test/util/proc_util_test.cc +++ b/test/util/proc_util_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/rlimit_util.cc b/test/util/rlimit_util.cc index a9912c372..684253f78 100644 --- a/test/util/rlimit_util.cc +++ b/test/util/rlimit_util.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/rlimit_util.h b/test/util/rlimit_util.h index fa5cc70dc..873252a32 100644 --- a/test/util/rlimit_util.h +++ b/test/util/rlimit_util.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/save_util.cc b/test/util/save_util.cc index 5540e2146..05f52b80d 100644 --- a/test/util/save_util.cc +++ b/test/util/save_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/save_util.h b/test/util/save_util.h index 919e4af3d..90460701e 100644 --- a/test/util/save_util.h +++ b/test/util/save_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/signal_util.cc b/test/util/signal_util.cc index 3e2df32a6..26738864f 100644 --- a/test/util/signal_util.cc +++ b/test/util/signal_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/signal_util.h b/test/util/signal_util.h index 80f1808f6..7fd2af015 100644 --- a/test/util/signal_util.h +++ b/test/util/signal_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index 48ce82d20..c5d8fc635 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/temp_path.h b/test/util/temp_path.h index 33eb6a72c..89302e0fd 100644 --- a/test/util/temp_path.h +++ b/test/util/temp_path.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/test_main.cc b/test/util/test_main.cc index 4c6b5e860..5c7ee0064 100644 --- a/test/util/test_main.cc +++ b/test/util/test_main.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 9b7cfa4dc..c52fd9a4a 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/test_util.h b/test/util/test_util.h index 905412b24..8f5eb5089 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc index 5889651d1..b7300d9e5 100644 --- a/test/util/test_util_test.cc +++ b/test/util/test_util_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/thread_util.h b/test/util/thread_util.h index df09ac8cf..860e77531 100644 --- a/test/util/thread_util.h +++ b/test/util/thread_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/timer_util.cc b/test/util/timer_util.cc index 681fafb69..43a26b0d3 100644 --- a/test/util/timer_util.cc +++ b/test/util/timer_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/timer_util.h b/test/util/timer_util.h index 9bdc51a57..2cebfa5d1 100644 --- a/test/util/timer_util.h +++ b/test/util/timer_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/third_party/gvsync/atomicptr_unsafe.go b/third_party/gvsync/atomicptr_unsafe.go index da9f16240..53a943282 100644 --- a/third_party/gvsync/atomicptr_unsafe.go +++ b/third_party/gvsync/atomicptr_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/atomicptrtest/atomicptr_test.go b/third_party/gvsync/atomicptrtest/atomicptr_test.go index 15d0936d4..8fdc5112e 100644 --- a/third_party/gvsync/atomicptrtest/atomicptr_test.go +++ b/third_party/gvsync/atomicptrtest/atomicptr_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/downgradable_rwmutex_test.go b/third_party/gvsync/downgradable_rwmutex_test.go index 6517dd5dc..40c384b8b 100644 --- a/third_party/gvsync/downgradable_rwmutex_test.go +++ b/third_party/gvsync/downgradable_rwmutex_test.go @@ -1,5 +1,5 @@ // Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go index 131f0a2ba..4d43eb765 100644 --- a/third_party/gvsync/downgradable_rwmutex_unsafe.go +++ b/third_party/gvsync/downgradable_rwmutex_unsafe.go @@ -1,5 +1,5 @@ // Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/gvsync.go b/third_party/gvsync/gvsync.go index 46a2565fd..3bbef13c3 100644 --- a/third_party/gvsync/gvsync.go +++ b/third_party/gvsync/gvsync.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go index d483fc739..4c8aa9ab6 100644 --- a/third_party/gvsync/memmove_unsafe.go +++ b/third_party/gvsync/memmove_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
diff --git a/third_party/gvsync/norace_unsafe.go b/third_party/gvsync/norace_unsafe.go index f9c88d13f..e3852db8c 100644 --- a/third_party/gvsync/norace_unsafe.go +++ b/third_party/gvsync/norace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/race_unsafe.go b/third_party/gvsync/race_unsafe.go index 2cdcdf7f7..13c02a830 100644 --- a/third_party/gvsync/race_unsafe.go +++ b/third_party/gvsync/race_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/seqatomic_unsafe.go b/third_party/gvsync/seqatomic_unsafe.go index ef61503e2..c52d378f1 100644 --- a/third_party/gvsync/seqatomic_unsafe.go +++ b/third_party/gvsync/seqatomic_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/seqatomictest/seqatomic_test.go b/third_party/gvsync/seqatomictest/seqatomic_test.go index d0c373bae..2da73cf96 100644 --- a/third_party/gvsync/seqatomictest/seqatomic_test.go +++ b/third_party/gvsync/seqatomictest/seqatomic_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/third_party/gvsync/seqcount.go b/third_party/gvsync/seqcount.go index c7ae91cfa..2c9c2c3d6 100644 --- a/third_party/gvsync/seqcount.go +++ b/third_party/gvsync/seqcount.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/seqcount_test.go b/third_party/gvsync/seqcount_test.go index ee6579ed8..085e574b3 100644 --- a/third_party/gvsync/seqcount_test.go +++ b/third_party/gvsync/seqcount_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go index eaf5c4970..ca414d8cb 100644 --- a/tools/go_generics/generics.go +++ b/tools/go_generics/generics.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_stmts/input.go b/tools/go_generics/generics_tests/all_stmts/input.go index 19184a3fe..4791d1ff1 100644 --- a/tools/go_generics/generics_tests/all_stmts/input.go +++ b/tools/go_generics/generics_tests/all_stmts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/go_generics/generics_tests/all_stmts/output/output.go b/tools/go_generics/generics_tests/all_stmts/output/output.go index 51582346c..a53d84535 100644 --- a/tools/go_generics/generics_tests/all_stmts/output/output.go +++ b/tools/go_generics/generics_tests/all_stmts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/input.go b/tools/go_generics/generics_tests/all_types/input.go index ed6e97c29..3575d02ec 100644 --- a/tools/go_generics/generics_tests/all_types/input.go +++ b/tools/go_generics/generics_tests/all_types/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/lib/lib.go b/tools/go_generics/generics_tests/all_types/lib/lib.go index 7e73e678e..988786496 100644 --- a/tools/go_generics/generics_tests/all_types/lib/lib.go +++ b/tools/go_generics/generics_tests/all_types/lib/lib.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/output/output.go b/tools/go_generics/generics_tests/all_types/output/output.go index ec09a6be4..41fd147a1 100644 --- a/tools/go_generics/generics_tests/all_types/output/output.go +++ b/tools/go_generics/generics_tests/all_types/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/consts/input.go b/tools/go_generics/generics_tests/consts/input.go index 394bcc262..04b95fcc6 100644 --- a/tools/go_generics/generics_tests/consts/input.go +++ b/tools/go_generics/generics_tests/consts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/consts/output/output.go b/tools/go_generics/generics_tests/consts/output/output.go index 91a07fdc2..18d316cc9 100644 --- a/tools/go_generics/generics_tests/consts/output/output.go +++ b/tools/go_generics/generics_tests/consts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/imports/input.go b/tools/go_generics/generics_tests/imports/input.go index 22e6641a6..0f032c2a1 100644 --- a/tools/go_generics/generics_tests/imports/input.go +++ b/tools/go_generics/generics_tests/imports/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/go_generics/generics_tests/imports/output/output.go b/tools/go_generics/generics_tests/imports/output/output.go index 2555c0004..2488ca58c 100644 --- a/tools/go_generics/generics_tests/imports/output/output.go +++ b/tools/go_generics/generics_tests/imports/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/input.go b/tools/go_generics/generics_tests/remove_typedef/input.go index d9c9b8530..cf632bae7 100644 --- a/tools/go_generics/generics_tests/remove_typedef/input.go +++ b/tools/go_generics/generics_tests/remove_typedef/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/output/output.go b/tools/go_generics/generics_tests/remove_typedef/output/output.go index f111a9426..d44fd8e1c 100644 --- a/tools/go_generics/generics_tests/remove_typedef/output/output.go +++ b/tools/go_generics/generics_tests/remove_typedef/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/simple/input.go b/tools/go_generics/generics_tests/simple/input.go index 711687cf5..2a917f16c 100644 --- a/tools/go_generics/generics_tests/simple/input.go +++ b/tools/go_generics/generics_tests/simple/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/simple/output/output.go b/tools/go_generics/generics_tests/simple/output/output.go index 139c9bf9d..6bfa0b25b 100644 --- a/tools/go_generics/generics_tests/simple/output/output.go +++ b/tools/go_generics/generics_tests/simple/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/globals_visitor.go b/tools/go_generics/globals/globals_visitor.go index daaa17b1d..7ae48c662 100644 --- a/tools/go_generics/globals/globals_visitor.go +++ b/tools/go_generics/globals/globals_visitor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/scope.go b/tools/go_generics/globals/scope.go index b75a91689..96c965ea2 100644 --- a/tools/go_generics/globals/scope.go +++ b/tools/go_generics/globals/scope.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/go_generics/go_generics_unittest.sh b/tools/go_generics/go_generics_unittest.sh index e7553a071..44b22db91 100755 --- a/tools/go_generics/go_generics_unittest.sh +++ b/tools/go_generics/go_generics_unittest.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/go_generics/go_merge/main.go b/tools/go_generics/go_merge/main.go index 2f83facf8..f6a331123 100644 --- a/tools/go_generics/go_merge/main.go +++ b/tools/go_generics/go_merge/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/imports.go b/tools/go_generics/imports.go index 57f7c3dce..3a7230c97 100644 --- a/tools/go_generics/imports.go +++ b/tools/go_generics/imports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/remove.go b/tools/go_generics/remove.go index 139d03955..568a6bbd3 100644 --- a/tools/go_generics/remove.go +++ b/tools/go_generics/remove.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template.go b/tools/go_generics/rules_tests/template.go index f3f31ae8e..aace61da1 100644 --- a/tools/go_generics/rules_tests/template.go +++ b/tools/go_generics/rules_tests/template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template_test.go b/tools/go_generics/rules_tests/template_test.go index 3a38c8629..b2a3446ef 100644 --- a/tools/go_generics/rules_tests/template_test.go +++ b/tools/go_generics/rules_tests/template_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 9e2c8e106..db7a7107b 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/tag_release.sh b/tools/tag_release.sh index 6906a952f..02a49cdf1 100755 --- a/tools/tag_release.sh +++ b/tools/tag_release.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh index a0e646e45..64a905fc9 100755 --- a/tools/workspace_status.sh +++ b/tools/workspace_status.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/barrier.h b/vdso/barrier.h index 5b6c763f6..edba4afb5 100644 --- a/vdso/barrier.h +++ b/vdso/barrier.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/check_vdso.py b/vdso/check_vdso.py index 6f7d7e7ec..e41b09709 100644 --- a/vdso/check_vdso.py +++ b/vdso/check_vdso.py @@ -1,4 +1,4 @@ -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/compiler.h b/vdso/compiler.h index d65f148fb..54a510000 100644 --- a/vdso/compiler.h +++ b/vdso/compiler.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/cycle_clock.h b/vdso/cycle_clock.h index 309e07a3f..5d3fbb257 100644 --- a/vdso/cycle_clock.h +++ b/vdso/cycle_clock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/seqlock.h b/vdso/seqlock.h index ab2f3fda3..7a173174b 100644 --- a/vdso/seqlock.h +++ b/vdso/seqlock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/syscalls.h b/vdso/syscalls.h index 90fb424ce..f5865bb72 100644 --- a/vdso/syscalls.h +++ b/vdso/syscalls.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso.cc b/vdso/vdso.cc index 550729035..6265ad217 100644 --- a/vdso/vdso.cc +++ b/vdso/vdso.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso_time.cc b/vdso/vdso_time.cc index 9fc262f60..1bb4bb86b 100644 --- a/vdso/vdso_time.cc +++ b/vdso/vdso_time.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso_time.h b/vdso/vdso_time.h index 464dadff2..70d079efc 100644 --- a/vdso/vdso_time.h +++ b/vdso/vdso_time.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
-- cgit v1.2.3 From 8bfb83d0acdea553082b897d3fd0ad1c1580eaa9 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 30 Apr 2019 13:55:41 -0700 Subject: Implement async MemoryFile eviction, and use it in CachingInodeOperations. This feature allows MemoryFile to delay eviction of "optional" allocations, such as unused cached file pages. Note that this incidentally makes CachingInodeOperations writeback asynchronous, in the sense that it doesn't occur until eviction; this is necessary because between when a cached page becomes evictable and when it's evicted, file writes (via CachingInodeOperations.Write) may dirty the page. As currently implemented, this feature won't meaningfully impact steady-state memory usage or caching; the reclaimer goroutine will schedule eviction as soon as it runs out of other work to do. Future CLs increase caching by adding constraints on when eviction is scheduled. PiperOrigin-RevId: 246014822 Change-Id: Ia85feb25a2de92a48359eb84434b6ec6f9bea2cb --- pkg/sentry/context/contexttest/contexttest.go | 2 +- pkg/sentry/fs/fsutil/dirty_set.go | 22 ++ pkg/sentry/fs/fsutil/inode_cached.go | 78 +++- pkg/sentry/fs/fsutil/inode_cached_test.go | 8 +- pkg/sentry/kernel/kernel.go | 5 + pkg/sentry/pgalloc/BUILD | 27 ++ pkg/sentry/pgalloc/pgalloc.go | 504 +++++++++++++++++++------- pkg/sentry/pgalloc/save_restore.go | 14 + runsc/boot/loader.go | 2 +- 9 files changed, 511 insertions(+), 151 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index a42038711..210a235d2 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -44,7 +44,7 @@ func Context(tb testing.TB) context.Context { tb.Fatalf("error creating application memory file: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 9cd196d7d..f1451d77a 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -107,6 +107,7 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { var changedAny bool defer func() { if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. ds.MergeRange(mr) } }() @@ -132,6 +133,26 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { } } +// AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the +// effect of a previous call to KeepDirty. (It does not itself mark those +// offsets as not dirty.) +func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { + var changedAny bool + defer func() { + if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. + ds.MergeRange(mr) + } + }() + for seg := ds.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { + if seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = false + } + } +} + // SyncDirty passes pages in the range mr that are stored in cache and // identified as dirty to writeAt, updating dirty to reflect successful writes. 
// If writeAt returns a successful partial write, SyncDirty will call it @@ -142,6 +163,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet var changedDirty bool defer func() { if changedDirty { + // Merge segments split by Isolate to reduce cost of iteration. dirty.MergeRange(mr) } }() diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 919d2534c..76644e69d 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -175,11 +175,22 @@ func (c *CachingInodeOperations) Release() { defer c.mapsMu.Unlock() c.dataMu.Lock() defer c.dataMu.Unlock() - // The cache should be empty (something has gone terribly wrong if we're - // releasing an inode that is still memory-mapped). - if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() { - panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty)) + + // Something has gone terribly wrong if we're releasing an inode that is + // still memory-mapped. + if !c.mappings.IsEmpty() { + panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings)) + } + + // Drop any cached pages that are still awaiting MemoryFile eviction. (This + // means that MemoryFile no longer needs to evict them.) + mf := c.mfp.MemoryFile() + mf.MarkAllUnevictable(c) + if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + panic(fmt.Sprintf("Failed to writeback cached data: %v", err)) } + c.cache.DropAll(mf) + c.dirty.RemoveAll() } // UnstableAttr implements fs.InodeOperations.UnstableAttr. @@ -679,6 +690,13 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error return done, nil } +// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O +// and memory mappings, and false if c.cache may contain data cached from +// c.backingFile. +func (c *CachingInodeOperations) useHostPageCache() bool { + return !c.forcePageCache && c.backingFile.FD() >= 0 +} + // AddMapping implements memmap.Mappable.AddMapping. func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { // Hot path. Avoid defers. @@ -689,7 +707,15 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi for _, r := range mapped { c.hostFileMapper.IncRefOn(r) } - if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 { + if !c.useHostPageCache() { + // c.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. 
+ mf := c.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End}) + } + } + if c.useHostPageCache() && !usage.IncrementalMappedAccounting { for _, r := range mapped { usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) } @@ -706,7 +732,7 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma for _, r := range unmapped { c.hostFileMapper.DecRefOn(r) } - if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { if !usage.IncrementalMappedAccounting { for _, r := range unmapped { usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) @@ -716,17 +742,16 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma return } - // Writeback dirty mapped memory now that there are no longer any - // mappings that reference it. This is our naive memory eviction - // strategy. + // Pages that are no longer referenced by any application memory mappings + // are now considered unused; allow MemoryFile to evict them when + // necessary. mf := c.mfp.MemoryFile() c.dataMu.Lock() for _, r := range unmapped { - if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { - log.Warningf("Failed to writeback cached data %v: %v", r, err) - } - c.cache.Drop(r, mf) - c.dirty.KeepClean(r) + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + c.dirty.AllowClean(r) + mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End}) } c.dataMu.Unlock() c.mapsMu.Unlock() @@ -740,7 +765,7 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp // Translate implements memmap.Mappable.Translate. func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { // Hot path. Avoid defer. - if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { return []memmap.Translation{ { Source: optional, @@ -853,6 +878,29 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error return nil } +// Evict implements pgalloc.EvictableMemoryUser.Evict. +func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := c.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + c.cache.Drop(mgapMR, mf) + c.dirty.KeepClean(mgapMR) + } +} + // IncRef implements platform.File.IncRef. This is used when we directly map an // underlying host fd and CachingInodeOperations is used as the platform.File // during translation. 
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 661ec41f6..3f10efc12 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -311,12 +311,10 @@ func TestRead(t *testing.T) { t.Errorf("Read back bytes %v, want %v", rbuf, buf) } - // Delete the memory mapping and expect it to cause the cached page to be - // uncached. + // Delete the memory mapping before iops.Release(). The cached page will + // either be evicted by ctx's pgalloc.MemoryFile, or dropped by + // iops.Release(). iops.RemoveMapping(ctx, ms, ar, usermem.PageSize, true) - if cached := iops.cache.Span(); cached != 0 { - t.Fatalf("Span got %d, want 0", cached) - } } func TestWrite(t *testing.T) { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0468dd678..91889b573 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -303,7 +303,12 @@ func (k *Kernel) SaveTo(w io.Writer) error { k.pauseTimeLocked() defer k.resumeTimeLocked() + // Evict all evictable MemoryFile allocations. + k.mf.FlushEvictions() + // Flush write operations on open files so data reaches backing storage. + // This must come after k.mf.FlushEvictions() since eviction may cause file + // writes. if err := k.tasks.flushWritesToFiles(ctx); err != nil { return err } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 7efa55c20..8a8a0e4e4 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -3,6 +3,31 @@ package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +go_template_instance( + name = "evictable_range", + out = "evictable_range.go", + package = "pgalloc", + prefix = "Evictable", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "evictable_range_set", + out = "evictable_range_set.go", + package = "pgalloc", + prefix = "evictableRange", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "EvictableRange", + "Value": "evictableRangeSetValue", + "Functions": "evictableRangeSetFunctions", + }, +) + go_template_instance( name = "usage_set", out = "usage_set.go", @@ -27,6 +52,8 @@ go_library( name = "pgalloc", srcs = [ "context.go", + "evictable_range.go", + "evictable_range_set.go", "pgalloc.go", "pgalloc_unsafe.go", "save_restore.go", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 411dafa07..9c1313f6f 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -31,6 +31,7 @@ import ( "time" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -41,6 +42,9 @@ import ( // MemoryFile is a platform.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { + // opts holds options passed to NewMemoryFile. opts is immutable. + opts MemoryFileOpts + // MemoryFile owns a single backing file, which is modeled as follows: // // Each page in the file can be committed or uncommitted. A page is @@ -115,6 +119,24 @@ type MemoryFile struct { // fileSize is protected by mu. fileSize int64 + // Pages from the backing file are mapped into the local address space on + // the granularity of large pieces called chunks. 
mappings is a []uintptr + // that stores, for each chunk, the start address of a mapping of that + // chunk in the current process' address space, or 0 if no such mapping + // exists. Once a chunk is mapped, it is never remapped or unmapped until + // the MemoryFile is destroyed. + // + // Mutating the mappings slice or its contents requires both holding + // mappingsMu and using atomic memory operations. (The slice is mutated + // whenever the file is expanded. Per the above, the only permitted + // mutation of the slice's contents is the assignment of a mapping to a + // chunk that was previously unmapped.) Reading the slice or its contents + // only requires *either* holding mappingsMu or using atomic memory + // operations. This allows MemoryFile.MapInternal to avoid locking in the + // common case where chunk mappings already exist. + mappingsMu sync.Mutex + mappings atomic.Value + // destroyed is set by Destroy to instruct the reclaimer goroutine to // release resources and exit. destroyed is protected by mu. destroyed bool @@ -133,26 +155,44 @@ type MemoryFile struct { // transitions from false to true. reclaimCond sync.Cond - // Pages from the backing file are mapped into the local address space on - // the granularity of large pieces called chunks. mappings is a []uintptr - // that stores, for each chunk, the start address of a mapping of that - // chunk in the current process' address space, or 0 if no such mapping - // exists. Once a chunk is mapped, it is never remapped or unmapped until - // the MemoryFile is destroyed. + // evictable maps EvictableMemoryUsers to eviction state. // - // Mutating the mappings slice or its contents requires both holding - // mappingsMu and using atomic memory operations. (The slice is mutated - // whenever the file is expanded. Per the above, the only permitted - // mutation of the slice's contents is the assignment of a mapping to a - // chunk that was previously unmapped.) Reading the slice or its contents - // only requires *either* holding mappingsMu or using atomic memory - // operations. This allows MemoryFile.MapInternal to avoid locking in the - // common case where chunk mappings already exist. - mappingsMu sync.Mutex - mappings atomic.Value + // evictable is protected by mu. + evictable map[EvictableMemoryUser]*evictableMemoryUserInfo + + // evictionWG counts the number of goroutines currently performing evictions. + evictionWG sync.WaitGroup +} + +// MemoryFileOpts provides options to NewMemoryFile. +type MemoryFileOpts struct { + // DelayedEviction controls the extent to which the MemoryFile may delay + // eviction of evictable allocations. + DelayedEviction DelayedEvictionType } -// usage tracks usage information. +// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. +type DelayedEvictionType int + +const ( + // DelayedEvictionDefault has unspecified behavior. + DelayedEvictionDefault DelayedEvictionType = iota + + // DelayedEvictionDisabled requires that evictable allocations are evicted + // as soon as possible. + DelayedEvictionDisabled + + // DelayedEvictionEnabled requests that the MemoryFile delay eviction of + // evictable allocations until doing so is considered necessary to avoid + // performance degradation due to host memory pressure, or OOM kills. + // + // As of this writing, DelayedEvictionEnabled delays evictions until the + // reclaimer goroutine is out of work (pages to reclaim), then evicts all + // pending evictable allocations immediately. 
+ DelayedEvictionEnabled +) + +// usageInfo tracks usage information. // // +stateify savable type usageInfo struct { @@ -166,6 +206,46 @@ type usageInfo struct { refs uint64 } +// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that +// may be asked to deallocate that memory in the presence of memory pressure. +type EvictableMemoryUser interface { + // Evict requests that the EvictableMemoryUser deallocate memory used by + // er, which was registered as evictable by a previous call to + // MemoryFile.MarkEvictable. + // + // Evict is not required to deallocate memory. In particular, since pgalloc + // must call Evict without holding locks to avoid circular lock ordering, + // it is possible that the passed range has already been marked as + // unevictable by a racing call to MemoryFile.MarkUnevictable. + // Implementations of EvictableMemoryUser must detect such races and handle + // them by making Evict have no effect on unevictable ranges. + // + // After a call to Evict, the MemoryFile will consider the evicted range + // unevictable (i.e. it will not call Evict on the same range again) until + // informed otherwise by a subsequent call to MarkEvictable. + Evict(ctx context.Context, er EvictableRange) +} + +// An EvictableRange represents a range of uint64 offsets in an +// EvictableMemoryUser. +// +// In practice, most EvictableMemoryUsers will probably be implementations of +// memmap.Mappable, and EvictableRange therefore corresponds to +// memmap.MappableRange. However, this package cannot depend on the memmap +// package, since doing so would create a circular dependency. +// +// type EvictableRange + +// evictableMemoryUserInfo is the value type of MemoryFile.evictable. +type evictableMemoryUserInfo struct { + // ranges tracks all evictable ranges for the given user. + ranges evictableRangeSet + + // If evicting is true, there is a goroutine currently evicting all + // evictable ranges for this user. + evicting bool +} + const ( chunkShift = 24 chunkSize = 1 << chunkShift // 16 MB @@ -180,7 +260,15 @@ const ( // NewMemoryFile creates a MemoryFile backed by the given file. If // NewMemoryFile succeeds, ownership of file is transferred to the returned // MemoryFile. -func NewMemoryFile(file *os.File) (*MemoryFile, error) { +func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { + switch opts.DelayedEviction { + case DelayedEvictionDefault: + opts.DelayedEviction = DelayedEvictionEnabled + case DelayedEvictionDisabled, DelayedEvictionEnabled: + default: + return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) + } + // Truncate the file to 0 bytes first to ensure that it's empty. if err := file.Truncate(0); err != nil { return nil, err @@ -189,14 +277,16 @@ func NewMemoryFile(file *os.File) (*MemoryFile, error) { return nil, err } f := &MemoryFile{ + opts: opts, fileSize: initialSize, file: file, // No pages are reclaimable. DecRef will always be able to // decrease minReclaimablePage from this point. 
minReclaimablePage: maxPage, + evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), } - f.reclaimCond.L = &f.mu f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + f.reclaimCond.L = &f.mu go f.runReclaim() // S/R-SAFE: f.mu // The Linux kernel contains an optional feature called "Integrity @@ -434,113 +524,6 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// runReclaim implements the reclaimer goroutine, which continuously decommits -// reclaimable pages in order to reduce memory usage and make them available -// for allocation. -func (f *MemoryFile) runReclaim() { - for { - fr, ok := f.findReclaimable() - if !ok { - break - } - - if err := f.Decommit(fr); err != nil { - log.Warningf("Reclaim failed to decommit %v: %v", fr, err) - // Zero the pages manually. This won't reduce memory usage, but at - // least ensures that the pages will be zero when reallocated. - f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 - } - }) - // Pretend the pages were decommitted even though they weren't, - // since the memory accounting implementation has no idea how to - // deal with this. - f.markDecommitted(fr) - } - f.markReclaimed(fr) - } - // We only get here if findReclaimable finds f.destroyed set and returns - // false. - f.mu.Lock() - defer f.mu.Unlock() - if !f.destroyed { - panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") - } - f.file.Close() - // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd - // that has possibly been reassigned. - f.file = nil - mappings := f.mappings.Load().([]uintptr) - for i, m := range mappings { - if m != 0 { - _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) - if errno != 0 { - log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) - } - } - } - // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) - f.mappings.Store([]uintptr{}) -} - -func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { - f.mu.Lock() - defer f.mu.Unlock() - for { - for { - if f.destroyed { - return platform.FileRange{}, false - } - if f.reclaimable { - break - } - f.reclaimCond.Wait() - } - // Allocate returns the first usable range in offset order and is - // currently a linear scan, so reclaiming from the beginning of the - // file minimizes the expected latency of Allocate. - for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { - if seg.ValuePtr().refs == 0 { - f.minReclaimablePage = seg.End() - return seg.Range(), true - } - } - // No pages are reclaimable. - f.reclaimable = false - f.minReclaimablePage = maxPage - } -} - -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - seg := f.usage.FindSegment(fr.Start) - // All of fr should be mapped to a single uncommitted reclaimable segment - // accounted to System. 
- if !seg.Ok() { - panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) - } - if !seg.Range().IsSupersetOf(fr) { - panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) - } - if got, want := seg.Value(), (usageInfo{ - kind: usage.System, - knownCommitted: false, - refs: 0, - }); got != want { - panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) - } - // Deallocate reclaimed pages. Even though all of seg is reclaimable, the - // caller of markReclaimed may not have decommitted it, so we can only mark - // fr as reclaimed. - f.usage.Remove(f.usage.Isolate(seg, fr)) - if fr.Start < f.minUnallocatedPage { - // We've deallocated at least one lower page. - f.minUnallocatedPage = fr.Start - } -} - // IncRef implements platform.File.IncRef. func (f *MemoryFile) IncRef(fr platform.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { @@ -677,9 +660,82 @@ func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { return mappings, m, nil } -// FD implements platform.File.FD. -func (f *MemoryFile) FD() int { - return int(f.file.Fd()) +// MarkEvictable allows f to request memory deallocation by calling +// user.Evict(er) in the future. +// +// Redundantly marking an already-evictable range as evictable has no effect. +func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + info = &evictableMemoryUserInfo{} + f.evictable[user] = info + } + gap := info.ranges.LowerBoundGap(er.Start) + for gap.Ok() && gap.Start() < er.End { + gapER := gap.Range().Intersect(er) + if gapER.Length() == 0 { + gap = gap.NextGap() + continue + } + gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() + } + if !info.evicting { + switch f.opts.DelayedEviction { + case DelayedEvictionDisabled: + // Kick off eviction immediately. + f.startEvictionGoroutineLocked(user, info) + case DelayedEvictionEnabled: + // Ensure that the reclaimer goroutine is running, so that it can + // start eviction when necessary. + f.reclaimCond.Signal() + } + } +} + +// MarkUnevictable informs f that user no longer considers er to be evictable, +// so the MemoryFile should no longer call user.Evict(er). Note that, per +// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be +// called even after MarkUnevictable returns due to race conditions, and +// implementations of EvictableMemoryUser must handle this possibility. +// +// Redundantly marking an already-unevictable range as unevictable has no +// effect. +func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + seg := info.ranges.LowerBoundSegment(er.Start) + for seg.Ok() && seg.Start() < er.End { + seg = info.ranges.Isolate(seg, er) + seg = info.ranges.Remove(seg).NextSegment() + } + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting && info.ranges.IsEmpty() { + delete(f.evictable, user) + } +} + +// MarkAllUnevictable informs f that user no longer considers any offsets to be +// evictable. It otherwise has the same semantics as MarkUnevictable. 
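// Editorial sketch (not part of this patch): a typical user-side lifecycle,
// with a hypothetical cache type c whose backing pages live in mf:
//
//	mf.MarkEvictable(c, pgalloc.EvictableRange{Start: 0, End: c.size})   // contents are reconstructible
//	...
//	mf.MarkUnevictable(c, pgalloc.EvictableRange{Start: 0, End: c.size}) // contents became precious
//	...
//	mf.MarkAllUnevictable(c) // on destruction
//
// where c satisfies EvictableMemoryUser by releasing the backing pages while
// tolerating races with MarkUnevictable, for example (c.mu, c.mf and the
// hypothetical c.evictable map are assumptions of this sketch):
//
//	func (c *cache) Evict(ctx context.Context, er pgalloc.EvictableRange) {
//		c.mu.Lock()
//		defer c.mu.Unlock()
//		// Offsets already marked unevictable were removed from c.evictable
//		// when MarkUnevictable was called, so a racing Evict finds nothing
//		// to do for them.
//		for off, fr := range c.evictable {
//			if off >= er.Start && off < er.End {
//				c.mf.DecRef(fr)
//				delete(c.evictable, off)
//			}
//		}
//	}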
+func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + info.ranges.RemoveAll() + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting { + delete(f.evictable, user) + } } // UpdateUsage ensures that the memory usage statistics in @@ -889,6 +945,11 @@ func (f *MemoryFile) File() *os.File { return f.file } +// FD implements platform.File.FD. +func (f *MemoryFile) FD() int { + return int(f.file.Fd()) +} + // String implements fmt.Stringer.String. // // Note that because f.String locks f.mu, calling f.String internally @@ -900,6 +961,167 @@ func (f *MemoryFile) String() string { return f.usage.String() } +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable pages in order to reduce memory usage and make them available +// for allocation. +func (f *MemoryFile) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. + f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the pages were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. + f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. + f.file = nil + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + if f.opts.DelayedEviction == DelayedEvictionEnabled { + // No work to do. Evict any pending evictable allocations to + // get more reclaimable pages before going to sleep. + f.startEvictionsLocked() + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() + return seg.Range(), true + } + } + // No pages are reclaimable. 
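	// (Editorial note: the code below clears f.reclaimable, resets
	// f.minReclaimablePage, and the outer loop then re-enters the wait loop
	// above; under DelayedEvictionEnabled that is where pending evictions are
	// started (startEvictionsLocked) before this goroutine sleeps, and the
	// DecRefs those evictions eventually perform make pages reclaimable again
	// and signal reclaimCond to wake it.)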
+ f.reclaimable = false + f.minReclaimablePage = maxPage + } +} + +func (f *MemoryFile) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. + if !seg.Ok() { + panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } +} + +// Preconditions: f.mu must be locked. +func (f *MemoryFile) startEvictionsLocked() { + for user, info := range f.evictable { + // Don't start multiple goroutines to evict the same user's + // allocations. + if !info.evicting { + f.startEvictionGoroutineLocked(user, info) + } + } +} + +// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be +// locked. +func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { + info.evicting = true + f.evictionWG.Add(1) + go func() { // S/R-SAFE: f.evictionWG + defer f.evictionWG.Done() + for { + f.mu.Lock() + info, ok := f.evictable[user] + if !ok { + // This shouldn't happen: only this goroutine is permitted + // to delete this entry. + f.mu.Unlock() + panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) + } + if info.ranges.IsEmpty() { + delete(f.evictable, user) + f.mu.Unlock() + return + } + // Evict from the end of info.ranges, under the assumption that + // if ranges in user start being used again (and are + // consequently marked unevictable), such uses are more likely + // to start from the beginning of user. + seg := info.ranges.LastSegment() + er := seg.Range() + info.ranges.Remove(seg) + // user.Evict() must be called without holding f.mu to avoid + // circular lock ordering. + f.mu.Unlock() + user.Evict(context.Background(), er) + } + }() +} + type usageSetFunctions struct{} func (usageSetFunctions) MinKey() uint64 { @@ -920,3 +1142,27 @@ func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform. func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } + +// evictableRangeSetValue is the value type of evictableRangeSet. 
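// (Editorial note: the set only needs to record which offsets are currently
// evictable, so the value type below is an empty struct and the Merge/Split
// functions that follow are trivial; the concrete evictableRangeSet container
// is presumably generated from these functions by the same segment-set
// template used for the usage set above.)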
+type evictableRangeSetValue struct{} + +type evictableRangeSetFunctions struct{} + +func (evictableRangeSetFunctions) MinKey() uint64 { + return 0 +} + +func (evictableRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { +} + +func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { + return evictableRangeSetValue{}, true +} + +func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { + return evictableRangeSetValue{}, evictableRangeSetValue{} +} diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index cf169af55..9534d1aed 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -28,6 +28,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/state" ) +// FlushEvictions blocks until f has finished evicting all evictable +// allocations. +func (f *MemoryFile) FlushEvictions() { + f.mu.Lock() + f.startEvictionsLocked() + f.mu.Unlock() + f.evictionWG.Wait() +} + // SaveTo writes f's state to the given stream. func (f *MemoryFile) SaveTo(w io.Writer) error { // Wait for reclaim. @@ -40,6 +49,11 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { f.mu.Lock() } + // Ensure that there are no pending evictions. + if len(f.evictable) != 0 { + panic(fmt.Sprintf("evictions still pending for %d users; call FlushEvictions before SaveTo", len(f.evictable))) + } + // Ensure that all pages that contain data have knownCommitted set, since // we only store knownCommitted pages below. zeroPage := make([]byte, usermem.PageSize) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0b5be0a42..05122a6a8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -424,7 +424,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) -- cgit v1.2.3 From bbb65391143d48a4781e48d0875897a857a69d67 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 2 May 2019 17:16:30 -0700 Subject: Add [simple] network support to 'runsc do' Sandbox always runsc with IP 192.168.10.2 and the peer network adds 1 to the address (192.168.10.3). Sandbox IP can be changed using --ip flag. Here a few examples: sudo runsc do curl www.google.com sudo runsc do --ip=10.10.10.2 bash -c "echo 123 | netcat -l -p 8080" PiperOrigin-RevId: 246421277 Change-Id: I7b3dce4af46a57300350dab41cb27e04e4b6e9da --- runsc/cmd/do.go | 160 +++++++++++++++++++++++++++++++++++++++++-- runsc/container/container.go | 2 +- runsc/sandbox/sandbox.go | 2 +- 3 files changed, 156 insertions(+), 8 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 67d415733..842fe2341 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -21,7 +21,10 @@ import ( "io/ioutil" "math/rand" "os" + "os/exec" "path/filepath" + "strconv" + "strings" "syscall" "flag" @@ -38,6 +41,7 @@ import ( type Do struct { root string cwd string + ip string } // Name implements subcommands.Command.Name. @@ -65,7 +69,8 @@ used for testing only. // SetFlags implements subcommands.Command.SetFlags. 
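// Editorial note: per the commit description above, the --ip flag added below
// is used along these lines (the host-side peer/gateway always gets the
// sandbox address plus one, so the default 192.168.10.2 pairs with
// 192.168.10.3):
//
//	sudo runsc do curl www.google.com
//	sudo runsc do --ip=10.10.10.2 bash -c "echo 123 | netcat -l -p 8080"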
func (c *Do) SetFlags(f *flag.FlagSet) { f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) - f.StringVar(&c.cwd, "cwd", ".", `path to the current directory, defaults to the current directory`) + f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory") + f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox") } // Execute implements subcommands.Command.Execute. @@ -112,6 +117,15 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su specutils.LogSpec(spec) + cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) + if conf.Network != boot.NetworkNone { + clean, err := c.setupNet(cid, spec) + if err != nil { + Fatalf("Error setting up network: %v", err) + } + defer clean() + } + out, err := json.Marshal(spec) if err != nil { Fatalf("Error to marshal spec: %v", err) @@ -130,11 +144,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su Fatalf("Error write spec: %v", err) } - // No network support yet. - conf.Network = boot.NetworkNone - - id := fmt.Sprintf("runcs-do-%06d", rand.Int31n(1000000)) - ws, err := container.Run(id, spec, conf, tmpDir, "", "", "") + ws, err := container.Run(cid, spec, conf, tmpDir, "", "", "") if err != nil { Fatalf("running container: %v", err) } @@ -155,3 +165,141 @@ func resolvePath(path string) (string, error) { } return path, nil } + +func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { + dev, err := defaultDevice() + if err != nil { + return nil, err + } + peerIP, err := calculatePeerIP(c.ip) + if err != nil { + return nil, err + } + veth, peer := deviceNames(cid) + + cmds := []string{ + fmt.Sprintf("ip link add %s type veth peer name %s", veth, peer), + + // Setup device outside the namespace. + fmt.Sprintf("ip addr add %s/24 dev %s", peerIP, peer), + fmt.Sprintf("ip link set %s up", peer), + + // Setup device inside the namespace. + fmt.Sprintf("ip netns add %s", cid), + fmt.Sprintf("ip link set %s netns %s", veth, cid), + fmt.Sprintf("ip netns exec %s ip addr add %s/24 dev %s", cid, c.ip, veth), + fmt.Sprintf("ip netns exec %s ip link set %s up", cid, veth), + fmt.Sprintf("ip netns exec %s ip link set lo up", cid), + fmt.Sprintf("ip netns exec %s ip route add default via %s", cid, peerIP), + + // Enable network access. + "sysctl -w net.ipv4.ip_forward=1", + fmt.Sprintf("iptables -t nat -A POSTROUTING -s %s -o %s -j MASQUERADE", c.ip, dev), + fmt.Sprintf("iptables -A FORWARD -i %s -o %s -j ACCEPT", dev, peer), + fmt.Sprintf("iptables -A FORWARD -o %s -i %s -j ACCEPT", dev, peer), + } + + for _, cmd := range cmds { + log.Debugf("Run %q", cmd) + args := strings.Split(cmd, " ") + c := exec.Command(args[0], args[1:]...) 
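		// (Editorial note: for the default --ip=192.168.10.2 and a container
		// id such as "runsc-000123", the host-side commands above expand to,
		// illustratively:
		//
		//	ip link add ve-runsc-000123 type veth peer name vp-runsc-000123
		//	ip addr add 192.168.10.3/24 dev vp-runsc-000123
		//	ip netns add runsc-000123
		//	ip netns exec runsc-000123 ip addr add 192.168.10.2/24 dev ve-runsc-000123
		//
		// with deviceNames and calculatePeerIP below providing the ve-/vp-
		// prefixes and the +1 peer address.)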
+ if err := c.Run(); err != nil { + return nil, fmt.Errorf("failed to run %q: %v", cmd, err) + } + } + + if err := makeFile("/etc/resolv.conf", "nameserver 8.8.8.8\n", spec); err != nil { + return nil, err + } + if err := makeFile("/etc/hostname", cid+"\n", spec); err != nil { + return nil, err + } + hosts := fmt.Sprintf("127.0.0.1\tlocalhost\n%s\t%s\n", c.ip, cid) + if err := makeFile("/etc/hosts", hosts, spec); err != nil { + return nil, err + } + + if spec.Linux == nil { + spec.Linux = &specs.Linux{} + } + netns := specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: filepath.Join("/var/run/netns", cid), + } + spec.Linux.Namespaces = append(spec.Linux.Namespaces, netns) + + return func() { c.cleanNet(cid, dev) }, nil +} + +func (c *Do) cleanNet(cid, dev string) { + veth, peer := deviceNames(cid) + + cmds := []string{ + fmt.Sprintf("ip link delete %s", peer), + fmt.Sprintf("ip netns delete %s", cid), + + fmt.Sprintf("iptables -t nat -D POSTROUTING -s %s/24 -o %s -j MASQUERADE", c.ip, dev), + fmt.Sprintf("iptables -D FORWARD -i %s -o %s -j ACCEPT", dev, veth), + fmt.Sprintf("iptables -D FORWARD -o %s -i %s -j ACCEPT", dev, veth), + } + + for _, cmd := range cmds { + log.Debugf("Run %q", cmd) + args := strings.Split(cmd, " ") + c := exec.Command(args[0], args[1:]...) + if err := c.Run(); err != nil { + log.Warningf("Failed to run %q: %v", cmd, err) + } + } +} + +func deviceNames(cid string) (string, string) { + // Device name is limited to 15 letters. + return "ve-" + cid, "vp-" + cid + +} + +func defaultDevice() (string, error) { + out, err := exec.Command("ip", "route", "list", "default").CombinedOutput() + if err != nil { + return "", err + } + parts := strings.Split(string(out), " ") + if len(parts) < 5 { + return "", fmt.Errorf("malformed %q output: %q", "ip route list default", string(out)) + } + return parts[4], nil +} + +func makeFile(dest, content string, spec *specs.Spec) error { + tmpFile, err := ioutil.TempFile("", filepath.Base(dest)) + if err != nil { + return err + } + if _, err := tmpFile.WriteString(content); err != nil { + return err + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Source: tmpFile.Name(), + Destination: dest, + Type: "bind", + Options: []string{"ro"}, + }) + return nil +} + +func calculatePeerIP(ip string) (string, error) { + parts := strings.Split(ip, ".") + if len(parts) != 4 { + return "", fmt.Errorf("invalid IP format %q", ip) + } + n, err := strconv.Atoi(parts[3]) + if err != nil { + return "", fmt.Errorf("invalid IP format %q: %v", ip, err) + } + n++ + if n > 255 { + n = 1 + } + return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil +} diff --git a/runsc/container/container.go b/runsc/container/container.go index 884bbc0fb..3589272f2 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -906,7 +906,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund // Start the gofer in the given namespace. 
log.Debugf("Starting gofer: %s %v", binPath, args) if err := specutils.StartInNS(cmd, nss); err != nil { - return nil, nil, err + return nil, nil, fmt.Errorf("Gofer: %v", err) } log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index dac35ca0b..9d8cfa451 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -601,7 +601,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr) if err := specutils.StartInNS(cmd, nss); err != nil { - return err + return fmt.Errorf("Sandbox: %v", err) } s.child = true s.Pid = cmd.Process.Pid -- cgit v1.2.3 From c967fbdaa2cda260312f73a3f75744ac1ad11176 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 2 May 2019 19:26:16 -0700 Subject: runsc: move test_app in a separate directory Opensource tools (e. g. https://github.com/fatih/vim-go) can't hanlde more than one golang package in one directory. PiperOrigin-RevId: 246435962 Change-Id: I67487915e3838762424b2d168efc54ae34fb801f --- runsc/container/BUILD | 15 +- runsc/container/container_test.go | 8 +- runsc/container/multi_container_test.go | 6 +- runsc/container/test_app.go | 287 -------------------------------- runsc/container/test_app/BUILD | 15 ++ runsc/container/test_app/test_app.go | 287 ++++++++++++++++++++++++++++++++ 6 files changed, 311 insertions(+), 307 deletions(-) delete mode 100644 runsc/container/test_app.go create mode 100644 runsc/container/test_app/BUILD create mode 100644 runsc/container/test_app/test_app.go (limited to 'runsc') diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 2936b7cdf..13709a0ae 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -37,8 +37,8 @@ go_test( "shared_volume_test.go", ], data = [ - ":test_app", "//runsc", + "//runsc/container/test_app", ], embed = [":container"], shard_count = 5, @@ -61,14 +61,3 @@ go_test( "@org_golang_x_sys//unix:go_default_library", ], ) - -go_binary( - name = "test_app", - testonly = 1, - srcs = ["test_app.go"], - pure = "on", - deps = [ - "//runsc/test/testutil", - "@com_github_google_subcommands//:go_default_library", - ], -) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9458dbb90..269d28448 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -569,7 +569,7 @@ func TestKillPid(t *testing.T) { for _, conf := range configs(overlay) { t.Logf("Running test with conf: %+v", conf) - app, err := testutil.FindFile("runsc/container/test_app") + app, err := testutil.FindFile("runsc/container/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -792,7 +792,7 @@ func TestUnixDomainSockets(t *testing.T) { } defer outputFile.Close() - app, err := testutil.FindFile("runsc/container/test_app") + app, err := testutil.FindFile("runsc/container/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -1471,7 +1471,7 @@ func TestRootNotMount(t *testing.T) { t.Skip("race makes test_app not statically linked") } - appSym, err := testutil.FindFile("runsc/container/test_app") + appSym, err := testutil.FindFile("runsc/container/test_app/test_app") if err != nil { 
t.Fatal("error finding test_app:", err) } @@ -1497,7 +1497,7 @@ func TestRootNotMount(t *testing.T) { } func TestUserLog(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app") + app, err := testutil.FindFile("runsc/container/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index e554237cf..39c4dc03d 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -403,7 +403,7 @@ func TestMultiContainerSignal(t *testing.T) { // TestMultiContainerDestroy checks that container are properly cleaned-up when // they are destroyed. func TestMultiContainerDestroy(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app") + app, err := testutil.FindFile("runsc/container/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -533,7 +533,7 @@ func TestMultiContainerKillAll(t *testing.T) { {killContainer: true}, {killContainer: false}, } { - app, err := testutil.FindFile("runsc/container/test_app") + app, err := testutil.FindFile("runsc/container/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -734,7 +734,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) { // TestMultiContainerGoferStop tests that IO operations continue to work after // containers have been stopped and gofers killed. func TestMultiContainerGoferStop(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app") + app, err := testutil.FindFile("runsc/container/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go deleted file mode 100644 index 62923f1ef..000000000 --- a/runsc/container/test_app.go +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary test_app is like a swiss knife for tests that need to run anything -// inside the sandbox. New functionality can be added with new commands. -package main - -import ( - "context" - "fmt" - "log" - "net" - "os" - "os/exec" - "strconv" - sys "syscall" - "time" - - "flag" - "github.com/google/subcommands" - "gvisor.googlesource.com/gvisor/runsc/test/testutil" -) - -func main() { - subcommands.Register(subcommands.HelpCommand(), "") - subcommands.Register(subcommands.FlagsCommand(), "") - subcommands.Register(new(uds), "") - subcommands.Register(new(taskTree), "") - subcommands.Register(new(forkBomb), "") - subcommands.Register(new(reaper), "") - subcommands.Register(new(syscall), "") - - flag.Parse() - - exitCode := subcommands.Execute(context.Background()) - os.Exit(int(exitCode)) -} - -type uds struct { - fileName string - socketPath string -} - -// Name implements subcommands.Command.Name. -func (*uds) Name() string { - return "uds" -} - -// Synopsis implements subcommands.Command.Synopsys. 
-func (*uds) Synopsis() string { - return "creates unix domain socket client and server. Client sends a contant flow of sequential numbers. Server prints them to --file" -} - -// Usage implements subcommands.Command.Usage. -func (*uds) Usage() string { - return "uds " -} - -// SetFlags implements subcommands.Command.SetFlags. -func (c *uds) SetFlags(f *flag.FlagSet) { - f.StringVar(&c.fileName, "file", "", "name of output file") - f.StringVar(&c.socketPath, "socket", "", "path to socket") -} - -// Execute implements subcommands.Command.Execute. -func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if c.fileName == "" || c.socketPath == "" { - log.Fatal("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath) - return subcommands.ExitFailure - } - outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666) - if err != nil { - log.Fatal("error opening output file:", err) - } - - defer os.Remove(c.socketPath) - - listener, err := net.Listen("unix", c.socketPath) - if err != nil { - log.Fatal("error listening on socket %q:", c.socketPath, err) - } - - go server(listener, outputFile) - for i := 0; ; i++ { - conn, err := net.Dial("unix", c.socketPath) - if err != nil { - log.Fatal("error dialing:", err) - } - if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { - log.Fatal("error writing:", err) - } - conn.Close() - time.Sleep(100 * time.Millisecond) - } -} - -func server(listener net.Listener, out *os.File) { - buf := make([]byte, 16) - - for { - c, err := listener.Accept() - if err != nil { - log.Fatal("error accepting connection:", err) - } - nr, err := c.Read(buf) - if err != nil { - log.Fatal("error reading from buf:", err) - } - data := buf[0:nr] - fmt.Fprint(out, string(data)+"\n") - } -} - -type taskTree struct { - depth int - width int - pause bool -} - -// Name implements subcommands.Command. -func (*taskTree) Name() string { - return "task-tree" -} - -// Synopsis implements subcommands.Command. -func (*taskTree) Synopsis() string { - return "creates a tree of tasks" -} - -// Usage implements subcommands.Command. -func (*taskTree) Usage() string { - return "task-tree " -} - -// SetFlags implements subcommands.Command. -func (c *taskTree) SetFlags(f *flag.FlagSet) { - f.IntVar(&c.depth, "depth", 1, "number of levels to create") - f.IntVar(&c.width, "width", 1, "number of tasks at each level") - f.BoolVar(&c.pause, "pause", false, "whether the tasks should pause perpetually") -} - -// Execute implements subcommands.Command. -func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - stop := testutil.StartReaper() - defer stop() - - if c.depth == 0 { - log.Printf("Child sleeping, PID: %d\n", os.Getpid()) - select {} - } - log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) - - var cmds []*exec.Cmd - for i := 0; i < c.width; i++ { - cmd := exec.Command( - "/proc/self/exe", c.Name(), - "--depth", strconv.Itoa(c.depth-1), - "--width", strconv.Itoa(c.width), - "--pause", strconv.FormatBool(c.pause)) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - if err := cmd.Start(); err != nil { - log.Fatal("failed to call self:", err) - } - cmds = append(cmds, cmd) - } - - for _, c := range cmds { - c.Wait() - } - - if c.pause { - select {} - } - - return subcommands.ExitSuccess -} - -type forkBomb struct { - delay time.Duration -} - -// Name implements subcommands.Command. 
-func (*forkBomb) Name() string { - return "fork-bomb" -} - -// Synopsis implements subcommands.Command. -func (*forkBomb) Synopsis() string { - return "creates child process until the end of times" -} - -// Usage implements subcommands.Command. -func (*forkBomb) Usage() string { - return "fork-bomb " -} - -// SetFlags implements subcommands.Command. -func (c *forkBomb) SetFlags(f *flag.FlagSet) { - f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child") -} - -// Execute implements subcommands.Command. -func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - time.Sleep(c.delay) - - cmd := exec.Command("/proc/self/exe", c.Name()) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - log.Fatal("failed to call self:", err) - } - return subcommands.ExitSuccess -} - -type reaper struct{} - -// Name implements subcommands.Command. -func (*reaper) Name() string { - return "reaper" -} - -// Synopsis implements subcommands.Command. -func (*reaper) Synopsis() string { - return "reaps all children in a loop" -} - -// Usage implements subcommands.Command. -func (*reaper) Usage() string { - return "reaper " -} - -// SetFlags implements subcommands.Command. -func (*reaper) SetFlags(*flag.FlagSet) {} - -// Execute implements subcommands.Command. -func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - stop := testutil.StartReaper() - defer stop() - select {} -} - -type syscall struct { - sysno uint64 -} - -// Name implements subcommands.Command. -func (*syscall) Name() string { - return "syscall" -} - -// Synopsis implements subcommands.Command. -func (*syscall) Synopsis() string { - return "syscall makes a syscall" -} - -// Usage implements subcommands.Command. -func (*syscall) Usage() string { - return "syscall " -} - -// SetFlags implements subcommands.Command. -func (s *syscall) SetFlags(f *flag.FlagSet) { - f.Uint64Var(&s.sysno, "syscall", 0, "syscall to call") -} - -// Execute implements subcommands.Command. -func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if _, _, errno := sys.Syscall(uintptr(s.sysno), 0, 0, 0); errno != 0 { - fmt.Printf("syscall(%d, 0, 0...) failed: %v\n", s.sysno, errno) - } else { - fmt.Printf("syscall(%d, 0, 0...) success\n", s.sysno) - } - return subcommands.ExitSuccess -} diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD new file mode 100644 index 000000000..054705ed7 --- /dev/null +++ b/runsc/container/test_app/BUILD @@ -0,0 +1,15 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + +package(licenses = ["notice"]) + +go_binary( + name = "test_app", + testonly = 1, + srcs = ["test_app.go"], + pure = "on", + visibility = ["//runsc/container:__pkg__"], + deps = [ + "//runsc/test/testutil", + "@com_github_google_subcommands//:go_default_library", + ], +) diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go new file mode 100644 index 000000000..62923f1ef --- /dev/null +++ b/runsc/container/test_app/test_app.go @@ -0,0 +1,287 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary test_app is like a swiss knife for tests that need to run anything +// inside the sandbox. New functionality can be added with new commands. +package main + +import ( + "context" + "fmt" + "log" + "net" + "os" + "os/exec" + "strconv" + sys "syscall" + "time" + + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +func main() { + subcommands.Register(subcommands.HelpCommand(), "") + subcommands.Register(subcommands.FlagsCommand(), "") + subcommands.Register(new(uds), "") + subcommands.Register(new(taskTree), "") + subcommands.Register(new(forkBomb), "") + subcommands.Register(new(reaper), "") + subcommands.Register(new(syscall), "") + + flag.Parse() + + exitCode := subcommands.Execute(context.Background()) + os.Exit(int(exitCode)) +} + +type uds struct { + fileName string + socketPath string +} + +// Name implements subcommands.Command.Name. +func (*uds) Name() string { + return "uds" +} + +// Synopsis implements subcommands.Command.Synopsys. +func (*uds) Synopsis() string { + return "creates unix domain socket client and server. Client sends a contant flow of sequential numbers. Server prints them to --file" +} + +// Usage implements subcommands.Command.Usage. +func (*uds) Usage() string { + return "uds " +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *uds) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.fileName, "file", "", "name of output file") + f.StringVar(&c.socketPath, "socket", "", "path to socket") +} + +// Execute implements subcommands.Command.Execute. +func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if c.fileName == "" || c.socketPath == "" { + log.Fatal("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath) + return subcommands.ExitFailure + } + outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666) + if err != nil { + log.Fatal("error opening output file:", err) + } + + defer os.Remove(c.socketPath) + + listener, err := net.Listen("unix", c.socketPath) + if err != nil { + log.Fatal("error listening on socket %q:", c.socketPath, err) + } + + go server(listener, outputFile) + for i := 0; ; i++ { + conn, err := net.Dial("unix", c.socketPath) + if err != nil { + log.Fatal("error dialing:", err) + } + if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { + log.Fatal("error writing:", err) + } + conn.Close() + time.Sleep(100 * time.Millisecond) + } +} + +func server(listener net.Listener, out *os.File) { + buf := make([]byte, 16) + + for { + c, err := listener.Accept() + if err != nil { + log.Fatal("error accepting connection:", err) + } + nr, err := c.Read(buf) + if err != nil { + log.Fatal("error reading from buf:", err) + } + data := buf[0:nr] + fmt.Fprint(out, string(data)+"\n") + } +} + +type taskTree struct { + depth int + width int + pause bool +} + +// Name implements subcommands.Command. +func (*taskTree) Name() string { + return "task-tree" +} + +// Synopsis implements subcommands.Command. 
+func (*taskTree) Synopsis() string { + return "creates a tree of tasks" +} + +// Usage implements subcommands.Command. +func (*taskTree) Usage() string { + return "task-tree " +} + +// SetFlags implements subcommands.Command. +func (c *taskTree) SetFlags(f *flag.FlagSet) { + f.IntVar(&c.depth, "depth", 1, "number of levels to create") + f.IntVar(&c.width, "width", 1, "number of tasks at each level") + f.BoolVar(&c.pause, "pause", false, "whether the tasks should pause perpetually") +} + +// Execute implements subcommands.Command. +func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + + if c.depth == 0 { + log.Printf("Child sleeping, PID: %d\n", os.Getpid()) + select {} + } + log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) + + var cmds []*exec.Cmd + for i := 0; i < c.width; i++ { + cmd := exec.Command( + "/proc/self/exe", c.Name(), + "--depth", strconv.Itoa(c.depth-1), + "--width", strconv.Itoa(c.width), + "--pause", strconv.FormatBool(c.pause)) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Start(); err != nil { + log.Fatal("failed to call self:", err) + } + cmds = append(cmds, cmd) + } + + for _, c := range cmds { + c.Wait() + } + + if c.pause { + select {} + } + + return subcommands.ExitSuccess +} + +type forkBomb struct { + delay time.Duration +} + +// Name implements subcommands.Command. +func (*forkBomb) Name() string { + return "fork-bomb" +} + +// Synopsis implements subcommands.Command. +func (*forkBomb) Synopsis() string { + return "creates child process until the end of times" +} + +// Usage implements subcommands.Command. +func (*forkBomb) Usage() string { + return "fork-bomb " +} + +// SetFlags implements subcommands.Command. +func (c *forkBomb) SetFlags(f *flag.FlagSet) { + f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child") +} + +// Execute implements subcommands.Command. +func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + time.Sleep(c.delay) + + cmd := exec.Command("/proc/self/exe", c.Name()) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + log.Fatal("failed to call self:", err) + } + return subcommands.ExitSuccess +} + +type reaper struct{} + +// Name implements subcommands.Command. +func (*reaper) Name() string { + return "reaper" +} + +// Synopsis implements subcommands.Command. +func (*reaper) Synopsis() string { + return "reaps all children in a loop" +} + +// Usage implements subcommands.Command. +func (*reaper) Usage() string { + return "reaper " +} + +// SetFlags implements subcommands.Command. +func (*reaper) SetFlags(*flag.FlagSet) {} + +// Execute implements subcommands.Command. +func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + select {} +} + +type syscall struct { + sysno uint64 +} + +// Name implements subcommands.Command. +func (*syscall) Name() string { + return "syscall" +} + +// Synopsis implements subcommands.Command. +func (*syscall) Synopsis() string { + return "syscall makes a syscall" +} + +// Usage implements subcommands.Command. +func (*syscall) Usage() string { + return "syscall " +} + +// SetFlags implements subcommands.Command. 
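// Editorial note: the flags registered by these subcommands make the helper
// usable from tests roughly as follows (illustrative invocations):
//
//	test_app uds --file=/tmp/out.txt --socket=/tmp/test.sock
//	test_app task-tree --depth=2 --width=3 --pause=true
//	test_app fork-bomb --delay=200ms
//	test_app syscall --syscall=39   // 39 is getpid on amd64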
+func (s *syscall) SetFlags(f *flag.FlagSet) { + f.Uint64Var(&s.sysno, "syscall", 0, "syscall to call") +} + +// Execute implements subcommands.Command. +func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if _, _, errno := sys.Syscall(uintptr(s.sysno), 0, 0, 0); errno != 0 { + fmt.Printf("syscall(%d, 0, 0...) failed: %v\n", s.sysno, errno) + } else { + fmt.Printf("syscall(%d, 0, 0...) success\n", s.sysno) + } + return subcommands.ExitSuccess +} -- cgit v1.2.3 From 5f8225c009fcf297139c54c7b329da4aff679ece Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 2 May 2019 19:33:19 -0700 Subject: runsc: don't create an empty network namespace if NetworkHost is set With this change, we will be able to run runsc do in a host network namespace. PiperOrigin-RevId: 246436660 Change-Id: I8ea18b1053c88fe2feed74239b915fe7a151ce34 --- runsc/sandbox/sandbox.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'runsc') diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 9d8cfa451..bc69a9d61 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -472,6 +472,8 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund if ns, ok := specutils.GetNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) + } else if conf.Network == boot.NetworkHost { + log.Infof("Sandbox will be started in the host network namespace") } else { log.Infof("Sandbox will be started in new network namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) -- cgit v1.2.3 From 95614bbefa2f4657c77b2040630088fdec7f5dd1 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 3 May 2019 09:53:26 -0700 Subject: Increase timeout to wait for port to become available TestHttpd fails sporadically waiting for the port on slow machines. PiperOrigin-RevId: 246525277 Change-Id: Ie0ea71e3c4664d24f580eabd8f7461e47079f734 --- runsc/test/image/image_test.go | 4 ++-- runsc/test/integration/integration_test.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 0c45602f9..8322dd001 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -103,7 +103,7 @@ func TestHttpd(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 5*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } @@ -137,7 +137,7 @@ func TestNginx(t *testing.T) { } // Wait until it's up and running. 
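	// (Editorial aside: testutil.WaitForHTTP is not shown in this diff; a
	// helper of this shape is typically a poll loop along the lines of the
	// sketch below, so the longer timeout just allows more attempts on slow
	// machines without delaying fast ones.)
	//
	//	func WaitForHTTP(port int, timeout time.Duration) error {
	//		url := fmt.Sprintf("http://localhost:%d/", port)
	//		deadline := time.Now().Add(timeout)
	//		for time.Now().Before(deadline) {
	//			if resp, err := http.Get(url); err == nil {
	//				resp.Body.Close()
	//				return nil
	//			}
	//			time.Sleep(100 * time.Millisecond)
	//		}
	//		return fmt.Errorf("timed out waiting for HTTP on port %d", port)
	//	}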
- if err := testutil.WaitForHTTP(port, 5*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index b2e86aacc..842f05545 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -68,7 +68,7 @@ func TestLifeCycle(t *testing.T) { if err != nil { t.Fatal("docker.FindPort(80) failed: ", err) } - if err := testutil.WaitForHTTP(port, 5*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { t.Fatal("WaitForHTTP() timeout:", err) } client := http.Client{Timeout: time.Duration(2 * time.Second)} -- cgit v1.2.3 From 3f3e3a63033f87dd42076423661b62c04d10c15f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 3 May 2019 11:20:12 -0700 Subject: gvisor/kokoro: save runsc logs PiperOrigin-RevId: 246542315 Change-Id: Ia9ba2bc104e0af3277d3b6102122c13d320ea802 --- kokoro/continuous.cfg | 2 ++ kokoro/presubmit.cfg | 1 + kokoro/run_tests.sh | 3 +++ runsc/test/testutil/docker.go | 23 +++++++++++++++++++++-- 4 files changed, 27 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/kokoro/continuous.cfg b/kokoro/continuous.cfg index a834db198..8da47736a 100644 --- a/kokoro/continuous.cfg +++ b/kokoro/continuous.cfg @@ -7,5 +7,7 @@ action { define_artifacts { regex: "**/sponge_log.xml" regex: "**/sponge_log.log" + regex: "**/outputs.zip" + regex: "**/runsc-logs.tar.gz" } } diff --git a/kokoro/presubmit.cfg b/kokoro/presubmit.cfg index 2d8ab76d6..8da47736a 100644 --- a/kokoro/presubmit.cfg +++ b/kokoro/presubmit.cfg @@ -8,5 +8,6 @@ action { regex: "**/sponge_log.xml" regex: "**/sponge_log.log" regex: "**/outputs.zip" + regex: "**/runsc-logs.tar.gz" } } diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 08f678e39..c5c6a7780 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -183,6 +183,9 @@ upload_test_artifacts() { find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" | tar --create --files-from - --transform 's/test\./sponge_log./' | tar --extract --directory ${KOKORO_ARTIFACTS_DIR} + if [[ -d "/tmp/${RUNTIME}/logs" ]]; then + tar --create --gzip "--file=${KOKORO_ARTIFACTS_DIR}/runsc-logs.tar.gz" -C /tmp/ ${RUNTIME}/logs + fi } # Finish runs at exit, even in the event of an error, and uploads all test diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 29ef505b4..ecd66dc77 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -120,7 +120,7 @@ func getLocalPath(file string) string { // do executes docker command. func do(args ...string) (string, error) { - fmt.Printf("Running: docker %s\n", args) + log.Printf("Running: docker %s\n", args) cmd := exec.Command("docker", args...) out, err := cmd.CombinedOutput() if err != nil { @@ -131,7 +131,7 @@ func do(args ...string) (string, error) { // doWithPty executes docker command with stdio attached to a pty. func doWithPty(args ...string) (*exec.Cmd, *os.File, error) { - fmt.Printf("Running with pty: docker %s\n", args) + log.Printf("Running with pty: docker %s\n", args) cmd := exec.Command("docker", args...) ptmx, err := pty.Start(cmd) if err != nil { @@ -160,11 +160,23 @@ func MakeDocker(namePrefix string) Docker { return Docker{Name: RandomName(namePrefix), Runtime: getRuntime()} } +// logDockerID logs a container id, which is needed to find container runsc logs. 
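// Editorial example (not part of this change): the helper is used from tests
// roughly as below, so logging the container ID on Create/Run/CleanUp ties
// each test container back to its runsc log files:
//
//	d := testutil.MakeDocker("my-test")
//	if err := d.Run("alpine", "sleep", "1000"); err != nil {
//		t.Fatal("docker run failed:", err)
//	}
//	defer d.CleanUp()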
+func (d *Docker) logDockerID() { + id, err := d.ID() + if err != nil { + log.Printf("%v\n", err) + } + log.Printf("Name: %s ID: %v\n", d.Name, id) +} + // Create calls 'docker create' with the arguments provided. func (d *Docker) Create(args ...string) error { a := []string{"create", "--runtime", d.Runtime, "--name", d.Name} a = append(a, args...) _, err := do(a...) + if err == nil { + d.logDockerID() + } return err } @@ -190,6 +202,9 @@ func (d *Docker) Run(args ...string) error { a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"} a = append(a, args...) _, err := do(a...) + if err == nil { + d.logDockerID() + } return err } @@ -206,6 +221,9 @@ func (d *Docker) RunFg(args ...string) (string, error) { a := []string{"run", "--runtime", d.Runtime, "--name", d.Name} a = append(a, args...) out, err := do(a...) + if err == nil { + d.logDockerID() + } return string(out), err } @@ -255,6 +273,7 @@ func (d *Docker) Remove() error { // CleanUp kills and deletes the container (best effort). func (d *Docker) CleanUp() { + d.logDockerID() if _, err := do("kill", d.Name); err != nil { log.Printf("error killing container %q: %v", d.Name, err) } -- cgit v1.2.3 From 4edd6f5ccfc5e48defae26145548b745903ca492 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 3 May 2019 11:42:25 -0700 Subject: runsc: add a bazel target to build a debian package $ dpkg -s runsc Package: runsc Status: install ok installed Priority: optional Section: contrib/devel Maintainer: The gVisor Authors Architecture: amd64 Version: 20190304.1-123-g861434f612ce-dirty Description: gVisor is a user-space kernel, written in Go, that implements a substantial portion of the Linux system surface. It includes an Open Container Initiative (OCI) runtime called runsc that provides an isolation boundary between the application and the host kernel. The runsc runtime integrates with Docker and Kubernetes, making it simple to run sandboxed containers. 
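(Editorial aside: assuming a standard Bazel checkout, the package introduced
below is built and installed with something like

    bazel build //runsc:runsc-debian
    sudo dpkg -i bazel-bin/runsc/runsc-debian.deb   # exact output name may vary

after which the postinst script registers the runtime with Docker, provided
/etc/docker/daemon.json already exists.)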
Homepage: https://gvisor.dev/ Built-Using: Bazel Change-Id: I6f161de8fba649f12272a87b99529ccfd22e499a PiperOrigin-RevId: 246546294 --- runsc/BUILD | 45 +++++++++++++++++++++++++++++++++++++++++++++ runsc/debian/description | 5 +++++ runsc/debian/postinst.sh | 24 ++++++++++++++++++++++++ runsc/tools/dockercfg/BUILD | 4 +--- 4 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 runsc/debian/description create mode 100755 runsc/debian/postinst.sh (limited to 'runsc') diff --git a/runsc/BUILD b/runsc/BUILD index eb7503502..4d2046ed3 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,6 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar") go_binary( name = "runsc", @@ -52,3 +53,47 @@ go_binary( "@com_github_google_subcommands//:go_default_library", ], ) + +pkg_tar( + name = "runsc-bin", + srcs = [":runsc"], + mode = "0755", + package_dir = "/usr/bin", + strip_prefix = "/runsc/linux_amd64_pure_stripped", +) + +pkg_tar( + name = "runsc-tools", + srcs = ["//runsc/tools/dockercfg"], + mode = "0755", + package_dir = "/usr/libexec/runsc", + strip_prefix = "/runsc/tools/dockercfg/linux_amd64_stripped", +) + +pkg_tar( + name = "debian-data", + extension = "tar.gz", + deps = [ + ":runsc-bin", + ":runsc-tools", + ], +) + +genrule( + name = "deb-version", + outs = ["version.txt"], + cmd = "cat bazel-out/volatile-status.txt | grep VERSION | sed 's/^[^0-9]*//' >$@", + stamp = 1, +) + +pkg_deb( + name = "runsc-debian", + architecture = "amd64", + data = ":debian-data", + description_file = "debian/description", + homepage = "https://gvisor.dev/", + maintainer = "The gVisor Authors ", + package = "runsc", + postinst = "debian/postinst.sh", + version_file = ":version.txt", +) diff --git a/runsc/debian/description b/runsc/debian/description new file mode 100644 index 000000000..6e3b1b2c0 --- /dev/null +++ b/runsc/debian/description @@ -0,0 +1,5 @@ +gVisor is a user-space kernel, written in Go, that implements a substantial +portion of the Linux system surface. It includes an Open Container Initiative +(OCI) runtime called runsc that provides an isolation boundary between the +application and the host kernel. The runsc runtime integrates with Docker and +Kubernetes, making it simple to run sandboxed containers. diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh new file mode 100755 index 000000000..03a5ff524 --- /dev/null +++ b/runsc/debian/postinst.sh @@ -0,0 +1,24 @@ +#!/bin/sh -e + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
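# (Editorial note, not part of the packaged script: the `dockercfg runtime-add
# runsc /usr/bin/runsc` call below is expected to leave /etc/docker/daemon.json
# with an entry along these lines,
#
#   {
#     "runtimes": {
#       "runsc": { "path": "/usr/bin/runsc" }
#     }
#   }
#
# and Docker is then restarted so the daemon picks up the new runtime.)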
+ +if [ "$1" != configure ]; then + exit 0 +fi + +if [ -f /etc/docker/daemon.json ]; then + /usr/libexec/runsc/dockercfg runtime-add runsc /usr/bin/runsc + systemctl restart docker +fi diff --git a/runsc/tools/dockercfg/BUILD b/runsc/tools/dockercfg/BUILD index fd406ab93..5cff917ed 100644 --- a/runsc/tools/dockercfg/BUILD +++ b/runsc/tools/dockercfg/BUILD @@ -5,8 +5,6 @@ package(licenses = ["notice"]) go_binary( name = "dockercfg", srcs = ["dockercfg.go"], - visibility = [ - "//runsc/test:__subpackages__", - ], + visibility = ["//visibility:public"], deps = ["@com_github_google_subcommands//:go_default_library"], ) -- cgit v1.2.3 From 24d8656585e6072ff7d5a00a7eb4bd25cba42dc4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 3 May 2019 14:00:31 -0700 Subject: gofer: don't leak file descriptors Fixes #219 PiperOrigin-RevId: 246568639 Change-Id: Ic7afd15dde922638d77f6429c508d1cbe2e4288a --- pkg/sentry/fs/gofer/cache_policy.go | 3 ++- pkg/sentry/fs/gofer/path.go | 4 ++++ runsc/fsgofer/fsgofer.go | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 35cd0c1d6..c59344589 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -139,11 +139,12 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child // TODO(b/112031682): If we have a directory FD in the parent // inodeOperations, then we can use fstatat(2) to get the inode // attributes instead of making this RPC. - qids, _, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) + qids, f, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { // Can't look up the name. Trigger reload. return true } + f.close(ctx) // If the Path has changed, then we are not looking at the file file. // We must reload. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 4cbf9e9d9..aa3d3aaa6 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -109,6 +109,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string hostFile, err := newFile.create(ctx, name, openFlags, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) if err != nil { // Could not create the file. + newFile.close(ctx) return nil, err } @@ -120,11 +121,14 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string qids, unopened, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { newFile.close(ctx) + hostFile.Close() return nil, err } if len(qids) != 1 { log.Warningf("WalkGetAttr(%s) succeeded, but returned %d QIDs (%v), wanted 1", name, len(qids), qids) newFile.close(ctx) + hostFile.Close() + unopened.close(ctx) return nil, syserror.EIO } qid := qids[0] diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 158f22ddc..3a0806837 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -502,6 +502,9 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { last := l for _, name := range names { f, path, err := openAnyFileFromParent(last, name) + if last != l { + last.Close() + } if err != nil { return nil, nil, extractErrno(err) } -- cgit v1.2.3 From bf0ac565d2873069799082ad7bc3e3c43acbc593 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 3 May 2019 21:40:48 -0700 Subject: Fix runsc restore to be compatible with docker start --checkpoint ... 
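(Editorial note: the Docker workflow this targets is the experimental
checkpoint feature, roughly

    docker checkpoint create <container> <checkpoint-name>
    docker start --checkpoint <checkpoint-name> <container>

which drives runsc checkpoint/restore underneath. The diff below reuses the
sandbox's existing network stack on restore and dup()s the platform device FD,
since ownership of the original descriptor cannot be taken away from an
os.File.)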
Change-Id: I02b30de13f1393df66edf8829fedbf32405d18f8 PiperOrigin-RevId: 246621192 --- runsc/boot/config.go | 3 ++ runsc/boot/controller.go | 21 ++++++++------ runsc/boot/fs.go | 9 ++++-- runsc/cmd/do.go | 2 +- runsc/cmd/restore.go | 18 ++++-------- runsc/cmd/run.go | 6 +++- runsc/container/container.go | 30 +++++++++++++++++--- runsc/container/container_test.go | 10 +++---- runsc/test/install.sh | 3 +- runsc/test/integration/integration_test.go | 44 ++++++++++++++++++++++++++++++ runsc/test/testutil/docker.go | 16 +++++++++++ runsc/tools/dockercfg/dockercfg.go | 6 +++- 12 files changed, 131 insertions(+), 37 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/config.go b/runsc/boot/config.go index b6771de30..15f624f9b 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -213,6 +213,9 @@ type Config struct { // ProfileEnable is set to prepare the sandbox to be profiled. ProfileEnable bool + // RestoreFile is the path to the saved container image + RestoreFile string + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ab7c58838..86f06bff1 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -19,6 +19,7 @@ import ( "fmt" "os" "path" + "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" @@ -304,12 +305,17 @@ type RestoreOpts struct { func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") - var specFile, deviceFile *os.File + var specFile *os.File + deviceFD := -1 switch numFiles := len(o.FilePayload.Files); numFiles { case 2: - // The device file is donated to the platform, so don't Close - // it here. - deviceFile = o.FilePayload.Files[1] + var err error + // The device file is donated to the platform. + // Can't take ownership away from os.File. dup them to get a new FD. + deviceFD, err = syscall.Dup(int(o.FilePayload.Files[1].Fd())) + if err != nil { + return fmt.Errorf("failed to dup file: %v", err) + } fallthrough case 1: specFile = o.FilePayload.Files[0] @@ -320,11 +326,12 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("at most two files may be passed to Restore") } + networkStack := cm.l.k.NetworkStack() // Destroy the old kernel and create a new kernel. cm.l.k.Pause() cm.l.k.Destroy() - p, err := createPlatform(cm.l.conf, int(deviceFile.Fd())) + p, err := createPlatform(cm.l.conf, deviceFD) if err != nil { return fmt.Errorf("creating platform: %v", err) } @@ -347,10 +354,6 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. - networkStack, err := newEmptyNetworkStack(cm.l.conf, k) - if err != nil { - return fmt.Errorf("creating network: %v", err) - } if eps, ok := networkStack.(*epsocket.Stack); ok { stack.StackFromEnv = eps.Stack // FIXME(b/36201077) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index aeb1c52cc..1611dda2c 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -187,7 +187,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // createRootMount creates the root filesystem. 
func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. - mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} + mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay} var ( rootInode *fs.Inode @@ -419,7 +419,7 @@ func mountDevice(m specs.Mount) string { // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { - fsName, opts, _, err := getMountNameAndOptions(conf, m, fds) + fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) // Return the error or nil that corresponds to the default case in getMountNameAndOptions. if err != nil { @@ -436,6 +436,9 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f Flags: mountFlags(m.Options), DataString: strings.Join(opts, ","), } + if useOverlay { + newMount.Flags.ReadOnly = true + } renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) log.Infof("Added mount at %q: %+v", fsName, newMount) return nil @@ -453,7 +456,7 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) opts := p9MountOptions(fd, conf.FileAccess) mf := fs.MountSourceFlags{} - if spec.Root.Readonly { + if spec.Root.Readonly || conf.Overlay { mf.ReadOnly = true } diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 842fe2341..c5e72f32b 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -144,7 +144,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su Fatalf("Error write spec: %v", err) } - ws, err := container.Run(cid, spec, conf, tmpDir, "", "", "") + ws, err := container.Run(cid, spec, conf, tmpDir, "", "", "", false) if err != nil { Fatalf("running container: %v", err) } diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 27b06713a..3ab2f5676 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -33,6 +33,9 @@ type Restore struct { // imagePath is the path to the saved container image imagePath string + + // detach indicates that runsc has to start a process and exit without waiting it. + detach bool } // Name implements subcommands.Command.Name. @@ -55,10 +58,9 @@ func (*Restore) Usage() string { func (r *Restore) SetFlags(f *flag.FlagSet) { r.Create.SetFlags(f) f.StringVar(&r.imagePath, "image-path", "", "directory path to saved container image") + f.BoolVar(&r.detach, "detach", false, "detach from the container's process") // Unimplemented flags necessary for compatibility with docker. 
- var d bool - f.BoolVar(&d, "detach", false, "ignored") var nsr bool f.BoolVar(&nsr, "no-subreaper", false, "ignored") @@ -92,17 +94,9 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ Fatalf("image-path flag must be provided") } - restoreFile := filepath.Join(r.imagePath, checkpointFileName) - - c, err := container.Load(conf.RootDir, id) - if err != nil { - Fatalf("loading container: %v", err) - } - if err := c.Restore(spec, conf, restoreFile); err != nil { - Fatalf("restoring container: %v", err) - } + conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName) - ws, err := c.Wait() + ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach) if err != nil { Fatalf("running container: %v", err) } diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 4d5f5c139..c228b4f93 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -29,6 +29,9 @@ import ( type Run struct { // Run flags are a super-set of those for Create. Create + + // detach indicates that runsc has to start a process and exit without waiting it. + detach bool } // Name implements subcommands.Command.Name. @@ -49,6 +52,7 @@ func (*Run) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (r *Run) SetFlags(f *flag.FlagSet) { + f.BoolVar(&r.detach, "detach", false, "detach from the container's process") r.Create.SetFlags(f) } @@ -73,7 +77,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } specutils.LogSpec(spec) - ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog) + ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach) if err != nil { Fatalf("running container: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 3589272f2..513085836 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -445,6 +445,14 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str return err } + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container" -OCI spec. + if c.Spec.Hooks != nil { + if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { + return err + } + } + if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil { return err } @@ -453,7 +461,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str } // Run is a helper that calls Create + Start + Wait. -func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string) (syscall.WaitStatus, error) { +func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string, detach bool) (syscall.WaitStatus, error) { log.Debugf("Run container %q in root dir: %s", id, conf.RootDir) c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, userLog) if err != nil { @@ -461,10 +469,24 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke } // Clean up partially created container if an error ocurrs. // Any errors returned by Destroy() itself are ignored. 
- defer c.Destroy() + cu := specutils.MakeCleanup(func() { + c.Destroy() + }) + defer cu.Clean() - if err := c.Start(conf); err != nil { - return 0, fmt.Errorf("starting container: %v", err) + if conf.RestoreFile != "" { + log.Debugf("Restore: %v", conf.RestoreFile) + if err := c.Restore(spec, conf, conf.RestoreFile); err != nil { + return 0, fmt.Errorf("starting container: %v", err) + } + } else { + if err := c.Start(conf); err != nil { + return 0, fmt.Errorf("starting container: %v", err) + } + } + if detach { + cu.Release() + return 0, nil } return c.Wait() } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 269d28448..dcd9910a0 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -210,7 +210,7 @@ func run(spec *specs.Spec, conf *boot.Config) error { defer os.RemoveAll(bundleDir) // Create, start and wait for the container. - ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "", false) if err != nil { return fmt.Errorf("running container: %v", err) } @@ -416,7 +416,7 @@ func TestExePath(t *testing.T) { t.Fatalf("exec: %s, error setting up container: %v", test.path, err) } - ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "", false) os.RemoveAll(rootDir) os.RemoveAll(bundleDir) @@ -449,7 +449,7 @@ func TestAppExitStatus(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - ws, err := Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir, "", "", "") + ws, err := Run(testutil.UniqueContainerID(), succSpec, conf, bundleDir, "", "", "", false) if err != nil { t.Fatalf("error running container: %v", err) } @@ -468,7 +468,7 @@ func TestAppExitStatus(t *testing.T) { defer os.RemoveAll(rootDir2) defer os.RemoveAll(bundleDir2) - ws, err = Run(testutil.UniqueContainerID(), errSpec, conf, bundleDir2, "", "", "") + ws, err = Run(testutil.UniqueContainerID(), errSpec, conf, bundleDir2, "", "", "", false) if err != nil { t.Fatalf("error running container: %v", err) } @@ -1519,7 +1519,7 @@ func TestUserLog(t *testing.T) { userLog := filepath.Join(dir, "user.log") // Create, start and wait for the container. - ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", userLog) + ws, err := Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", userLog, false) if err != nil { t.Fatalf("error running container: %v", err) } diff --git a/runsc/test/install.sh b/runsc/test/install.sh index 457df2d26..8f05dea20 100755 --- a/runsc/test/install.sh +++ b/runsc/test/install.sh @@ -76,7 +76,8 @@ if [[ ${uninstall} == 0 ]]; then sudo -n chmod a+wx "${logdir}" declare -r args="--debug-log '${logdir}/' --debug --strace --log-packets" - sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" ${args} + # experimental is needed to checkpoint/restore. 
+ sudo -n "${dockercfg}" --experimental=true runtime-add "${runtime}" "${runsc}" ${args} sudo -n "${dockercfg}" runtime-add "${runtime}"-kvm "${runsc}" --platform=kvm ${args} sudo -n "${dockercfg}" runtime-add "${runtime}"-hostnet "${runsc}" --network=host ${args} sudo -n "${dockercfg}" runtime-add "${runtime}"-overlay "${runsc}" --overlay ${args} diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 842f05545..de17dd3c2 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -148,6 +148,50 @@ func TestPauseResume(t *testing.T) { } } +func TestCheckpointRestore(t *testing.T) { + if !testutil.IsPauseResumeSupported() { + t.Log("Pause/resume is not supported, skipping test.") + return + } + if err := testutil.Pull("google/python-hello"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("save-restore-test") + if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + if err := d.Checkpoint("test"); err != nil { + t.Fatal("docker checkpoint failed:", err) + } + + if _, err := d.Wait(30 * time.Second); err != nil { + t.Fatal(err) + } + + if err := d.Restore("test"); err != nil { + t.Fatal("docker restore failed:", err) + } + + // Find where port 8080 is mapped to. + port, err := d.FindPort(8080) + if err != nil { + t.Fatal("docker.FindPort(8080) failed:", err) + } + + // Wait until it's up and running. + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + t.Fatal("WaitForHTTP() timeout:", err) + } + + // Check if container is working again. + client := http.Client{Timeout: time.Duration(2 * time.Second)} + if err := httpRequestSucceeds(client, "localhost", port); err != nil { + t.Error("http request failed:", err) + } +} + // Create client and server that talk to each other using the local IP. func TestConnectToSelf(t *testing.T) { d := testutil.MakeDocker("connect-to-self-test") diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index ecd66dc77..e103e930c 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -263,6 +263,22 @@ func (d *Docker) Unpause() error { return nil } +// Checkpoint calls 'docker checkpoint'. +func (d *Docker) Checkpoint(name string) error { + if _, err := do("checkpoint", "create", d.Name, name); err != nil { + return fmt.Errorf("error pausing container %q: %v", d.Name, err) + } + return nil +} + +// Restore calls 'docker start --checkname [name]'. +func (d *Docker) Restore(name string) error { + if _, err := do("start", "--checkpoint", name, d.Name); err != nil { + return fmt.Errorf("error starting container %q: %v", d.Name, err) + } + return nil +} + // Remove calls 'docker rm'. 
func (d *Docker) Remove() error { if _, err := do("rm", d.Name); err != nil { diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go index 6fb134558..eb9dbd421 100644 --- a/runsc/tools/dockercfg/dockercfg.go +++ b/runsc/tools/dockercfg/dockercfg.go @@ -28,7 +28,8 @@ import ( ) var ( - configFile = flag.String("config_file", "/etc/docker/daemon.json", "path to Docker daemon config file") + configFile = flag.String("config_file", "/etc/docker/daemon.json", "path to Docker daemon config file") + experimental = flag.Bool("experimental", false, "enable experimental features") ) func main() { @@ -96,6 +97,9 @@ func (r *runtimeAdd) Execute(_ context.Context, f *flag.FlagSet, args ...interfa rts = make(map[string]interface{}) c["runtimes"] = rts } + if *experimental { + c["experimental"] = true + } rts[name] = runtime{Path: path, RuntimeArgs: runtimeArgs} if err := writeConfig(c, *configFile); err != nil { -- cgit v1.2.3 From bfd9f75ba4390de824d2c3d44c15bdca9dd0ff35 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 8 May 2019 14:34:01 -0700 Subject: Set the FilesytemType in MountSource from the Filesystem. And stop storing the Filesystem in the MountSource. This allows us to decouple the MountSource filesystem type from the name of the filesystem. PiperOrigin-RevId: 247292982 Change-Id: I49cbcce3c17883b7aa918ba76203dfd6d1b03cc8 --- pkg/sentry/fs/mount.go | 11 +++++++---- pkg/sentry/fs/proc/mounts.go | 12 ++---------- pkg/tcpip/link/muxed/BUILD | 4 +++- pkg/tcpip/transport/raw/BUILD | 4 +++- runsc/BUILD | 4 +++- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index a169ea4c9..9740f1fc6 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -110,9 +110,8 @@ type MountSource struct { // MountSourceOperations defines filesystem specific behavior. MountSourceOperations - // Filesystem is the filesystem backing the mount. Can be nil if there - // is no filesystem backing the mount. - Filesystem Filesystem + // FilesystemType is the type of the filesystem backing this mount. + FilesystemType string // Flags are the flags that this filesystem was mounted with. Flags MountSourceFlags @@ -158,10 +157,14 @@ const DefaultDirentCacheSize uint64 = 1000 // NewMountSource returns a new MountSource. Filesystem may be nil if there is no // filesystem backing the mount. func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags MountSourceFlags) *MountSource { + fsType := "none" + if filesystem != nil { + fsType = filesystem.Name() + } return &MountSource{ MountSourceOperations: mops, Flags: flags, - Filesystem: filesystem, + FilesystemType: fsType, fscache: NewDirentCache(DefaultDirentCacheSize), children: make(map[*MountSource]struct{}), } diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 37ed30724..b5e01301f 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -139,11 +139,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se fmt.Fprintf(&buf, "- ") // (9) Filesystem type. - name := "none" - if m.Filesystem != nil { - name = m.Filesystem.Name() - } - fmt.Fprintf(&buf, "%s ", name) + fmt.Fprintf(&buf, "%s ", m.FilesystemType) // (10) Mount source: filesystem-specific information or "none". 
fmt.Fprintf(&buf, "none ") @@ -190,11 +186,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan if m.Flags.ReadOnly { opts = "ro" } - name := "none" - if m.Filesystem != nil { - name = m.Filesystem.Name() - } - fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, name, opts, 0, 0) + fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, m.FilesystemType, opts, 0, 0) }) return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0 diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD index f991dca83..84cfae784 100644 --- a/pkg/tcpip/link/muxed/BUILD +++ b/pkg/tcpip/link/muxed/BUILD @@ -1,6 +1,8 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) go_library( name = "muxed", diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 52f6b9759..6d3f0130e 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -1,4 +1,6 @@ -package(licenses = ["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/runsc/BUILD b/runsc/BUILD index 4d2046ed3..af8e928c5 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,4 +1,6 @@ -package(licenses = ["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar") -- cgit v1.2.3 From 1bee43be13549b01e18d87df194ac219845de5cf Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 9 May 2019 15:34:44 -0700 Subject: Implement fallocate(2) Closes #225 PiperOrigin-RevId: 247508791 Change-Id: I04f47cf2770b30043e5a272aba4ba6e11d0476cc --- pkg/abi/linux/file.go | 11 +++ pkg/p9/BUILD | 1 + pkg/p9/client_file.go | 12 +++ pkg/p9/file.go | 4 + pkg/p9/handlers.go | 34 +++++++++ pkg/p9/local_server/local_server.go | 5 ++ pkg/p9/messages.go | 59 ++++++++++++++ pkg/p9/p9.go | 81 ++++++++++++++++++++ pkg/p9/version.go | 7 +- pkg/sentry/fs/ashmem/device.go | 1 + pkg/sentry/fs/binder/binder.go | 1 + pkg/sentry/fs/dev/full.go | 3 +- pkg/sentry/fs/dev/null.go | 9 ++- pkg/sentry/fs/dev/random.go | 9 ++- pkg/sentry/fs/fsutil/BUILD | 1 + pkg/sentry/fs/fsutil/host_mappable.go | 10 ++- pkg/sentry/fs/fsutil/inode.go | 25 ++++++ pkg/sentry/fs/fsutil/inode_cached.go | 28 +++++++ pkg/sentry/fs/fsutil/inode_cached_test.go | 9 +++ pkg/sentry/fs/gofer/context_file.go | 7 ++ pkg/sentry/fs/gofer/inode.go | 24 ++++++ pkg/sentry/fs/host/inode.go | 18 +++++ pkg/sentry/fs/inode.go | 7 ++ pkg/sentry/fs/inode_operations.go | 4 + pkg/sentry/fs/inode_overlay.go | 7 ++ pkg/sentry/fs/inode_overlay_test.go | 1 + pkg/sentry/fs/mock.go | 5 ++ pkg/sentry/fs/proc/inode.go | 1 + pkg/sentry/fs/proc/seqfile/seqfile.go | 3 +- pkg/sentry/fs/proc/uid_gid_map.go | 3 +- pkg/sentry/fs/ramfs/dir.go | 1 + pkg/sentry/fs/ramfs/socket.go | 3 +- pkg/sentry/fs/ramfs/symlink.go | 5 +- pkg/sentry/fs/sys/devices.go | 3 +- pkg/sentry/fs/tmpfs/inode_file.go | 27 +++++++ pkg/sentry/fs/tmpfs/tmpfs.go | 8 +- pkg/sentry/fs/tty/dir.go | 5 +- pkg/sentry/kernel/pipe/node.go | 4 + pkg/sentry/syscalls/linux/sys_file.go | 38 ++++++++- runsc/boot/compat.go | 2 +- runsc/fsgofer/filter/config.go | 10 ++- runsc/fsgofer/fsgofer.go | 12 +++ test/syscalls/linux/BUILD | 2 + test/syscalls/linux/fallocate.cc | 123 +++++++++++++++++++++++++----- 44 
files changed, 589 insertions(+), 44 deletions(-) (limited to 'runsc') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 753fec3ed..81ff9fe9e 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -254,3 +254,14 @@ const ( F_SEAL_GROW = 0x0004 // Prevent file from growing. F_SEAL_WRITE = 0x0008 // Prevent writes. ) + +// Constants related to fallocate(2). Source: include/uapi/linux/falloc.h +const ( + FALLOC_FL_KEEP_SIZE = 0x01 + FALLOC_FL_PUNCH_HOLE = 0x02 + FALLOC_FL_NO_HIDE_STALE = 0x04 + FALLOC_FL_COLLAPSE_RANGE = 0x08 + FALLOC_FL_ZERO_RANGE = 0x10 + FALLOC_FL_INSERT_RANGE = 0x20 + FALLOC_FL_UNSHARE_RANGE = 0x40 +) diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 5d972309d..36b2ec5f6 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/fd", "//pkg/log", "//pkg/unet", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 63c65129a..471c3a80b 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -171,6 +171,18 @@ func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error { return c.client.sendRecv(&Tsetattr{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattr{}) } +// Allocate implements File.Allocate. +func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error { + if atomic.LoadUint32(&c.closed) != 0 { + return syscall.EBADF + } + if !versionSupportsTallocate(c.client.version) { + return syscall.EOPNOTSUPP + } + + return c.client.sendRecv(&Tallocate{FID: c.fid, Mode: mode, Offset: offset, Length: length}, &Rallocate{}) +} + // Remove implements File.Remove. // // N.B. This method is no longer part of the file interface and should be diff --git a/pkg/p9/file.go b/pkg/p9/file.go index a52a0f3e7..89e814d50 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -89,6 +89,10 @@ type File interface { // On the server, SetAttr has a write concurrency guarantee. SetAttr(valid SetAttrMask, attr SetAttr) error + // Allocate allows the caller to directly manipulate the allocated disk space + // for the file. See fallocate(2) for more details. + Allocate(mode AllocateMode, offset, length uint64) error + // Close is called when all references are dropped on the server side, // and Close should be called by the client to drop all references. // diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 6da2ce4e3..533ead98a 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -877,6 +877,40 @@ func (t *Tsetattr) handle(cs *connState) message { return &Rsetattr{} } +// handle implements handler.handle. +func (t *Tallocate) handle(cs *connState) message { + // Lookup the FID. + ref, ok := cs.LookupFID(t.FID) + if !ok { + return newErr(syscall.EBADF) + } + defer ref.DecRef() + + if err := ref.safelyWrite(func() error { + // Has it been opened already? + openFlags, opened := ref.OpenFlags() + if !opened { + return syscall.EINVAL + } + + // Can it be written? Check permissions. + if openFlags&OpenFlagsModeMask == ReadOnly { + return syscall.EBADF + } + + // We don't allow allocate on files that have been deleted. + if ref.isDeleted() { + return syscall.EINVAL + } + + return ref.file.Allocate(t.Mode, t.Offset, t.Length) + }); err != nil { + return newErr(err) + } + + return &Rallocate{} +} + // handle implements handler.handle. func (t *Txattrwalk) handle(cs *connState) message { // Lookup the FID. 
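Tallocate is gated on a protocol version bump, so older gofers never see the new message. A reduced sketch of the client-side check added in p9/client_file.go and p9/version.go (the file type below is a placeholder for the real clientFile, and no I/O is performed):

package main

import (
	"fmt"
	"syscall"
)

// file models just enough of the p9 client surface to show the pattern: the
// Allocate call is refused locally unless the negotiated protocol version is
// new enough to carry Tallocate.
type file struct {
	version uint32
	closed  bool
}

// versionSupportsTallocate mirrors the gate in p9/version.go: the Allocate
// extension rides on version 7 of 9P2000.L.Google.
func versionSupportsTallocate(v uint32) bool { return v >= 7 }

func (f *file) Allocate(offset, length uint64) error {
	if f.closed {
		return syscall.EBADF
	}
	if !versionSupportsTallocate(f.version) {
		return syscall.EOPNOTSUPP
	}
	// A real client would send a Tallocate message here and wait for Rallocate.
	return nil
}

func main() {
	old := &file{version: 6}
	cur := &file{version: 7}
	fmt.Println("v6:", old.Allocate(0, 4096)) // operation not supported
	fmt.Println("v7:", cur.Allocate(0, 4096)) // <nil>
}

On the server side, the handler above applies the matching guards: the FID must already be open for writing and not deleted before Allocate is forwarded to the backing file.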
diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index f4077a9d4..d49d94550 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -323,6 +323,11 @@ func (l *local) Renamed(parent p9.File, newName string) { l.path = path.Join(parent.(*local).path, newName) } +// Allocate implements p9.File.Allocate. +func (l *local) Allocate(mode p9.AllocateMode, offset, length uint64) error { + return syscall.Fallocate(int(l.file.Fd()), mode.ToLinux(), int64(offset), int64(length)) +} + func main() { log.SetLevel(log.Debug) diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index 3c7898cc1..703753c31 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1424,6 +1424,63 @@ func (r *Rsetattr) String() string { return fmt.Sprintf("Rsetattr{}") } +// Tallocate is an allocate request. This is an extension to 9P protocol, not +// present in the 9P2000.L standard. +type Tallocate struct { + FID FID + Mode AllocateMode + Offset uint64 + Length uint64 +} + +// Decode implements encoder.Decode. +func (t *Tallocate) Decode(b *buffer) { + t.FID = b.ReadFID() + t.Mode.Decode(b) + t.Offset = b.Read64() + t.Length = b.Read64() +} + +// Encode implements encoder.Encode. +func (t *Tallocate) Encode(b *buffer) { + b.WriteFID(t.FID) + t.Mode.Encode(b) + b.Write64(t.Offset) + b.Write64(t.Length) +} + +// Type implements message.Type. +func (*Tallocate) Type() MsgType { + return MsgTallocate +} + +// String implements fmt.Stringer. +func (t *Tallocate) String() string { + return fmt.Sprintf("Tallocate{FID: %d, Offset: %d, Length: %d}", t.FID, t.Offset, t.Length) +} + +// Rallocate is an allocate response. +type Rallocate struct { +} + +// Decode implements encoder.Decode. +func (*Rallocate) Decode(b *buffer) { +} + +// Encode implements encoder.Encode. +func (*Rallocate) Encode(b *buffer) { +} + +// Type implements message.Type. +func (*Rallocate) Type() MsgType { + return MsgRallocate +} + +// String implements fmt.Stringer. +func (r *Rallocate) String() string { + return fmt.Sprintf("Rallocate{}") +} + // Txattrwalk walks extended attributes. type Txattrwalk struct { // FID is the FID to check for attributes. @@ -2297,4 +2354,6 @@ func init() { msgRegistry.register(MsgRusymlink, func() message { return &Rusymlink{} }) msgRegistry.register(MsgTlconnect, func() message { return &Tlconnect{} }) msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} }) + msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} }) + msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} }) } diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 78c7d3f86..4039862e6 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -22,6 +22,8 @@ import ( "strings" "sync/atomic" "syscall" + + "golang.org/x/sys/unix" ) // OpenFlags is the mode passed to Open and Create operations. @@ -374,6 +376,8 @@ const ( MsgRusymlink = 135 MsgTlconnect = 136 MsgRlconnect = 137 + MsgTallocate = 138 + MsgRallocate = 139 ) // QIDType represents the file type for QIDs. @@ -1058,3 +1062,80 @@ func (d *Dirent) Encode(b *buffer) { b.WriteQIDType(d.Type) b.WriteString(d.Name) } + +// AllocateMode are possible modes to p9.File.Allocate(). +type AllocateMode struct { + KeepSize bool + PunchHole bool + NoHideStale bool + CollapseRange bool + ZeroRange bool + InsertRange bool + Unshare bool +} + +// ToLinux converts to a value compatible with fallocate(2)'s mode. 
+func (a *AllocateMode) ToLinux() uint32 { + rv := uint32(0) + if a.KeepSize { + rv |= unix.FALLOC_FL_KEEP_SIZE + } + if a.PunchHole { + rv |= unix.FALLOC_FL_PUNCH_HOLE + } + if a.NoHideStale { + rv |= unix.FALLOC_FL_NO_HIDE_STALE + } + if a.CollapseRange { + rv |= unix.FALLOC_FL_COLLAPSE_RANGE + } + if a.ZeroRange { + rv |= unix.FALLOC_FL_ZERO_RANGE + } + if a.InsertRange { + rv |= unix.FALLOC_FL_INSERT_RANGE + } + if a.Unshare { + rv |= unix.FALLOC_FL_UNSHARE_RANGE + } + return rv +} + +// Decode implements encoder.Decode. +func (a *AllocateMode) Decode(b *buffer) { + mask := b.Read32() + a.KeepSize = mask&0x01 != 0 + a.PunchHole = mask&0x02 != 0 + a.NoHideStale = mask&0x04 != 0 + a.CollapseRange = mask&0x08 != 0 + a.ZeroRange = mask&0x10 != 0 + a.InsertRange = mask&0x20 != 0 + a.Unshare = mask&0x40 != 0 +} + +// Encode implements encoder.Encode. +func (a *AllocateMode) Encode(b *buffer) { + mask := uint32(0) + if a.KeepSize { + mask |= 0x01 + } + if a.PunchHole { + mask |= 0x02 + } + if a.NoHideStale { + mask |= 0x04 + } + if a.CollapseRange { + mask |= 0x08 + } + if a.ZeroRange { + mask |= 0x10 + } + if a.InsertRange { + mask |= 0x20 + } + if a.Unshare { + mask |= 0x40 + } + b.Write32(mask) +} diff --git a/pkg/p9/version.go b/pkg/p9/version.go index a36a499a1..c2a2885ae 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -26,7 +26,7 @@ const ( // // Clients are expected to start requesting this version number and // to continuously decrement it until a Tversion request succeeds. - highestSupportedVersion uint32 = 6 + highestSupportedVersion uint32 = 7 // lowestSupportedVersion is the lowest supported version X in a // version string of the format 9P2000.L.Google.X. @@ -143,3 +143,8 @@ func VersionSupportsAnonymous(v uint32) bool { func VersionSupportsMultiUser(v uint32) bool { return v >= 6 } + +// versionSupportsTallocate returns true if version v supports Allocate(). 
+func versionSupportsTallocate(v uint32) bool { + return v >= 7 +} diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index 5e005bc2e..22e1530e9 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -29,6 +29,7 @@ import ( type Device struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index acbbd5466..a992253e6 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -46,6 +46,7 @@ const ( type Device struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 6b11afa44..17d68b5c4 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -30,6 +30,7 @@ import ( type fullDevice struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` @@ -59,7 +60,6 @@ func (f *fullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type fullFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` @@ -69,6 +69,7 @@ type fullFileOperations struct { fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` readZeros `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*fullFileOperations)(nil) diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 069212b6d..ee13183c8 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -29,6 +29,7 @@ import ( type nullDevice struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` @@ -60,17 +61,17 @@ func (n *nullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type nullFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRead `state:"nosave"` - fsutil.FileNoopWrite `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*nullFileOperations)(nil) @@ -101,16 +102,16 @@ func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // +stateify savable type zeroFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` + 
fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopWrite `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` readZeros `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*zeroFileOperations)(nil) diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index de0f3e5e5..b0a412382 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -29,6 +29,7 @@ import ( type randomDevice struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` @@ -57,16 +58,16 @@ func (*randomDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type randomFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*randomFileOperations)(nil) diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 01098675d..44f43b965 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -113,5 +113,6 @@ go_test( "//pkg/sentry/memmap", "//pkg/sentry/safemem", "//pkg/sentry/usermem", + "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 28686f3b3..ad0518b8f 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -149,7 +149,7 @@ func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { } // Invalidate COW mappings that may exist beyond the new size in case the file - // is being shrunk. Other mappinsg don't need to be invalidated because + // is being shrunk. Other mappings don't need to be invalidated because // translate will just return identical mappings after invalidation anyway, // and SIGBUS will be raised and handled when the mappings are touched. // @@ -167,6 +167,14 @@ func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { return nil } +// Allocate reserves space in the backing file. +func (h *HostMappable) Allocate(ctx context.Context, offset int64, length int64) error { + h.truncateMu.RLock() + err := h.backingFile.Allocate(ctx, offset, length) + h.truncateMu.RUnlock() + return err +} + // Write writes to the file backing this mappable. 
func (h *HostMappable) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { h.truncateMu.RLock() diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index b6366d906..151be1d0d 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -34,6 +34,7 @@ type SimpleFileInode struct { InodeNoExtendedAttributes `state:"nosave"` InodeNoopRelease `state:"nosave"` InodeNoopWriteOut `state:"nosave"` + InodeNotAllocatable `state:"nosave"` InodeNotDirectory `state:"nosave"` InodeNotMappable `state:"nosave"` InodeNotOpenable `state:"nosave"` @@ -61,6 +62,7 @@ type NoReadWriteFileInode struct { InodeNoExtendedAttributes `state:"nosave"` InodeNoopRelease `state:"nosave"` InodeNoopWriteOut `state:"nosave"` + InodeNotAllocatable `state:"nosave"` InodeNotDirectory `state:"nosave"` InodeNotMappable `state:"nosave"` InodeNotSocket `state:"nosave"` @@ -465,3 +467,26 @@ func (InodeDenyWriteChecker) Check(ctx context.Context, inode *fs.Inode, p fs.Pe } return fs.ContextCanAccessFile(ctx, inode, p) } + +//InodeNotAllocatable can be used by Inodes that do not support Allocate(). +type InodeNotAllocatable struct{} + +func (InodeNotAllocatable) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EOPNOTSUPP +} + +// InodeNoopAllocate implements fs.InodeOperations.Allocate as a noop. +type InodeNoopAllocate struct{} + +// Allocate implements fs.InodeOperations.Allocate. +func (InodeNoopAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return nil +} + +// InodeIsDirAllocate implements fs.InodeOperations.Allocate for directories. +type InodeIsDirAllocate struct{} + +// Allocate implements fs.InodeOperations.Allocate. +func (InodeIsDirAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EISDIR +} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 76644e69d..03cad37f3 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -135,6 +135,10 @@ type CachedFileObject interface { // the file was opened. SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error + // Allocate allows the caller to reserve disk space for the inode. + // It's equivalent to fallocate(2) with 'mode=0'. + Allocate(ctx context.Context, offset int64, length int64) error + // Sync instructs the remote filesystem to sync the file to stable storage. Sync(ctx context.Context) error @@ -336,6 +340,30 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, return nil } +// Allocate implements fs.InodeOperations.Allocate. +func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length int64) error { + newSize := offset + length + + // c.attr.Size is protected by both c.attrMu and c.dataMu. + c.attrMu.Lock() + defer c.attrMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + + if newSize <= c.attr.Size { + return nil + } + + now := ktime.NowFromContext(ctx) + if err := c.backingFile.Allocate(ctx, offset, length); err != nil { + return err + } + + c.attr.Size = newSize + c.touchModificationTimeLocked(now) + return nil +} + // WriteOut implements fs.InodeOperations.WriteOut. 
func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { c.attrMu.Lock() diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 3f10efc12..be3d4b6fc 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -26,6 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) type noopBackingFile struct{} @@ -50,6 +51,10 @@ func (noopBackingFile) FD() int { return -1 } +func (noopBackingFile) Allocate(ctx context.Context, offset int64, length int64) error { + return nil +} + func TestSetPermissions(t *testing.T) { ctx := contexttest.Context(t) @@ -237,6 +242,10 @@ func (*sliceBackingFile) FD() int { return -1 } +func (f *sliceBackingFile) Allocate(ctx context.Context, offset int64, length int64) error { + return syserror.EOPNOTSUPP +} + type noopMappingSpace struct{} // Invalidate implements memmap.MappingSpace.Invalidate. diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 842a34af8..be53ac4d9 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -59,6 +59,13 @@ func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9 return err } +func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { + ctx.UninterruptibleSleepStart(false) + err := c.file.Allocate(mode, offset, length) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (c *contextFile) rename(ctx context.Context, directory contextFile, name string) error { ctx.UninterruptibleSleepStart(false) err := c.file.Rename(directory.file, name) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index f6f20844d..dcb3b2880 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -322,6 +322,15 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err return unstable(ctx, valid, pattr, i.s.mounter, i.s.client), nil } +func (i *inodeFileState) Allocate(ctx context.Context, offset, length int64) error { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + + // No options are supported for now. + mode := p9.AllocateMode{} + return i.writeHandles.File.allocate(ctx, mode, uint64(offset), uint64(length)) +} + // session extracts the gofer's session from the MountSource. func (i *inodeOperations) session() *session { return i.fileState.s @@ -498,6 +507,21 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) } +// Allocate implements fs.InodeOperations.Allocate. +func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { + // This can only be called for files anyway. + if i.session().cachePolicy.useCachingInodeOps(inode) { + return i.cachingInodeOps.Allocate(ctx, offset, length) + } + if i.session().cachePolicy == cacheRemoteRevalidating { + return i.fileState.hostMappable.Allocate(ctx, offset, length) + } + + // No options are supported for now. + mode := p9.AllocateMode{} + return i.fileState.file.allocate(ctx, mode, uint64(offset), uint64(length)) +} + // WriteOut implements fs.InodeOperations.WriteOut. 
func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { if !i.session().cachePolicy.cacheUAttrs(inode) { diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 20e077f77..d36ac9a87 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -163,6 +163,11 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err return unstableAttr(i.mops, &s), nil } +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error { + return syscall.Fallocate(i.FD(), 0, offset, length) +} + // inodeOperations implements fs.InodeOperations. var _ fs.InodeOperations = (*inodeOperations)(nil) @@ -397,6 +402,19 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size in return i.cachingInodeOps.Truncate(ctx, inode, size) } +// Allocate implements fs.InodeOperations.Allocate. +func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { + // Is the file not memory-mappable? + if !canMap(inode) { + // Then just send the call to the FD, the host will synchronize the metadata + // update with any host inode and page cache. + return i.fileState.Allocate(ctx, offset, length) + } + // Otherwise we need to go through cachingInodeOps, even if the host page + // cache is in use, to invalidate private copies of truncated pages. + return i.cachingInodeOps.Allocate(ctx, offset, length) +} + // WriteOut implements fs.InodeOperations.WriteOut. func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { // Have we been using host kernel metadata caches? diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d764ef93d..22f316daf 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -340,6 +340,13 @@ func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { return i.InodeOperations.Truncate(ctx, i, size) } +func (i *Inode) Allocate(ctx context.Context, d *Dirent, offset int64, length int64) error { + if i.overlay != nil { + return overlayAllocate(ctx, i.overlay, d, offset, length) + } + return i.InodeOperations.Allocate(ctx, i, offset, length) +} + // Readlink calls i.InodeOperations.Readlnk with i as the Inode. func (i *Inode) Readlink(ctx context.Context) (string, error) { if i.overlay != nil { diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index ac287e1e4..abafe4791 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -223,6 +223,10 @@ type InodeOperations interface { // Implementations need not check that length >= 0. Truncate(ctx context.Context, inode *Inode, size int64) error + // Allocate allows the caller to reserve disk space for the inode. + // It's equivalent to fallocate(2) with 'mode=0'. + Allocate(ctx context.Context, inode *Inode, offset int64, length int64) error + // WriteOut writes cached Inode state to a backing filesystem in a // synchronous manner. 
// diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 3d015328e..ead487097 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -582,6 +582,13 @@ func overlayTruncate(ctx context.Context, o *overlayEntry, d *Dirent, size int64 return o.upper.InodeOperations.Truncate(ctx, o.upper, size) } +func overlayAllocate(ctx context.Context, o *overlayEntry, d *Dirent, offset, length int64) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.Allocate(ctx, o.upper, offset, length) +} + func overlayReadlink(ctx context.Context, o *overlayEntry) (string, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 66b3da2d0..52ce1d29e 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -422,6 +422,7 @@ type inode struct { fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index cf359a1f1..a71144b2c 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -150,6 +150,11 @@ func (n *MockInodeOperations) Truncate(ctx context.Context, inode *Inode, size i return nil } +// Allocate implements fs.InodeOperations.Allocate. +func (n *MockInodeOperations) Allocate(ctx context.Context, inode *Inode, offset, length int64) error { + return nil +} + // Remove implements fs.InodeOperations.Remove. func (n *MockInodeOperations) Remove(context.Context, *Inode, string) error { return nil diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index b03807043..379569823 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -55,6 +55,7 @@ func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) ( type staticFileInodeOps struct { fsutil.InodeDenyWriteChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 10ea1f55d..6b0ae9e60 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -93,6 +93,7 @@ type SeqFile struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` @@ -183,7 +184,6 @@ func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { // // +stateify savable type seqFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` @@ -192,6 +192,7 @@ type seqFileOperations struct { fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` seqFile *SeqFile } diff --git a/pkg/sentry/fs/proc/uid_gid_map.go 
b/pkg/sentry/fs/proc/uid_gid_map.go index d649da0f1..5df3cee13 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -38,6 +38,7 @@ type idMapInodeOperations struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` @@ -81,7 +82,6 @@ func (imio *idMapInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent // +stateify savable type idMapFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` @@ -90,6 +90,7 @@ type idMapFileOperations struct { fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` iops *idMapInodeOperations } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index a6b6a5c33..eb98b59cc 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -50,6 +50,7 @@ type CreateOps struct { // +stateify savable type Dir struct { fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirAllocate `state:"nosave"` fsutil.InodeIsDirTruncate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 9406a07ca..a7cb1bb86 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -30,6 +30,7 @@ type Socket struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` @@ -67,7 +68,6 @@ func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl // +stateify savable type socketFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` @@ -78,6 +78,7 @@ type socketFileOperations struct { fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*socketFileOperations)(nil) diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index f7835fe05..dd2585b02 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -29,10 +29,11 @@ type Symlink struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` - fsutil.InodeNotTruncatable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` fsutil.InodeVirtual `state:"nosave"` fsutil.InodeSimpleAttributes @@ -88,7 +89,6 @@ func (s *Symlink) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileF // +stateify savable type symlinkFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` 
fsutil.FileNoopFlush `state:"nosave"` @@ -99,6 +99,7 @@ type symlinkFileOperations struct { fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*symlinkFileOperations)(nil) diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index db91de435..bacc93af8 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -30,12 +30,13 @@ type cpunum struct { fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeSimpleAttributes fsutil.InodeStaticFileGetter diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index f89d86c83..c90062a22 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -259,6 +259,33 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in return nil } +// Allocate implements fs.InodeOperations.Allocate. +func (f *fileInodeOperations) Allocate(ctx context.Context, _ *fs.Inode, offset, length int64) error { + newSize := offset + length + + f.attrMu.Lock() + defer f.attrMu.Unlock() + f.dataMu.Lock() + defer f.dataMu.Unlock() + + if newSize <= f.attr.Size { + return nil + } + + // Check if current seals allow growth. + if f.seals&linux.F_SEAL_GROW != 0 { + return syserror.EPERM + } + + f.attr.Size = newSize + + now := ktime.NowFromContext(ctx) + f.attr.ModificationTime = now + f.attr.StatusChangeTime = now + + return nil +} + // AddLink implements fs.InodeOperations.AddLink. func (f *fileInodeOperations) AddLink() { f.attrMu.Lock() diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 832914453..6ad5c5adb 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -242,11 +242,16 @@ func (d *Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, n return rename(ctx, oldParent, oldName, newParent, newName, replacement) } -// StatFS implments fs.InodeOperations.StatFS. +// StatFS implements fs.InodeOperations.StatFS. func (*Dir) StatFS(context.Context) (fs.Info, error) { return fsInfo, nil } +// Allocate implements fs.InodeOperations.Allocate. +func (d *Dir) Allocate(ctx context.Context, node *fs.Inode, offset, length int64) error { + return d.ramfsDir.Allocate(ctx, node, offset, length) +} + // Symlink is a symlink. // // +stateify savable @@ -281,6 +286,7 @@ func (s *Symlink) StatFS(context.Context) (fs.Info, error) { type Socket struct { ramfs.Socket fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` } // NewSocket returns a new socket with the provided permissions. 
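Most of the per-filesystem churn above is one-line embeddings of the new fsutil mixins, which give every InodeOperations implementation a default fallocate behavior: InodeNotAllocatable returns EOPNOTSUPP, InodeNoopAllocate succeeds without doing anything, and InodeIsDirAllocate returns EISDIR. A toy sketch of that embedding pattern, with the interface and mixin shapes simplified (the real methods also take a context and an *fs.Inode):

package main

import (
	"fmt"
	"syscall"
)

// allocator is a cut-down stand-in for the Allocate method on
// fs.InodeOperations.
type allocator interface {
	Allocate(offset, length int64) error
}

// inodeNotAllocatable rejects fallocate, like fsutil.InodeNotAllocatable.
type inodeNotAllocatable struct{}

func (inodeNotAllocatable) Allocate(_, _ int64) error { return syscall.EOPNOTSUPP }

// inodeNoopAllocate accepts and ignores fallocate, like fsutil.InodeNoopAllocate.
type inodeNoopAllocate struct{}

func (inodeNoopAllocate) Allocate(_, _ int64) error { return nil }

// inodeIsDirAllocate refuses with EISDIR, like fsutil.InodeIsDirAllocate.
type inodeIsDirAllocate struct{}

func (inodeIsDirAllocate) Allocate(_, _ int64) error { return syscall.EISDIR }

// Each inode type picks its behavior simply by embedding the matching mixin.
type symlinkInode struct{ inodeNotAllocatable }
type nullDeviceInode struct{ inodeNoopAllocate }
type directoryInode struct{ inodeIsDirAllocate }

func main() {
	for _, in := range []allocator{symlinkInode{}, nullDeviceInode{}, directoryInode{}} {
		fmt.Printf("%T: %v\n", in, in.Allocate(0, 4096))
	}
}

Filesystems that can actually reserve space implement Allocate directly instead: tmpfs grows attr.Size (subject to F_SEAL_GROW), while the gofer and host inodes forward the request down to fallocate(2) on the backing file.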
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 0fc777e67..8dc40e1f2 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -53,13 +53,14 @@ import ( // +stateify savable type dirInodeOperations struct { fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirAllocate `state:"nosave"` + fsutil.InodeIsDirTruncate `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotRenameable `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` fsutil.InodeVirtual `state:"nosave"` fsutil.InodeSimpleAttributes diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 99188dddf..7c3739360 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -191,3 +191,7 @@ func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { *wakeupChan = nil } } + +func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EPIPE +} diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 893322647..1764bb4b6 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1900,9 +1900,9 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Fallocate implements linux system call fallocate(2). -// (well, not really, but at least we return the expected error codes) func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) + mode := args[1].Int64() offset := args[2].Int64() length := args[3].Int64() @@ -1915,8 +1915,42 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL } + if mode != 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOTSUP + } + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + if fs.IsPipe(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ESPIPE + } + if fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EISDIR + } + if !fs.IsRegular(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ENODEV + } + size := offset + length + if size < 0 { + return 0, nil, syserror.EFBIG + } + if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(syscall.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil { + return 0, nil, err + } + + // File length modified, generate notification. + file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) - return 0, nil, syserror.EOPNOTSUPP + return 0, nil, nil } // Flock implements linux syscall flock(2). diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index c1b33c551..c369e4d64 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -99,7 +99,7 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { // args: cmd, ... tr = newArgsTracker(0) - case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX: + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE: // args: fd/addr, cmd, ... 
tr = newArgsTracker(1) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index a1ad49fb2..4faab2946 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -62,8 +62,14 @@ var allowedSyscalls = seccomp.SyscallRules{ }, syscall.SYS_EXIT: {}, syscall.SYS_EXIT_GROUP: {}, - syscall.SYS_FCHMOD: {}, - syscall.SYS_FCHOWNAT: {}, + syscall.SYS_FALLOCATE: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCHOWNAT: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { seccomp.AllowAny{}, diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 3a0806837..b185015b6 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -731,6 +731,18 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { return err } +// Allocate implements p9.File. +func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error { + if !l.isOpen() { + return syscall.EBADF + } + + if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil { + return extractErrno(err) + } + return nil +} + // Rename implements p9.File; this should never be called. func (l *localFile) Rename(p9.File, string) error { panic("rename called directly") diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index d99733fc9..7ff4e4883 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -649,6 +649,8 @@ cc_binary( srcs = ["fallocate.cc"], linkstatic = 1, deps = [ + ":file_base", + "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index 61b8acc7a..1c3d00287 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -12,45 +12,130 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include +#include +#include +#include +#include +#include #include #include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" namespace gvisor { namespace testing { - namespace { -// These tests are very rudimentary because fallocate is not -// implemented. We just want to make sure the expected error codes are -// returned. +int fallocate(int fd, int mode, off_t offset, off_t len) { + return syscall(__NR_fallocate, fd, mode, offset, len); +} + +class AllocateTest : public FileTest { + void SetUp() override { FileTest::SetUp(); } +}; + +TEST_F(AllocateTest, Fallocate) { + // Check that it starts at size zero. + struct stat buf; + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); + + // Grow to ten bytes. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 10); -TEST(FallocateTest, NotImplemented) { - auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); + // Allocate to a smaller size should be noop. 
+ EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 10); - // Test that a completely unassigned fallocate mode returns EOPNOTSUPP. - ASSERT_THAT(fallocate(fd.get(), 0x80, 0, 32768), - SyscallFailsWithErrno(EOPNOTSUPP)); + // Grow again. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 20); + + // Grow with offset. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 30); + + // Grow with offset beyond EOF. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 40); } -TEST(FallocateTest, BadOffset) { - auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); - ASSERT_THAT(fallocate(fd.get(), 0, -1, 32768), SyscallFailsWithErrno(EINVAL)); +TEST_F(AllocateTest, FallocateInvalid) { + // Invalid FD + EXPECT_THAT(fallocate(-1, 0, 0, 10), SyscallFailsWithErrno(EBADF)); + + // Negative offset and size. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, -1, 10), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, -1), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, -1, -1), + SyscallFailsWithErrno(EINVAL)); } -TEST(FallocateTest, BadLength) { - auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); - ASSERT_THAT(fallocate(fd.get(), 0, 0, -1), SyscallFailsWithErrno(EINVAL)); +TEST_F(AllocateTest, FallocateReadonly) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + EXPECT_THAT(fallocate(fd.get(), 0, 0, 10), SyscallFailsWithErrno(EBADF)); } -} // namespace +TEST_F(AllocateTest, FallocatePipe) { + int pipes[2]; + EXPECT_THAT(pipe(pipes), SyscallSucceeds()); + auto cleanup = Cleanup([&pipes] { + EXPECT_THAT(close(pipes[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipes[1]), SyscallSucceeds()); + }); + + EXPECT_THAT(fallocate(pipes[1], 0, 0, 10), SyscallFailsWithErrno(ESPIPE)); +} + +TEST_F(AllocateTest, FallocateChar) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDWR)); + EXPECT_THAT(fallocate(fd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); +} + +TEST_F(AllocateTest, FallocateRlimit) { + // Get the current rlimit and restore after test run. + struct rlimit initial_lim; + ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + auto cleanup = Cleanup([&initial_lim] { + EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + }); + + // Try growing past the file size limit. 
+ sigset_t new_mask; + sigemptyset(&new_mask); + sigaddset(&new_mask, SIGXFSZ); + sigprocmask(SIG_BLOCK, &new_mask, nullptr); + struct rlimit setlim = {}; + setlim.rlim_cur = 1024; + setlim.rlim_max = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds()); + + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 1025), + SyscallFailsWithErrno(EFBIG)); + + struct timespec timelimit = {}; + timelimit.tv_sec = 10; + EXPECT_EQ(sigtimedwait(&new_mask, nullptr, &timelimit), SIGXFSZ); + ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds()); +} + +} // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 9f2b12c624a4a07c6662d1a5f1bced28b6eb86da Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 13 May 2019 00:49:32 -0700 Subject: gvisor/runsc/tests: set timeout for http.Get() WaitForHTTP tries GET requests on a port until the call succeeds or timeout. But we want to be sure that one of our attempts will not stuck for the whole timeout. All timeouts are increased to 30 seconds, because test cases with smaller timeouts fail sometimes even for the native container runtime (runc). PiperOrigin-RevId: 247888467 Change-Id: I03cfd3275286bc686a78fd26da43231d20667851 --- kokoro/run_tests.sh | 1 + runsc/test/image/image_test.go | 41 +++++++++++++++++++++--------- runsc/test/integration/integration_test.go | 4 +-- runsc/test/testutil/testutil.go | 9 ++++++- 4 files changed, 40 insertions(+), 15 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index fbe353a1e..aa88b5cbf 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -150,6 +150,7 @@ run_docker_tests() { bazel test \ "${BAZEL_BUILD_FLAGS[@]}" \ --test_env=RUNSC_RUNTIME="${RUNTIME}${v}" \ + --test_output=all \ //runsc/test/image:image_test \ //runsc/test/integration:integration_test done diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 8322dd001..b969731b0 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -24,6 +24,7 @@ package image import ( "fmt" "io/ioutil" + "log" "net/http" "os" "path/filepath" @@ -46,7 +47,7 @@ func TestHelloWorld(t *testing.T) { } } -func testHTTPServer(port int) error { +func runHTTPRequest(port int) error { url := fmt.Sprintf("http://localhost:%d/not-found", port) resp, err := http.Get(url) if err != nil { @@ -78,6 +79,26 @@ func testHTTPServer(port int) error { return nil } +func testHTTPServer(t *testing.T, port int) { + const requests = 10 + ch := make(chan error, requests) + for i := 0; i < requests; i++ { + go func() { + start := time.Now() + err := runHTTPRequest(port) + log.Printf("Response time %v: %v", time.Since(start).String(), err) + ch <- err + }() + } + + for i := 0; i < requests; i++ { + err := <-ch + if err != nil { + t.Errorf("testHTTPServer(%d) failed: %v", port, err) + } + } +} + func TestHttpd(t *testing.T) { if err := testutil.Pull("httpd"); err != nil { t.Fatalf("docker pull failed: %v", err) @@ -103,13 +124,11 @@ func TestHttpd(t *testing.T) { } // Wait until it's up and running. 
- if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + t.Errorf("WaitForHTTP() timeout: %v", err) } - if err := testHTTPServer(port); err != nil { - t.Fatalf("testHTTPServer(%d) failed: %v", port, err) - } + testHTTPServer(t, port) } func TestNginx(t *testing.T) { @@ -137,13 +156,11 @@ func TestNginx(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + t.Errorf("WaitForHTTP() timeout: %v", err) } - if err := testHTTPServer(port); err != nil { - t.Fatalf("testHTTPServer(%d) failed: %v", port, err) - } + testHTTPServer(t, port) } func TestMysql(t *testing.T) { @@ -240,7 +257,7 @@ func TestTomcat(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index de17dd3c2..c51cab3ae 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -68,7 +68,7 @@ func TestLifeCycle(t *testing.T) { if err != nil { t.Fatal("docker.FindPort(80) failed: ", err) } - if err := testutil.WaitForHTTP(port, 10*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { t.Fatal("WaitForHTTP() timeout:", err) } client := http.Client{Timeout: time.Duration(2 * time.Second)} @@ -138,7 +138,7 @@ func TestPauseResume(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 20*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { t.Fatal("WaitForHTTP() timeout:", err) } diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 6a4c045a8..9efb1ba8e 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -23,6 +23,7 @@ import ( "fmt" "io" "io/ioutil" + "log" "math/rand" "net/http" "os" @@ -266,8 +267,14 @@ func Poll(cb func() error, timeout time.Duration) error { // WaitForHTTP tries GET requests on a port until the call succeeds or timeout. func WaitForHTTP(port int, timeout time.Duration) error { cb := func() error { - resp, err := http.Get(fmt.Sprintf("http://localhost:%d/", port)) + c := &http.Client{ + // Calculate timeout to be able to do minimum 5 attempts. + Timeout: timeout / 5, + } + url := fmt.Sprintf("http://localhost:%d/", port) + resp, err := c.Get(url) if err != nil { + log.Printf("Waiting %s: %v", url, err) return err } resp.Body.Close() -- cgit v1.2.3 From fff21b99e45136510a0148eada57ff28966dc27e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 14 May 2019 16:00:53 -0700 Subject: kokoro: run tests with a default docker container runtime We want to know that our environment set up properly and docker tests pass with a native runtime. 
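The default-runtime run works by passing an explicitly empty RUNSC_RUNTIME, which is presumably why getRuntime below moves from os.Getenv to os.LookupEnv: Getenv cannot distinguish "unset" from "set to empty". A small standalone sketch of that distinction (the helper name and default value are illustrative, not taken from this change):

package main

import (
	"fmt"
	"os"
)

// runtimeFor falls back to a test default only when the variable is completely
// unset, so an explicitly empty value can mean "use docker's default runtime".
func runtimeFor(def string) string {
	r, ok := os.LookupEnv("RUNSC_RUNTIME")
	if !ok {
		return def // not set at all
	}
	return r // may legitimately be ""
}

func main() {
	os.Unsetenv("RUNSC_RUNTIME")
	fmt.Printf("unset:  %q\n", runtimeFor("runsc-test"))

	os.Setenv("RUNSC_RUNTIME", "")
	fmt.Printf("empty:  %q\n", runtimeFor("runsc-test"))

	os.Setenv("RUNSC_RUNTIME", "runsc-kvm")
	fmt.Printf("custom: %q\n", runtimeFor("runsc-test"))
}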
PiperOrigin-RevId: 248229294 Change-Id: I06c221e5eeed6e01bdd1aa935333c57e8eadc498 --- kokoro/run_tests.sh | 11 +++++++++++ runsc/test/testutil/docker.go | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index aa88b5cbf..9d630362e 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -142,10 +142,21 @@ EOF run_docker_tests() { cd ${WORKSPACE_DIR} + # Run tests with a default runtime (runc). + bazel test \ + "${BAZEL_BUILD_FLAGS[@]}" \ + --test_env=RUNSC_RUNTIME="" \ + --test_output=all \ + //runsc/test/image:image_test + # These names are used to exclude tests not supported in certain # configuration, e.g. save/restore not supported with hostnet. declare -a variations=("" "-kvm" "-hostnet" "-overlay") for v in "${variations[@]}"; do + # FIXME(b/132073574): we need to flush arp tables, otherwise tests fail with + # timeout. + sudo ip neigh show + sudo ip neigh flush dev docker0 # Run runsc tests with docker that are tagged manual. bazel test \ "${BAZEL_BUILD_FLAGS[@]}" \ diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index e103e930c..81f5a9ef0 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -31,8 +31,8 @@ import ( ) func getRuntime() string { - r := os.Getenv("RUNSC_RUNTIME") - if r == "" { + r, ok := os.LookupEnv("RUNSC_RUNTIME") + if !ok { return "runsc-test" } return r -- cgit v1.2.3 From 85380ff03d21da417ad74d28b293c768d7effb4f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 15 May 2019 11:10:56 -0700 Subject: gvisor/runsc: use a veth link address instead of generating a new one PiperOrigin-RevId: 248367340 Change-Id: Id792afcfff9c9d2cfd62cae21048316267b4a924 --- kokoro/run_tests.sh | 4 ---- runsc/boot/network.go | 24 +++++++----------------- runsc/sandbox/network.go | 1 + 3 files changed, 8 insertions(+), 21 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 9d630362e..b3f333f2f 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -153,10 +153,6 @@ run_docker_tests() { # configuration, e.g. save/restore not supported with hostnet. declare -a variations=("" "-kvm" "-hostnet" "-overlay") for v in "${variations[@]}"; do - # FIXME(b/132073574): we need to flush arp tables, otherwise tests fail with - # timeout. - sudo ip neigh show - sudo ip neigh flush dev docker0 # Run runsc tests with docker that are tagged manual. bazel test \ "${BAZEL_BUILD_FLAGS[@]}" \ diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 598ec969e..0a154d90b 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "math/rand" "net" "syscall" @@ -52,11 +51,12 @@ type DefaultRoute struct { // FDBasedLink configures an fd-based link. type FDBasedLink struct { - Name string - MTU int - Addresses []net.IP - Routes []Route - GSOMaxSize uint32 + Name string + MTU int + Addresses []net.IP + Routes []Route + GSOMaxSize uint32 + LinkAddress []byte } // LoopbackLink configures a loopback li nk. 
@@ -134,7 +134,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) } - mac := tcpip.LinkAddress(generateRndMac()) + mac := tcpip.LinkAddress(link.LinkAddress) linkEP, err := fdbased.New(&fdbased.Options{ FD: newFD, MTU: uint32(link.MTU), @@ -220,13 +220,3 @@ func ipToAddressMask(ip net.IP) tcpip.AddressMask { _, addr := ipToAddressAndProto(ip) return tcpip.AddressMask(addr) } - -// generateRndMac returns a random local MAC address. -// Copied from eth_random_addr() (include/linux/etherdevice.h) -func generateRndMac() net.HardwareAddr { - mac := make(net.HardwareAddr, 6) - rand.Read(mac) - mac[0] &^= 0x1 // clear multicast bit - mac[0] |= 0x2 // set local assignment bit (IEEE802) - return mac -} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 2a68d7043..0460d5f1a 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -246,6 +246,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO if err != nil { return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) } + link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr) if enableGSO { gso, err := isGSOEnabled(fd, iface.Name) -- cgit v1.2.3 From ecb0f00e10017e82698c326b4d83294c9e20dfbd Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 15 May 2019 14:35:30 -0700 Subject: Cleanup around urpc file payload handling urpc always closes all files once the RPC function returns. PiperOrigin-RevId: 248406857 Change-Id: I400a8562452ec75c8e4bddc2154948567d572950 --- runsc/boot/controller.go | 22 ++++++---------------- runsc/boot/loader.go | 35 ++++++++++++++++------------------- runsc/boot/loader_test.go | 1 - runsc/cmd/boot.go | 2 +- 4 files changed, 23 insertions(+), 37 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 86f06bff1..f09c1bd85 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -211,12 +211,6 @@ type StartArgs struct { func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { log.Debugf("containerManager.Start: %+v", args) - defer func() { - for _, f := range args.FilePayload.Files { - f.Close() - } - }() - // Validate arguments. if args == nil { return errors.New("start missing arguments") @@ -305,21 +299,19 @@ type RestoreOpts struct { func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") - var specFile *os.File - deviceFD := -1 + var specFile, deviceFile *os.File switch numFiles := len(o.FilePayload.Files); numFiles { case 2: - var err error // The device file is donated to the platform. // Can't take ownership away from os.File. dup them to get a new FD. 
- deviceFD, err = syscall.Dup(int(o.FilePayload.Files[1].Fd())) + fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } + deviceFile = os.NewFile(uintptr(fd), "platform device") fallthrough case 1: specFile = o.FilePayload.Files[0] - defer specFile.Close() case 0: return fmt.Errorf("at least one file must be passed to Restore") default: @@ -331,7 +323,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k.Pause() cm.l.k.Destroy() - p, err := createPlatform(cm.l.conf, deviceFD) + p, err := createPlatform(cm.l.conf, deviceFile) if err != nil { return fmt.Errorf("creating platform: %v", err) } @@ -357,7 +349,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { if eps, ok := networkStack.(*epsocket.Stack); ok { stack.StackFromEnv = eps.Stack // FIXME(b/36201077) } - info, err := o.FilePayload.Files[0].Stat() + info, err := specFile.Stat() if err != nil { return err } @@ -366,9 +358,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { } // Load the state. - loadOpts := state.LoadOpts{ - Source: o.FilePayload.Files[0], - } + loadOpts := state.LoadOpts{Source: specFile} if err := loadOpts.Load(k, networkStack); err != nil { return err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 05122a6a8..6ac6b94dd 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -152,8 +152,8 @@ type Args struct { Conf *Config // ControllerFD is the FD to the URPC controller. ControllerFD int - // DeviceFD is an optional argument that is passed to the platform. - DeviceFD int + // Device is an optional argument that is passed to the platform. + Device *os.File // GoferFDs is an array of FDs used to connect with the Gofer. GoferFDs []int // StdioFDs is the stdio for the application. @@ -183,7 +183,7 @@ func New(args Args) (*Loader, error) { } // Create kernel and platform. - p, err := createPlatform(args.Conf, args.DeviceFD) + p, err := createPlatform(args.Conf, args.Device) if err != nil { return nil, fmt.Errorf("creating platform: %v", err) } @@ -401,17 +401,17 @@ func (l *Loader) Destroy() { l.watchdog.Stop() } -func createPlatform(conf *Config, deviceFD int) (platform.Platform, error) { +func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { switch conf.Platform { case PlatformPtrace: log.Infof("Platform: ptrace") return ptrace.New() case PlatformKVM: log.Infof("Platform: kvm") - if deviceFD < 0 { - return nil, fmt.Errorf("kvm device FD must be provided") + if deviceFile == nil { + return nil, fmt.Errorf("kvm device file must be provided") } - return kvm.New(os.NewFile(uintptr(deviceFD), "kvm device")) + return kvm.New(deviceFile) default: return nil, fmt.Errorf("invalid platform %v", conf.Platform) } @@ -590,18 +590,22 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("creating new process: %v", err) } + // setupContainerFS() dups stdioFDs, so we don't need to dup them here. + var stdioFDs []int + for _, f := range files[:3] { + stdioFDs = append(stdioFDs, int(f.Fd())) + } + // Can't take ownership away from os.File. dup them to get a new FDs. 
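// Aside on the dup pattern above (illustrative, not part of this change): an
// *os.File keeps ownership of its descriptor. Fd() exposes the raw number, but
// the File will still close it on Close or when it is garbage collected, so a
// donated file is duplicated first and the copy, which the callee owns and may
// close independently, is either kept as a bare fd or wrapped via os.NewFile:
//
//	func donate(f *os.File) (*os.File, error) {
//		fd, err := syscall.Dup(int(f.Fd()))
//		if err != nil {
//			return nil, err
//		}
//		return os.NewFile(uintptr(fd), "donated "+f.Name()), nil
//	}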
- var ioFDs []int - for _, f := range files { + var goferFDs []int + for _, f := range files[3:] { fd, err := syscall.Dup(int(f.Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } - ioFDs = append(ioFDs, fd) + goferFDs = append(goferFDs, fd) } - stdioFDs := ioFDs[:3] - goferFDs := ioFDs[3:] if err := setupContainerFS( &procArgs, spec, @@ -616,13 +620,6 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("configuring container FS: %v", err) } - // setFileSystemForProcess dup'd stdioFDs, so we can close them. - for i, fd := range stdioFDs { - if err := syscall.Close(fd); err != nil { - return fmt.Errorf("closing stdio FD #%d: %v", i, fd) - } - } - ctx := procArgs.NewContext(l.k) mns := k.RootMountNamespace() if err := setExecutablePath(ctx, mns, &procArgs); err != nil { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 9a864ad3f..4603f751d 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -115,7 +115,6 @@ func createLoader() (*Loader, func(), error) { Spec: spec, Conf: conf, ControllerFD: fd, - DeviceFD: -1, GoferFDs: []int{sandEnd}, StdioFDs: stdio, } diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index ac937f7bc..3a547d4aa 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -213,7 +213,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Spec: spec, Conf: conf, ControllerFD: b.controllerFD, - DeviceFD: b.deviceFD, + Device: os.NewFile(uintptr(b.deviceFD), "platform device"), GoferFDs: b.ioFDs.GetArray(), StdioFDs: b.stdioFDs.GetArray(), Console: b.console, -- cgit v1.2.3 From fc9f7e3590492e3236e886a533974a786be47124 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Thu, 16 May 2019 16:19:34 -0700 Subject: tiny fix: avoid panicing when OpenSpec failed Signed-off-by: Liu Hua Change-Id: I11a4620394a10a7d92036b0341e0c21ad50bd122 PiperOrigin-RevId: 248621810 --- runsc/specutils/specutils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index c72207fb4..2888f55db 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -149,7 +149,7 @@ func OpenSpec(bundleDir string) (*os.File, error) { func ReadSpec(bundleDir string) (*specs.Spec, error) { specFile, err := OpenSpec(bundleDir) if err != nil { - return nil, fmt.Errorf("error opening spec file %q: %v", specFile.Name(), err) + return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) } defer specFile.Close() return ReadSpecFromFile(bundleDir, specFile) -- cgit v1.2.3 From 4a842836e560322bb3944b59ff43b9d60cc0f867 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 17 May 2019 13:46:18 -0700 Subject: Return EPERM for mknod This more directly matches what Linux does with unsupported nodes. PiperOrigin-RevId: 248780425 Change-Id: I17f3dd0b244f6dc4eb00e2e42344851b8367fbec --- pkg/sentry/fs/gofer/path.go | 4 ++-- pkg/sentry/fs/host/inode.go | 2 +- runsc/fsgofer/fsgofer.go | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index babfa4560..148e2f038 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -281,9 +281,9 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, } // CreateFifo implements fs.InodeOperations.CreateFifo. 
Gofer nodes do not support the -// creation of fifos and always returns EOPNOTSUPP. +// creation of fifos and always returns EPERM. func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syscall.EOPNOTSUPP + return syscall.EPERM } // Remove implements InodeOperations.Remove. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index ebf2154bc..7a230e426 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -287,7 +287,7 @@ func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, st // CreateFifo implements fs.InodeOperations.CreateFifo. func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.EOPNOTSUPP + return syserror.EPERM } // Remove implements fs.InodeOperations.Remove. diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b185015b6..2cf50290a 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -860,7 +860,10 @@ func (l *localFile) Link(target p9.File, newName string) error { // // Not implemented. func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { - return p9.QID{}, syscall.ENOSYS + // From mknod(2) man page: + // "EPERM: [...] if the filesystem containing pathname does not support + // the type of node requested." + return p9.QID{}, syscall.EPERM } // UnlinkAt implements p9.File. -- cgit v1.2.3 From 9006304dfecf3670ad03c9629f9a4ac3273c386a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 23 May 2019 04:15:18 -0700 Subject: Initial support for bind mounts Separate MountSource from Mount. This is needed to allow mounts to be shared by multiple containers within the same pod. PiperOrigin-RevId: 249617810 Change-Id: Id2944feb7e4194951f355cbe6d4944ae3c02e468 --- pkg/sentry/fs/mock.go | 1 - pkg/sentry/fs/mount.go | 73 +----------- pkg/sentry/fs/mount_test.go | 167 ++++++++++++++++++---------- pkg/sentry/fs/mounts.go | 258 +++++++++++++++++++++++++++++-------------- pkg/sentry/fs/proc/mounts.go | 48 ++++---- runsc/boot/fs.go | 16 +-- 6 files changed, 317 insertions(+), 246 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 064943c5b..ff04e9b22 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -62,7 +62,6 @@ func NewMockMountSource(cache *DirentCache) *MountSource { return &MountSource{ MountSourceOperations: &MockMountSourceOps{keep: keep}, fscache: cache, - children: make(map[*MountSource]struct{}), } } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 63fcf4380..41e0d285b 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -17,7 +17,6 @@ package fs import ( "bytes" "fmt" - "sync" "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/refs" @@ -89,15 +88,7 @@ func (i InodeMappings) String() string { // one mount source. Each file object may only be represented using one inode // object in a sentry instance. // -// This is an amalgamation of structs super_block, vfsmount, and mount, while -// MountSourceOperations is akin to struct super_operations. -// -// Hence, mount source also contains common mounted file system state, such as -// mount flags, the root Dirent, and children mounts. For now, this -// amalgamation implies that a mount source cannot be shared by multiple mounts -// (e.g. cannot be mounted at different locations). -// -// TODO(b/63601033): Move mount-specific information out of MountSource. 
+// TODO(b/63601033): Move Flags out of MountSource to Mount. // // +stateify savable type MountSource struct { @@ -128,22 +119,6 @@ type MountSource struct { // // direntRefs must be atomically changed. direntRefs uint64 - - // mu protects the fields below, which are set by the MountNamespace - // during MountSource/Unmount. - mu sync.Mutex `state:"nosave"` - - // id is a unique id for this mount. - id uint64 - - // root is the root Dirent of this mount. - root *Dirent - - // parent is the parent MountSource, or nil if this MountSource is the root. - parent *MountSource - - // children are the child MountSources of this MountSource. - children map[*MountSource]struct{} } // DefaultDirentCacheSize is the number of Dirents that the VFS can hold an @@ -162,53 +137,7 @@ func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags Mou Flags: flags, FilesystemType: fsType, fscache: NewDirentCache(DefaultDirentCacheSize), - children: make(map[*MountSource]struct{}), - } -} - -// Parent returns the parent mount, or nil if this mount is the root. -func (msrc *MountSource) Parent() *MountSource { - msrc.mu.Lock() - defer msrc.mu.Unlock() - return msrc.parent -} - -// ID returns the ID of this mount. -func (msrc *MountSource) ID() uint64 { - msrc.mu.Lock() - defer msrc.mu.Unlock() - return msrc.id -} - -// Children returns the (immediate) children of this MountSource. -func (msrc *MountSource) Children() []*MountSource { - msrc.mu.Lock() - defer msrc.mu.Unlock() - - ms := make([]*MountSource, 0, len(msrc.children)) - for c := range msrc.children { - ms = append(ms, c) } - return ms -} - -// Submounts returns all mounts that are descendants of this mount. -func (msrc *MountSource) Submounts() []*MountSource { - var ms []*MountSource - for _, c := range msrc.Children() { - ms = append(ms, c) - ms = append(ms, c.Submounts()...) - } - return ms -} - -// Root returns the root dirent of this mount. Callers must call DecRef on the -// returned dirent. -func (msrc *MountSource) Root() *Dirent { - msrc.mu.Lock() - defer msrc.mu.Unlock() - msrc.root.IncRef() - return msrc.root } // DirentRefs returns the current mount direntRefs. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 9f7fbeff2..2e2716643 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -32,6 +32,27 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { return false } +func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { + gotPaths := make(map[string]struct{}, len(got)) + gotStr := make([]string, len(got)) + for i, g := range got { + groot := g.Root() + name, _ := groot.FullName(root) + groot.DecRef() + gotStr[i] = name + gotPaths[name] = struct{}{} + } + if len(got) != len(want) { + return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want) + } + for _, w := range want { + if _, ok := gotPaths[w]; !ok { + return fmt.Errorf("no mount with path %q found", w) + } + } + return nil +} + // TestMountSourceOnlyCachedOnce tests that a Dirent that is mounted over only ends // up in a single Dirent Cache. NOTE(b/63848693): Having a dirent in multiple // caches causes major consistency issues. @@ -91,8 +112,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) { } } -// Test that mounts have proper parent/child relationships. 
-func TestMountSourceParentChildRelationship(t *testing.T) { +func TestAllMountsUnder(t *testing.T) { ctx := contexttest.Context(t) rootCache := NewDirentCache(100) @@ -122,101 +142,130 @@ func TestMountSourceParentChildRelationship(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", p, err) } + submountInode := NewMockInode(ctx, NewMockMountSource(nil), StableAttr{ Type: Directory, }) if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } + d.DecRef() } - // mm root should contain all submounts (and does not include the root - // mount). - allMountSources := rootDirent.Inode.MountSource.Submounts() - if err := mountPathsAre(rootDirent, allMountSources, paths...); err != nil { + // mm root should contain all submounts (and does not include the root mount). + rootMnt := mm.FindMount(rootDirent) + submounts := mm.AllMountsUnder(rootMnt) + allPaths := append(paths, "/") + if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { t.Error(err) } // Each mount should have a unique ID. foundIDs := make(map[uint64]struct{}) - for _, m := range allMountSources { - id := m.ID() - if _, ok := foundIDs[id]; ok { - t.Errorf("got multiple mounts with id %d", id) + for _, m := range submounts { + if _, ok := foundIDs[m.ID]; ok { + t.Errorf("got multiple mounts with id %d", m.ID) } - foundIDs[id] = struct{}{} + foundIDs[m.ID] = struct{}{} } // Root mount should have no parent. - rootMountSource := mm.root.Inode.MountSource - if p := rootMountSource.Parent(); p != nil { + if p := rootMnt.ParentID; p != invalidMountID { t.Errorf("root.Parent got %v wanted nil", p) } - // Root mount should have 2 children: foo and waldo. - rootChildren := rootMountSource.Children() - if err := mountPathsAre(rootDirent, rootChildren, "/foo", "/waldo"); err != nil { - t.Error(err) - } - // All root mount children should have root as parent. - for _, c := range rootChildren { - if p := c.Parent(); p != rootMountSource { - t.Errorf("root mount child got parent %+v, wanted root mount", p) - } - } - - // "foo" mount should have two children: /foo/bar, and /foo/qux. + // Check that "foo" mount has 3 children. maxTraversals = 0 d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } - fooMountSource := d.Inode.MountSource - fooMountSourceChildren := fooMountSource.Children() - if err := mountPathsAre(rootDirent, fooMountSourceChildren, "/foo/bar", "/foo/qux"); err != nil { - t.Error(err) - } - // Each child should have fooMountSource as parent. - for _, c := range fooMountSourceChildren { - if p := c.Parent(); p != fooMountSource { - t.Errorf("foo mount child got parent %+v, wanted foo mount", p) - } - } - // Submounts of foo are /foo/bar, /foo/qux, and /foo/bar/baz. - if err := mountPathsAre(rootDirent, fooMountSource.Submounts(), "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { + defer d.DecRef() + submounts = mm.AllMountsUnder(mm.FindMount(d)) + if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { t.Error(err) } - // "waldo" mount should have no submounts or children. + // "waldo" mount should have no children. 
maxTraversals = 0 waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } - waldoMountSource := waldo.Inode.MountSource - if got := len(waldoMountSource.Children()); got != 0 { - t.Errorf("waldo got %d children, wanted 0", got) - } - if got := len(waldoMountSource.Submounts()); got != 0 { - t.Errorf("waldo got %d children, wanted 0", got) + defer waldo.DecRef() + submounts = mm.AllMountsUnder(mm.FindMount(waldo)) + if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil { + t.Error(err) } } -func mountPathsAre(root *Dirent, got []*MountSource, want ...string) error { - if len(got) != len(want) { - return fmt.Errorf("mount paths have different lengths: got %d want %d", len(got), len(want)) +func TestUnmount(t *testing.T) { + ctx := contexttest.Context(t) + + rootCache := NewDirentCache(100) + rootInode := NewMockInode(ctx, NewMockMountSource(rootCache), StableAttr{ + Type: Directory, + }) + mm, err := NewMountNamespace(ctx, rootInode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) } - gotPaths := make(map[string]struct{}, len(got)) - for _, g := range got { - groot := g.Root() - n, _ := groot.FullName(root) - groot.DecRef() - gotPaths[n] = struct{}{} + rootDirent := mm.Root() + defer rootDirent.DecRef() + + // Add mounts at the following paths: + paths := []string{ + "/foo", + "/foo/bar", + "/foo/bar/goo", + "/foo/bar/goo/abc", + "/foo/abc", + "/foo/def", + "/waldo", + "/wally", } - for _, w := range want { - if _, ok := gotPaths[w]; !ok { - return fmt.Errorf("no mount with path %q found", w) + + var maxTraversals uint + for _, p := range paths { + maxTraversals = 0 + d, err := mm.FindLink(ctx, rootDirent, nil, p, &maxTraversals) + if err != nil { + t.Fatalf("could not find path %q in mount manager: %v", p, err) + } + + submountInode := NewMockInode(ctx, NewMockMountSource(nil), StableAttr{ + Type: Directory, + }) + if err := mm.Mount(ctx, d, submountInode); err != nil { + t.Fatalf("could not mount at %q: %v", p, err) + } + d.DecRef() + } + + allPaths := make([]string, len(paths)+1) + allPaths[0] = "/" + copy(allPaths[1:], paths) + + rootMnt := mm.FindMount(rootDirent) + for i := len(paths) - 1; i >= 0; i-- { + maxTraversals = 0 + p := paths[i] + d, err := mm.FindLink(ctx, rootDirent, nil, p, &maxTraversals) + if err != nil { + t.Fatalf("could not find path %q in mount manager: %v", p, err) + } + + if err := mm.Unmount(ctx, d, false); err != nil { + t.Fatalf("could not unmount at %q: %v", p, err) + } + d.DecRef() + + // Remove the path that has been unmounted and the check that the remaining + // mounts are still there. + allPaths = allPaths[:len(allPaths)-1] + submounts := mm.AllMountsUnder(rootMnt) + if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + t.Error(err) } } - return nil } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 01eb4607e..a5c52d7ba 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -16,6 +16,7 @@ package fs import ( "fmt" + "math" "path" "strings" "sync" @@ -35,6 +36,94 @@ import ( // sane. const DefaultTraversalLimit = 10 +const invalidMountID = math.MaxUint64 + +// Mount represents a mount in the file system. It holds the root dirent for the +// mount. It also points back to the dirent or mount where it was mounted over, +// so that it can be restored when unmounted. The chained mount can be either: +// - Mount: when it's mounted on top of another mount point. 
+// - Dirent: when it's mounted on top of a dirent. In this case the mount is +// called an "undo" mount and only 'root' is set. All other fields are +// either invalid or nil. +// +// +stateify savable +type Mount struct { + // ID is a unique id for this mount. It may be invalidMountID if this is + // used to cache a dirent that was mounted over. + ID uint64 + + // ParentID is the parent's mount unique id. It may be invalidMountID if this + // is the root mount or if this is used to cache a dirent that was mounted + // over. + ParentID uint64 + + // root is the root Dirent of this mount. A reference on this Dirent must be + // held through the lifetime of the Mount which contains it. + root *Dirent + + // previous is the existing dirent or mount that this object was mounted over. + // It's nil for the root mount and for the last entry in the chain (always an + // "undo" mount). + previous *Mount +} + +// newMount creates a new mount, taking a reference on 'root'. Caller must +// release the reference when it's done with the mount. +func newMount(id, pid uint64, root *Dirent) *Mount { + root.IncRef() + return &Mount{ + ID: id, + ParentID: pid, + root: root, + } +} + +// newRootMount creates a new root mount (no parent), taking a reference on +// 'root'. Caller must release the reference when it's done with the mount. +func newRootMount(id uint64, root *Dirent) *Mount { + root.IncRef() + return &Mount{ + ID: id, + ParentID: invalidMountID, + root: root, + } +} + +// newUndoMount creates a new undo mount, taking a reference on 'd'. Caller must +// release the reference when it's done with the mount. +func newUndoMount(d *Dirent) *Mount { + d.IncRef() + return &Mount{ + ID: invalidMountID, + ParentID: invalidMountID, + root: d, + } +} + +// Root returns the root dirent of this mount. Callers must call DecRef on the +// returned dirent. +func (m *Mount) Root() *Dirent { + m.root.IncRef() + return m.root +} + +// IsRoot returns true if the mount has no parent. +func (m *Mount) IsRoot() bool { + return !m.IsUndo() && m.ParentID == invalidMountID +} + +// IsUndo returns true if 'm' is an undo mount that should be used to restore +// the original dirent during unmount only and it's not a valid mount. +func (m *Mount) IsUndo() bool { + if m.ID == invalidMountID { + if m.ParentID != invalidMountID { + panic(fmt.Sprintf("Undo mount with valid parentID: %+v", m)) + } + return true + } + return false +} + // MountNamespace defines a collection of mounts. // // +stateify savable @@ -55,13 +144,16 @@ type MountNamespace struct { // mu protects mounts and mountID counter. mu sync.Mutex `state:"nosave"` - // mounts is a map of the last mounted Dirent -> stack of old Dirents - // that were mounted over, with the oldest mounted Dirent first and - // more recent mounted Dirents at the end of the slice. - // - // A reference to all Dirents in mounts (keys and values) must be held - // to ensure the Dirents are recoverable when unmounting. - mounts map[*Dirent][]*Dirent + // mounts is a map of mounted Dirent -> Mount object. There are three + // possible cases: + // - Dirent is mounted over a mount point: the stored Mount object will be + // the Mount for that mount point. + // - Dirent is mounted over a regular (non-mount point) Dirent: the stored + // Mount object will be an "undo" mount containing the mounted-over + // Dirent. + // - Dirent is the root mount: the stored Mount object will be a root mount + // containing the Dirent itself. + mounts map[*Dirent]*Mount // mountID is the next mount id to assign. 
mountID uint64 @@ -72,18 +164,18 @@ type MountNamespace struct { func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) { creds := auth.CredentialsFromContext(ctx) - root.MountSource.mu.Lock() - defer root.MountSource.mu.Unlock() - - // Set the root dirent and id on the root mount. + // Set the root dirent and id on the root mount. The reference returned from + // NewDirent will be donated to the MountNamespace constructed below. d := NewDirent(root, "/") - root.MountSource.root = d - root.MountSource.id = 1 + + mnts := map[*Dirent]*Mount{ + d: newRootMount(1, d), + } return &MountNamespace{ userns: creds.UserNamespace, root: d, - mounts: make(map[*Dirent][]*Dirent), + mounts: mnts, mountID: 2, }, nil } @@ -110,10 +202,9 @@ func (mns *MountNamespace) FlushMountSourceRefs() { func (mns *MountNamespace) flushMountSourceRefsLocked() { // Flush mounts' MountSource references. - for current, stack := range mns.mounts { - current.Inode.MountSource.FlushDirentRefs() - for _, prev := range stack { - prev.Inode.MountSource.FlushDirentRefs() + for _, mp := range mns.mounts { + for ; mp != nil; mp = mp.previous { + mp.root.Inode.MountSource.FlushDirentRefs() } } @@ -136,12 +227,11 @@ func (mns *MountNamespace) destroy() { mns.flushMountSourceRefsLocked() // Teardown mounts. - for current, mp := range mns.mounts { + for _, mp := range mns.mounts { // Drop the mount reference on all mounted dirents. - for _, d := range mp { - d.DecRef() + for ; mp != nil; mp = mp.previous { + mp.root.DecRef() } - current.DecRef() } mns.mounts = nil @@ -208,46 +298,34 @@ func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error } // Mount mounts a `inode` over the subtree at `node`. -func (mns *MountNamespace) Mount(ctx context.Context, node *Dirent, inode *Inode) error { - return mns.withMountLocked(node, func() error { - // replacement already has one reference taken; this is the mount - // reference. - replacement, err := node.mount(ctx, inode) +func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode *Inode) error { + return mns.withMountLocked(mountPoint, func() error { + replacement, err := mountPoint.mount(ctx, inode) if err != nil { return err } - - // Set child/parent dirent relationship. - parentMountSource := node.Inode.MountSource - childMountSource := inode.MountSource - parentMountSource.mu.Lock() - defer parentMountSource.mu.Unlock() - childMountSource.mu.Lock() - defer childMountSource.mu.Unlock() - - parentMountSource.children[childMountSource] = struct{}{} - childMountSource.parent = parentMountSource + defer replacement.DecRef() // Set the mount's root dirent and id. - childMountSource.root = replacement - childMountSource.id = mns.mountID + parentMnt := mns.findMountLocked(mountPoint) + childMnt := newMount(mns.mountID, parentMnt.ID, replacement) mns.mountID++ - // Drop node from its dirent cache. - node.dropExtendedReference() + // Drop mountPoint from its dirent cache. + mountPoint.dropExtendedReference() - // If node is already a mount point, push node on the stack so it can + // If mountPoint is already a mount, push mountPoint on the stack so it can // be recovered on unmount. - if stack, ok := mns.mounts[node]; ok { - mns.mounts[replacement] = append(stack, node) - delete(mns.mounts, node) + if prev := mns.mounts[mountPoint]; prev != nil { + childMnt.previous = prev + mns.mounts[replacement] = childMnt + delete(mns.mounts, mountPoint) return nil } // Was not already mounted, just add another mount point. 
- // Take a reference on node so it can be recovered on unmount. - node.IncRef() - mns.mounts[replacement] = []*Dirent{node} + childMnt.previous = newUndoMount(mountPoint) + mns.mounts[replacement] = childMnt return nil }) } @@ -268,13 +346,13 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly // This takes locks to prevent further walks to Dirents in this mount // under the assumption that `node` is the root of the mount. return mns.withMountLocked(node, func() error { - origs, ok := mns.mounts[node] + orig, ok := mns.mounts[node] if !ok { // node is not a mount point. return syserror.EINVAL } - if len(origs) == 0 { + if orig.previous == nil { panic("cannot unmount initial dirent") } @@ -298,44 +376,62 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly } } - // Lock the parent MountSource first, if it exists. We are - // holding mns.Lock, so the parent can not change out - // from under us. - parent := m.Parent() - if parent != nil { - parent.mu.Lock() - defer parent.mu.Unlock() + prev := orig.previous + if err := node.unmount(ctx, prev.root); err != nil { + return err } - // Lock the mount that is being unmounted. - m.mu.Lock() - defer m.mu.Unlock() - - if m.parent != nil { - // Sanity check. - if _, ok := m.parent.children[m]; !ok { - panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent)) + if prev.previous == nil { + if !prev.IsUndo() { + panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev)) } - delete(m.parent.children, m) + // Drop mount reference taken at the end of MountNamespace.Mount. + prev.root.DecRef() + } else { + mns.mounts[prev.root] = prev } + delete(mns.mounts, node) - original := origs[len(origs)-1] - if err := node.unmount(ctx, original); err != nil { - return err - } + return nil + }) +} + +// FindMount returns the mount that 'd' belongs to. It walks the dirent back +// until a mount is found. It may return nil if no mount was found. +func (mns *MountNamespace) FindMount(d *Dirent) *Mount { + mns.mu.Lock() + defer mns.mu.Unlock() + renameMu.Lock() + defer renameMu.Unlock() - switch { - case len(origs) > 1: - mns.mounts[original] = origs[:len(origs)-1] - case len(origs) == 1: - // Drop mount reference taken at the end of - // MountNamespace.Mount. - original.DecRef() + return mns.findMountLocked(d) +} + +func (mns *MountNamespace) findMountLocked(d *Dirent) *Mount { + for { + if mnt := mns.mounts[d]; mnt != nil { + return mnt + } + if d.parent == nil { + return nil } + d = d.parent + } +} - delete(mns.mounts, node) - return nil - }) +// AllMountsUnder returns a slice of all mounts under the parent, including +// itself. +func (mns *MountNamespace) AllMountsUnder(parent *Mount) []*Mount { + mns.mu.Lock() + defer mns.mu.Unlock() + + var rv []*Mount + for _, mp := range mns.mounts { + if !mp.IsUndo() && mp.root.descendantOf(parent.root) { + rv = append(rv, mp) + } + } + return rv } // FindLink returns an Dirent from a given node, which may be a symlink. diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index b5e01301f..1f7817947 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -27,7 +27,7 @@ import ( // forEachMountSource runs f for the process root mount and each mount that is a // descendant of the root. 
-func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { +func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { var fsctx *kernel.FSContext t.WithMuLocked(func(t *kernel.Task) { fsctx = t.FSContext() @@ -46,16 +46,14 @@ func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { } defer rootDir.DecRef() - if rootDir.Inode == nil { - panic(fmt.Sprintf("root dirent has nil inode: %+v", rootDir)) - } - if rootDir.Inode.MountSource == nil { - panic(fmt.Sprintf("root dirent has nil mount: %+v", rootDir)) + mnt := t.MountNamespace().FindMount(rootDir) + if mnt == nil { + // Has it just been unmounted? + return } - - ms := append(rootDir.Inode.MountSource.Submounts(), rootDir.Inode.MountSource) + ms := t.MountNamespace().AllMountsUnder(mnt) sort.Slice(ms, func(i, j int) bool { - return ms[i].ID() < ms[j].ID() + return ms[i].ID < ms[j].ID }) for _, m := range ms { mroot := m.Root() @@ -89,26 +87,27 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se } var buf bytes.Buffer - forEachMountSource(mif.t, func(mountPath string, m *fs.MountSource) { + forEachMount(mif.t, func(mountPath string, m *fs.Mount) { // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) // (1) MountSource ID. - fmt.Fprintf(&buf, "%d ", m.ID()) + fmt.Fprintf(&buf, "%d ", m.ID) // (2) Parent ID (or this ID if there is no parent). - pID := m.ID() - if p := m.Parent(); p != nil { - pID = p.ID() + pID := m.ID + if !m.IsRoot() && !m.IsUndo() { + pID = m.ParentID } fmt.Fprintf(&buf, "%d ", pID) // (3) Major:Minor device ID. We don't have a superblock, so we // just use the root inode device number. mroot := m.Root() + defer mroot.DecRef() + sa := mroot.Inode.StableAttr - mroot.DecRef() fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) // (4) Root: the pathname of the directory in the filesystem @@ -122,14 +121,15 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se fmt.Fprintf(&buf, "%s ", mountPath) // (6) Mount options. + flags := mroot.Inode.MountSource.Flags opts := "rw" - if m.Flags.ReadOnly { + if flags.ReadOnly { opts = "ro" } - if m.Flags.NoAtime { + if flags.NoAtime { opts += ",noatime" } - if m.Flags.NoExec { + if flags.NoExec { opts += ",noexec" } fmt.Fprintf(&buf, "%s ", opts) @@ -139,7 +139,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se fmt.Fprintf(&buf, "- ") // (9) Filesystem type. - fmt.Fprintf(&buf, "%s ", m.FilesystemType) + fmt.Fprintf(&buf, "%s ", mroot.Inode.MountSource.FilesystemType) // (10) Mount source: filesystem-specific information or "none". fmt.Fprintf(&buf, "none ") @@ -171,7 +171,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan } var buf bytes.Buffer - forEachMountSource(mf.t, func(mountPath string, m *fs.MountSource) { + forEachMount(mf.t, func(mountPath string, m *fs.Mount) { // Format: // // @@ -182,11 +182,15 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan // Only ro/rw option is supported for now. // // The "needs dump"and fsck flags are always 0, which is allowed. 
+ root := m.Root() + defer root.DecRef() + + flags := root.Inode.MountSource.Flags opts := "rw" - if m.Flags.ReadOnly { + if flags.ReadOnly { opts = "ro" } - fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, m.FilesystemType, opts, 0, 0) + fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0) }) return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0 diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 1611dda2c..bc05b3491 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -685,27 +685,21 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // Iterate through all submounts and unmount them. We unmount lazily by // setting detach=true, so we can unmount in any order. - for _, m := range containerRootDirent.Inode.MountSource.Submounts() { + mnt := mns.FindMount(containerRootDirent) + for _, m := range mns.AllMountsUnder(mnt) { root := m.Root() defer root.DecRef() // Do a best-effort unmount by flushing the refs and unmount // with "detach only = true". Unmount returns EINVAL when the mount point // doesn't exist, i.e. it has already been unmounted. - log.Debugf("Unmounting container submount %q", root.BaseName()) - m.FlushDirentRefs() + log.Debugf("Unmounting container mount %q", root.BaseName()) + root.Inode.MountSource.FlushDirentRefs() if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL { - return fmt.Errorf("unmounting container submount %q: %v", root.BaseName(), err) + return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err) } } - // Unmount the container root itself. - log.Debugf("Unmounting container root %q", containerRoot) - containerRootDirent.Inode.MountSource.FlushDirentRefs() - if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { - return fmt.Errorf("unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) - } - // Get a reference to the parent directory and remove the root // container directory. maxTraversals = 0 -- cgit v1.2.3 From c091e6236922f1e9af7afbe811fd03ec297aae16 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 23 May 2019 06:46:55 -0700 Subject: Set sticky bit to /tmp This is generally done for '/tmp' to prevent accidental deletion of files. More details here: http://man7.org/linux/man-pages/man1/chmod.1.html#RESTRICTED_DELETION_FLAG_OR_STICKY_BIT PiperOrigin-RevId: 249633207 Change-Id: I444a5b406fdef664f5677b2f20f374972613a02b --- runsc/boot/fs.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index bc05b3491..4b1557b9a 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -290,7 +290,7 @@ func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, r } } - if err := mountTmp(ctx, conf, mns, root, fds, mounts); err != nil { + if err := mountTmp(ctx, conf, mns, root, mounts); err != nil { return fmt.Errorf("mount submount %q: %v", "tmp", err) } @@ -551,9 +551,8 @@ func subtargets(root string, mnts []specs.Mount) []string { func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) - // Create the FD map, which will set stdin, stdout, and stderr. 
If - // console is true, then ioctl calls will be passed through to the host - // fd. + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. fdm, err := createFDMap(ctx, k, ls, console, stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) @@ -725,7 +724,7 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, mounts []specs.Mount) error { +func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error { for _, m := range mounts { if filepath.Clean(m.Destination) == "/tmp" { log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) @@ -763,8 +762,11 @@ func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *f tmpMount := specs.Mount{ Type: tmpfs, Destination: "/tmp", + // Sticky bit is added to prevent accidental deletion of files from + // another user. This is normally done for /tmp. + Options: []string{"mode=1777"}, } - return mountSubmount(ctx, conf, mns, root, fds, tmpMount, mounts) + return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts) default: return err -- cgit v1.2.3 From 409e8eea60f096b34c9005b302dc821f38ac19ed Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 23 May 2019 22:27:36 -0700 Subject: runsc/do: do a proper cleanup if a command failed due to internal errors Fatalf calls os.Exit and a process exits without calling defer callbacks. Should we do this for other runsc commands? PiperOrigin-RevId: 249776310 Change-Id: If9d8b54d0ae37db443895906eb33bd9e9b600cc9 --- runsc/cmd/cmd.go | 11 +++++++++-- runsc/cmd/do.go | 16 ++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index aa7b1a636..a2fc377d1 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -22,19 +22,26 @@ import ( "strconv" "syscall" + "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/runsc/specutils" ) -// Fatalf logs to stderr and exits with a failure status code. -func Fatalf(s string, args ...interface{}) { +// Errorf logs to stderr and returns subcommands.ExitFailure. +func Errorf(s string, args ...interface{}) subcommands.ExitStatus { // If runsc is being invoked by docker or cri-o, then we might not have // access to stderr, so we log a serious-looking warning in addition to // writing to stderr. log.Warningf("FATAL ERROR: "+s, args...) fmt.Fprintf(os.Stderr, s+"\n", args...) // Return an error that is unlikely to be used by the application. + return subcommands.ExitFailure +} + +// Fatalf logs to stderr and exits with a failure status code. +func Fatalf(s string, args ...interface{}) { + Errorf(s, args...) 
os.Exit(128) } diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index c5e72f32b..425db8efe 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -89,16 +89,16 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su hostname, err := os.Hostname() if err != nil { - Fatalf("Error to retrieve hostname: %v", err) + return Errorf("Error to retrieve hostname: %v", err) } absRoot, err := resolvePath(c.root) if err != nil { - Fatalf("Error resolving root: %v", err) + return Errorf("Error resolving root: %v", err) } absCwd, err := resolvePath(c.cwd) if err != nil { - Fatalf("Error resolving current directory: %v", err) + return Errorf("Error resolving current directory: %v", err) } spec := &specs.Spec{ @@ -121,18 +121,18 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su if conf.Network != boot.NetworkNone { clean, err := c.setupNet(cid, spec) if err != nil { - Fatalf("Error setting up network: %v", err) + return Errorf("Error setting up network: %v", err) } defer clean() } out, err := json.Marshal(spec) if err != nil { - Fatalf("Error to marshal spec: %v", err) + return Errorf("Error to marshal spec: %v", err) } tmpDir, err := ioutil.TempDir("", "runsc-do") if err != nil { - Fatalf("Error to create tmp dir: %v", err) + return Errorf("Error to create tmp dir: %v", err) } defer os.RemoveAll(tmpDir) @@ -141,12 +141,12 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su cfgPath := filepath.Join(tmpDir, "config.json") if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil { - Fatalf("Error write spec: %v", err) + return Errorf("Error write spec: %v", err) } ws, err := container.Run(cid, spec, conf, tmpDir, "", "", "", false) if err != nil { - Fatalf("running container: %v", err) + return Errorf("running container: %v", err) } *waitStatus = ws -- cgit v1.2.3 From 1e42b4cfcad9ff4becb1041b14107815f585becf Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 28 May 2019 11:16:45 -0700 Subject: Update internal flag name and documentation Updates #234 PiperOrigin-RevId: 250323553 --- runsc/cmd/exec.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index ad2508405..52fd7ac4b 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -40,6 +40,8 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) +const privateClearStatusFlag = "private-clear-status" + // Exec implements subcommands.Command for the "exec" command. type Exec struct { cwd string @@ -102,8 +104,9 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") - // clear-status is expected to only be set when we fork due to --detach being set. - f.BoolVar(&ex.clearStatus, "clear-status", true, "clear the status of the exec'd process upon completion") + // This flag clears the status of the exec'd process upon completion. It is + // only used when we fork due to --detach being set on the parent. + f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use") } // Execute implements subcommands.Command.Execute. 
It starts a process in an @@ -210,10 +213,10 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // Add the rest of the args, excluding the "detach" flag. for _, a := range os.Args[1:] { if strings.Contains(a, "detach") { - // Replace with the "clear-status" flag, which tells + // Replace with the "private-clear-status" flag, which tells // the new process it's a detached child and shouldn't // clear the exit status of the sentry process. - args = append(args, "--clear-status=false") + args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag)) } else { args = append(args, a) } -- cgit v1.2.3 From 673358c0d94f82ac56d9f4f6e7aec7ff5761e1cc Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 28 May 2019 11:47:46 -0700 Subject: runsc/do: allow to run commands in a host network namespace PiperOrigin-RevId: 250329795 --- kokoro/run_tests.sh | 12 ++++++++++++ runsc/cmd/do.go | 14 ++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'runsc') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index b3f333f2f..6a7c1fdb6 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -182,6 +182,17 @@ run_syscall_tests() { --test_tag_filters=runsc_ptrace //test/syscalls/... } +run_runsc_do_tests() { + local runsc=$(find bazel-bin/runsc -type f -executable -name "runsc" | head -n1) + + # run runsc do without root privileges. + unshare -Ur ${runsc} --network=none --TESTONLY-unsafe-nonroot do true + unshare -Ur ${runsc} --TESTONLY-unsafe-nonroot --network=host do --netns=false true + + # run runsc do with root privileges. + sudo -n -E ${runsc} do true +} + # Find and rename all test xml and log files so that Sponge can pick them up. # XML files must be named sponge_log.xml, and log files must be named # sponge_log.log. We move all such files into KOKORO_ARTIFACTS_DIR, in a @@ -234,6 +245,7 @@ main() { run_root_tests run_syscall_tests + run_runsc_do_tests # Build other flavors too. build_everything dbg diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 425db8efe..c057f3087 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -39,9 +39,10 @@ import ( // Do implements subcommands.Command for the "do" command. It sets up a simple // sandbox and executes the command inside it. See Usage() for more details. type Do struct { - root string - cwd string - ip string + root string + cwd string + ip string + networkNamespace bool } // Name implements subcommands.Command.Name. @@ -71,6 +72,7 @@ func (c *Do) SetFlags(f *flag.FlagSet) { f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory") f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox") + f.BoolVar(&c.networkNamespace, "netns", true, "run in a new network namespace") } // Execute implements subcommands.Command.Execute. 
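The new --netns flag above only controls whether runsc creates a fresh network namespace; "running in the host's network namespace" just means the sandbox keeps the namespace of the process that invoked it. As a rough, standalone way to check whether two processes share a network namespace (an illustrative heuristic only, not something runsc itself does; it assumes a Linux host, and reading /proc/1/ns/net may require elevated privileges):

package main

import (
	"fmt"
	"os"
)

func main() {
	// Each process exposes its network namespace as a symlink that reads
	// like "net:[4026531992]"; two processes share a namespace exactly
	// when their links resolve to the same value.
	self, err := os.Readlink("/proc/self/ns/net")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	initNet, err := os.Readlink("/proc/1/ns/net")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("shares PID 1's network namespace:", self == initNet)
}

This is also the intuition behind the kokoro invocations above: unshare -Ur gives the test its own unprivileged user namespace, while --netns=false asks runsc do to stay in whatever network namespace it was started in, which is why it has to be combined with --network=host (enforced in the Execute hunk that follows).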
@@ -118,7 +120,11 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su specutils.LogSpec(spec) cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) - if conf.Network != boot.NetworkNone { + if !c.networkNamespace { + if conf.Network != boot.NetworkHost { + Fatalf("The current network namespace can be used only if --network=host is set", nil) + } + } else if conf.Network != boot.NetworkNone { clean, err := c.setupNet(cid, spec) if err != nil { return Errorf("Error setting up network: %v", err) -- cgit v1.2.3 From b52e571a6188ce90b5a13b002753230780119db9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 28 May 2019 23:02:07 -0700 Subject: runsc/do: don't specify the read-only flag for the root mount The root mount is an overlay mount. PiperOrigin-RevId: 250429317 --- runsc/cmd/do.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'runsc') diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index c057f3087..8ea59046c 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -105,8 +105,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su spec := &specs.Spec{ Root: &specs.Root{ - Path: absRoot, - Readonly: true, + Path: absRoot, }, Process: &specs.Process{ Cwd: absCwd, -- cgit v1.2.3 From 035a8fa38ed21da2e06db22d3dfd6122610fb856 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Wed, 29 May 2019 11:30:59 -0700 Subject: Add support for collecting execution trace to runsc. Updates #220 PiperOrigin-RevId: 250532302 --- pkg/sentry/control/pprof.go | 44 ++++++++++++++++++++++++++++++++++++ runsc/boot/controller.go | 2 ++ runsc/cmd/debug.go | 55 ++++++++++++++++++++++++++++++++++----------- runsc/sandbox/sandbox.go | 35 +++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 13 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 94ed149f2..d63916600 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -18,6 +18,7 @@ import ( "errors" "runtime" "runtime/pprof" + "runtime/trace" "sync" "gvisor.googlesource.com/gvisor/pkg/fd" @@ -52,6 +53,9 @@ type Profile struct { // cpuFile is the current CPU profile output file. cpuFile *fd.FD + + // traceFile is the current execution trace output file. + traceFile *fd.FD } // StartCPUProfile is an RPC stub which starts recording the CPU profile in a @@ -122,3 +126,43 @@ func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error { } return nil } + +// StartTrace is an RPC stub which starts collection of an execution trace. +func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + + output, err := fd.NewFromFile(o.FilePayload.Files[0]) + if err != nil { + return err + } + + p.mu.Lock() + defer p.mu.Unlock() + + // Returns an error if profiling is already started. + if err := trace.Start(output); err != nil { + output.Close() + return err + } + + p.traceFile = output + return nil +} + +// StopTrace is an RPC stub which stops collection of an ongoing execution +// trace and flushes the trace data. It takes no argument. 
+func (p *Profile) StopTrace(_, _ *struct{}) error { + p.mu.Lock() + defer p.mu.Unlock() + + if p.traceFile == nil { + return errors.New("Execution tracing not start") + } + + trace.Stop() + p.traceFile.Close() + p.traceFile = nil + return nil +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index f09c1bd85..72ab9ef86 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -101,6 +101,8 @@ const ( StartCPUProfile = "Profile.StartCPUProfile" StopCPUProfile = "Profile.StopCPUProfile" HeapProfile = "Profile.HeapProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" ) // ControlSocketAddr generates an abstract unix socket name for the given ID. diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 000f694c7..27eb51172 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -35,6 +35,7 @@ type Debug struct { profileHeap string profileCPU string profileDelay int + trace string } // Name implements subcommands.Command. @@ -59,6 +60,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") + f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") } @@ -122,35 +124,62 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof(" *** Stack dump ***\n%s", stacks) } - if d.profileCPU != "" { - f, err := os.Create(d.profileCPU) + if d.profileHeap != "" { + f, err := os.Create(d.profileHeap) if err != nil { Fatalf(err.Error()) } defer f.Close() - if err := c.Sandbox.StartCPUProfile(f); err != nil { + if err := c.Sandbox.HeapProfile(f); err != nil { Fatalf(err.Error()) } - log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) - time.Sleep(time.Duration(d.profileDelay) * time.Second) + log.Infof("Heap profile written to %q", d.profileHeap) + } - if err := c.Sandbox.StopCPUProfile(); err != nil { + delay := false + if d.profileCPU != "" { + delay = true + f, err := os.Create(d.profileCPU) + if err != nil { Fatalf(err.Error()) } - log.Infof("CPU profile written to %q", d.profileCPU) + defer func() { + f.Close() + if err := c.Sandbox.StopCPUProfile(); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile written to %q", d.profileCPU) + }() + if err := c.Sandbox.StartCPUProfile(f); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) } - if d.profileHeap != "" { - f, err := os.Create(d.profileHeap) + if d.trace != "" { + delay = true + f, err := os.Create(d.trace) if err != nil { Fatalf(err.Error()) } - defer f.Close() - - if err := c.Sandbox.HeapProfile(f); err != nil { + defer func() { + f.Close() + if err := c.Sandbox.StopTrace(); err != nil { + Fatalf(err.Error()) + } + log.Infof("Trace written to %q", d.trace) + }() + if err := c.Sandbox.StartTrace(f); err != nil { Fatalf(err.Error()) } - log.Infof("Heap profile written to %q", d.profileHeap) + log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace) + } + + if delay { + time.Sleep(time.Duration(d.profileDelay) * time.Second) + + } + return subcommands.ExitSuccess } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index bc69a9d61..47a66afb2 
100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -883,6 +883,41 @@ func (s *Sandbox) StopCPUProfile() error { return nil } +// StartTrace start trace writing to the given file. +func (s *Sandbox) StartTrace(f *os.File) error { + log.Debugf("Trace start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.StartTrace, &opts, nil); err != nil { + return fmt.Errorf("starting sandbox %q trace: %v", s.ID, err) + } + return nil +} + +// StopTrace stops a previously started trace.. +func (s *Sandbox) StopTrace() error { + log.Debugf("Trace stop %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.StopTrace, nil, nil); err != nil { + return fmt.Errorf("stopping sandbox %q trace: %v", s.ID, err) + } + return nil +} + // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { -- cgit v1.2.3
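For context on the tracing support added in this last change: the sentry-side Profile.StartTrace/StopTrace stubs and the Sandbox wrappers above are thin plumbing around Go's standard runtime/trace package, with the output file shipped over the control socket as a urpc file payload. A minimal, self-contained sketch of that underlying API (standard library only, not gVisor code; the file name trace.out is arbitrary):

package main

import (
	"fmt"
	"os"
	"runtime/trace"
	"time"
)

func main() {
	f, err := os.Create("trace.out")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer f.Close()

	// trace.Start returns an error if tracing is already enabled, the same
	// condition the StartTrace RPC stub reports back to its caller.
	if err := trace.Start(f); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Do some work while the execution trace is being collected.
	time.Sleep(100 * time.Millisecond)

	// trace.Stop only returns after all trace data has been flushed to f,
	// which is why the code above closes the trace file only after Stop.
	trace.Stop()
}

The resulting file can then be inspected with "go tool trace trace.out", which is the natural next step for a file produced via runsc debug --trace as well.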